linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [Patch][RFC]  Disabling per-tgid stats on task exit in taskstats
@ 2006-06-09  7:41 Shailabh Nagar
  2006-06-09  8:00 ` Andrew Morton
  2006-06-09 15:55 ` Chris Sturtivant
  0 siblings, 2 replies; 134+ messages in thread
From: Shailabh Nagar @ 2006-06-09  7:41 UTC (permalink / raw)
  To: Jay Lan, csturtiv, Balbir Singh; +Cc: linux-kernel

Jay, Chris, 
Could you check if this patch does the needful ? 

It's tested and runs fine for me. A quick response would 
be appreciated so that it can be included in -mm before 
the 2.6.18 merge window begins.

I decided against adding the configuration to the taskstats 
interface directly (as another command) since the sysfs solution
is much simpler and the configuration operation is infrequent.

Balbir, all, comments welcome.

--Shailabh


Selective sending of per-tgid statistics in taskstats interface

The taskstats interface currently sends both per-pid and per-tgid stats
whenever a thread exits and its thread group is non-empty. Some potential
users of taskstats, currently SGI's CSA, do not need the per-tgid stats.

Hence, this patch introduces a configuration parameter
	/sys/kernel/taskstats_tgid_exit
through which a privileged user can turn on/off sending of per-tgid stats on
task exit. The default is on. Regardless of the parameter, explicit commands
requesting per-tgid stats are always satisfied.

--

Signed-Off-By: Shailabh Nagar <nagar@watson.ibm.com>


 Documentation/accounting/taskstats.txt |   42 ++++++++++++++++++++++++---------
 include/linux/taskstats_kern.h         |   14 +++--------
 kernel/ksysfs.c                        |    9 +++++++
 kernel/taskstats.c                     |   26 ++++++++++++++++++++
 4 files changed, 70 insertions(+), 21 deletions(-)

Index: linux-2.6.17-rc5-mm3/include/linux/taskstats_kern.h
===================================================================
--- linux-2.6.17-rc5-mm3.orig/include/linux/taskstats_kern.h	2006-06-09 02:02:31.000000000 -0400
+++ linux-2.6.17-rc5-mm3/include/linux/taskstats_kern.h	2006-06-09 02:04:42.000000000 -0400
@@ -18,13 +18,6 @@ enum {
 #ifdef CONFIG_TASKSTATS
 extern kmem_cache_t *taskstats_cache;

-static inline void taskstats_exit_alloc(struct taskstats **ptidstats,
-					struct taskstats **ptgidstats)
-{
-	*ptidstats = kmem_cache_zalloc(taskstats_cache, SLAB_KERNEL);
-	*ptgidstats = kmem_cache_zalloc(taskstats_cache, SLAB_KERNEL);
-}
-
 static inline void taskstats_exit_free(struct taskstats *tidstats,
 					struct taskstats *tgidstats)
 {
@@ -34,17 +27,18 @@ static inline void taskstats_exit_free(s
 		kmem_cache_free(taskstats_cache, tgidstats);
 }

+extern void taskstats_exit_alloc(struct taskstats **, struct taskstats **);
 extern void taskstats_exit_send(struct task_struct *, struct taskstats *,
 				struct taskstats *);
 extern void taskstats_init_early(void);

 #else
-static inline void taskstats_exit_alloc(struct taskstats **ptidstats,
-					struct taskstats **ptgidstats)
-{}
 static inline void taskstats_exit_free(struct taskstats *ptidstats,
 					struct taskstats *ptgidstats)
 {}
+static inline void taskstats_exit_alloc(struct taskstats **ptidstats,
+					struct taskstats **ptgidstats)
+{}
 static inline void taskstats_exit_send(struct task_struct *tsk,
 					struct taskstats *tidstats,
 					struct taskstats *tgidstats)
Index: linux-2.6.17-rc5-mm3/kernel/ksysfs.c
===================================================================
--- linux-2.6.17-rc5-mm3.orig/kernel/ksysfs.c	2006-06-09 02:02:31.000000000 -0400
+++ linux-2.6.17-rc5-mm3/kernel/ksysfs.c	2006-06-09 02:04:42.000000000 -0400
@@ -63,6 +63,12 @@ static ssize_t kexec_crash_loaded_show(s
 KERNEL_ATTR_RO(kexec_crash_loaded);
 #endif /* CONFIG_KEXEC */

+#ifdef CONFIG_TASKSTATS
+extern ssize_t taskstats_tgid_exit_show(struct subsystem *subsys, char *page);
+extern ssize_t taskstats_tgid_exit_store(struct subsystem *subsys, const char *page, size_t count);
+KERNEL_ATTR_RW(taskstats_tgid_exit);
+#endif
+
 decl_subsys(kernel, NULL, NULL);
 EXPORT_SYMBOL_GPL(kernel_subsys);

@@ -75,6 +81,9 @@ static struct attribute * kernel_attrs[]
 	&kexec_loaded_attr.attr,
 	&kexec_crash_loaded_attr.attr,
 #endif
+#ifdef CONFIG_TASKSTATS
+	&taskstats_tgid_exit_attr.attr,
+#endif
 	NULL
 };

Index: linux-2.6.17-rc5-mm3/kernel/taskstats.c
===================================================================
--- linux-2.6.17-rc5-mm3.orig/kernel/taskstats.c	2006-06-09 02:02:31.000000000 -0400
+++ linux-2.6.17-rc5-mm3/kernel/taskstats.c	2006-06-09 02:04:42.000000000 -0400
@@ -24,6 +24,7 @@

 static DEFINE_PER_CPU(__u32, taskstats_seqnum) = { 0 };
 static int family_registered = 0;
+static int tgid_exit_send = 1;   /* Should tgid stats be sent on exit */
 kmem_cache_t *taskstats_cache;
 static DEFINE_MUTEX(taskstats_exit_mutex);

@@ -229,6 +230,15 @@ err:
 	return rc;
 }

+void taskstats_exit_alloc(struct taskstats **ptidstats,
+					struct taskstats **ptgidstats)
+{
+	*ptidstats = kmem_cache_zalloc(taskstats_cache, SLAB_KERNEL);
+	*ptgidstats = NULL;
+	if (tgid_exit_send)
+		*ptgidstats = kmem_cache_zalloc(taskstats_cache, SLAB_KERNEL);
+}
+
 /* Send pid data out on exit */
 void taskstats_exit_send(struct task_struct *tsk, struct taskstats *tidstats,
 			struct taskstats *tgidstats)
@@ -254,6 +264,7 @@ void taskstats_exit_send(struct task_str
 	size = nla_total_size(sizeof(u32)) +
 		nla_total_size(sizeof(struct taskstats)) + nla_total_size(0);

+	/* Allocation should not depend on tgid_exit_send value */
 	if (is_thread_group)
 		size = 2 * size;	/* PID + STATS + TGID + STATS */

@@ -271,6 +282,9 @@ void taskstats_exit_send(struct task_str
 			*tidstats);
 	nla_nest_end(rep_skb, na);

+	/* Do not check tgid_exit_send value here. If it was unset during
+	 * taskstats_exit_alloc(), tgidstats will be NULL
+	 */
 	if (!is_thread_group || !tgidstats) {
 		send_reply(rep_skb, 0, TASKSTATS_MSG_MULTICAST);
 		goto ret;
@@ -345,3 +359,15 @@ err:
  * mechanisms precedes initialization of the taskstats interface
  */
 late_initcall(taskstats_init);
+
+/* configuration through sysfs */
+ssize_t taskstats_tgid_exit_show(struct subsystem *subsys, char *page)
+{
+	return sprintf(page, "%d\n", tgid_exit_send);
+}
+ssize_t taskstats_tgid_exit_store(struct subsystem *subsys, const char *page, size_t count)
+{
+	char *p = (char *)page;
+	tgid_exit_send = simple_strtoul(p, &p, 10);
+	return count;
+}
Index: linux-2.6.17-rc5-mm3/Documentation/accounting/taskstats.txt
===================================================================
--- linux-2.6.17-rc5-mm3.orig/Documentation/accounting/taskstats.txt	2006-06-07 12:03:14.000000000 -0400
+++ linux-2.6.17-rc5-mm3/Documentation/accounting/taskstats.txt	2006-06-09 02:35:07.000000000 -0400
@@ -32,13 +32,28 @@ The response contains statistics for a t
 statistics for all tasks of the process (if tgid is specified).

 To obtain statistics for tasks which are exiting, userspace opens a multicast
-netlink socket. Each time a task exits, two records are sent by the kernel to
-each listener on the multicast socket. The first the per-pid task's statistics
-and the second is the sum for all tasks of the process to which the task
-belongs (the task does not need to be the thread group leader). The need for
-per-tgid stats to be sent for each exiting task is explained in the per-tgid
-stats section below.
+netlink socket. Each time a task exits, its per-pid statistics are sent by
+the kernel to each listener on the multicast socket.

+If
+a) the value of /sys/kernel/taskstats_tgid_exit is non-zero and
+b) the task's thread_group has other members
+then a second record is also sent, consisting of the sum for all tasks of the
+thread group to which the task belongs. The task does not need to be the thread
+group leader. The utility for per-tgid stats to be sent for each exiting task
+is explained in the per-tgid stats section below.
+
+# echo 0 > /sys/kernel/taskstats_tgid_exit
+turns off sending of per-tgid stats on task exit
+
+# echo 1 > /sys/kernel/taskstats_tgid_exit
+turns it back on (which is the default)
+
+Commands requesting per-tgid stats are not affected by this configuration
+parameter and are always satisfied by the kernel. Also, when the last thread
+of a thread group, or a solitary thread exits, only the per-pid stats are sent
+since they are identical to the per-tgid stats at that point in time.
+
 getdelays.c is a simple utility demonstrating usage of the taskstats interface
 for reporting delay accounting statistics.

@@ -100,8 +115,8 @@ per-tgid stats

 Taskstats provides per-process stats, in addition to per-task stats, since
 resource management is often done at a process granularity and aggregating task
-stats in userspace alone is inefficient and potentially inaccurate (due to lack
-of atomicity).
+stats in userspace alone is inefficient and potentially inaccurate due to lack
+of atomicity.

 However, maintaining per-process, in addition to per-task stats, within the
 kernel has space and time overheads. Hence the taskstats implementation
@@ -115,9 +130,14 @@ statistic from the kernel.

 The approach taken by taskstats is to return the per-tgid stats *each* time
 a task exits, in addition to the per-pid stats for that task. Userspace can
-maintain task<->process mappings and use them to maintain the per-process stats
-in userspace, updating the aggregate appropriately as the tasks of a process
-exit.
+maintain task<->process mappings and use them to maintain the per-process
+stats, updating the aggregate appropriately as the tasks of a process
+exit. Userspace must also expect only per-pid stats to be sent when the last
+thread of a thread group exits (also when that is the only thread in the thread
+group, which is a common case).
+
+Installations that don't need per-tgid stats can disable their collection and
+sending on task exit as described in the Usage section.

 Extending taskstats
 -------------------


^ permalink raw reply	[flat|nested] 134+ messages in thread

* Re: [Patch][RFC]  Disabling per-tgid stats on task exit in taskstats
  2006-06-09  7:41 [Patch][RFC] Disabling per-tgid stats on task exit in taskstats Shailabh Nagar
@ 2006-06-09  8:00 ` Andrew Morton
  2006-06-09 10:51   ` Balbir Singh
  2006-06-09 15:55 ` Chris Sturtivant
  1 sibling, 1 reply; 134+ messages in thread
From: Andrew Morton @ 2006-06-09  8:00 UTC (permalink / raw)
  To: Shailabh Nagar; +Cc: jlan, csturtiv, balbir, linux-kernel

On Fri, 09 Jun 2006 03:41:04 -0400
Shailabh Nagar <nagar@watson.ibm.com> wrote:

> Hence, this patch introduces a configuration parameter
> 	/sys/kernel/taskstats_tgid_exit
> through which a privileged user can turn on/off sending of per-tgid stats on
> task exit.

That seems a bit clumsy.  What happens if one consumer wants the per-tgid
stats and another does not?

^ permalink raw reply	[flat|nested] 134+ messages in thread

* Re: [Patch][RFC]  Disabling per-tgid stats on task exit in taskstats
  2006-06-09  8:00 ` Andrew Morton
@ 2006-06-09 10:51   ` Balbir Singh
  2006-06-09 11:21     ` Andrew Morton
  0 siblings, 1 reply; 134+ messages in thread
From: Balbir Singh @ 2006-06-09 10:51 UTC (permalink / raw)
  To: Andrew Morton; +Cc: Shailabh Nagar, jlan, csturtiv, linux-kernel

Andrew Morton wrote:
> On Fri, 09 Jun 2006 03:41:04 -0400
> Shailabh Nagar <nagar@watson.ibm.com> wrote:
> 
> 
>>Hence, this patch introduces a configuration parameter
>>	/sys/kernel/taskstats_tgid_exit
>>through which a privileged user can turn on/off sending of per-tgid stats on
>>task exit.
> 
> 
> That seems a bit clumsy.  What happens if one consumer wants the per-tgid
> stats and another does not?

For all subsystems that re-use the taskstats structure from the exit path,
we have the issue that you mentioned. That's because several statistics co-exist
in the same structure. These subsystems can keep their tgid-stats empty by not
filling up anything in fill_tgid() or using this patch to selectively enable/disable
tgid stats.

For other subsystems, they could pass tgidstats as NULL to taskstats_exit_send().

-- 

	Balbir Singh,
	Linux Technology Center,
	IBM Software Labs

^ permalink raw reply	[flat|nested] 134+ messages in thread

* Re: [Patch][RFC]  Disabling per-tgid stats on task exit in taskstats
  2006-06-09 10:51   ` Balbir Singh
@ 2006-06-09 11:21     ` Andrew Morton
  2006-06-09 13:20       ` Shailabh Nagar
                         ` (2 more replies)
  0 siblings, 3 replies; 134+ messages in thread
From: Andrew Morton @ 2006-06-09 11:21 UTC (permalink / raw)
  To: balbir; +Cc: nagar, jlan, csturtiv, linux-kernel

On Fri, 09 Jun 2006 16:21:46 +0530
Balbir Singh <balbir@in.ibm.com> wrote:

> Andrew Morton wrote:
> > On Fri, 09 Jun 2006 03:41:04 -0400
> > Shailabh Nagar <nagar@watson.ibm.com> wrote:
> > 
> > 
> >>Hence, this patch introduces a configuration parameter
> >>	/sys/kernel/taskstats_tgid_exit
> >>through which a privileged user can turn on/off sending of per-tgid stats on
> >>task exit.
> > 
> > 
> > That seems a bit clumsy.  What happens if one consumer wants the per-tgid
> > stats and another does not?
> 
> For all subsystems that re-use the taskstats structure from the exit path,
> we have the issue that you mentioned. Thats because several statistics co-exist
> in the same structure. These subsystems can keep their tgid-stats empty by not
> filling up anything in fill_tgid() or using this patch to selectively enable/disable
> tgid stats.
> 
> For other subsystems, they could pass tgidstats as NULL to taskstats_exit_send().
> 

I don't understand.  If a subsystem exists then it fills in its slots in
the taskstats structure, doesn't it?

No other subsystem needs a global knob, does it?

You see the problem - if one userspace package wants the tgid-stats and
another concurrently-running one does not, what do we do?  Just leave it
enabled and run a bit slower?

If so, how much slower?  Your changelog says some potential users don't
need the tgid-stats, but so what?  I assume this patch is a performance
thing?  If so, has it been quantified?


^ permalink raw reply	[flat|nested] 134+ messages in thread

* Re: [Patch][RFC]  Disabling per-tgid stats on task exit in taskstats
  2006-06-09 11:21     ` Andrew Morton
@ 2006-06-09 13:20       ` Shailabh Nagar
  2006-06-09 18:25         ` Jay Lan
  2006-06-09 15:36       ` Balbir Singh
  2006-06-09 21:56       ` Shailabh Nagar
  2 siblings, 1 reply; 134+ messages in thread
From: Shailabh Nagar @ 2006-06-09 13:20 UTC (permalink / raw)
  To: Andrew Morton; +Cc: balbir, jlan, csturtiv, linux-kernel

Andrew Morton wrote:

>On Fri, 09 Jun 2006 16:21:46 +0530
>Balbir Singh <balbir@in.ibm.com> wrote:
>
>  
>
>>Andrew Morton wrote:
>>    
>>
>>>On Fri, 09 Jun 2006 03:41:04 -0400
>>>Shailabh Nagar <nagar@watson.ibm.com> wrote:
>>>
>>>
>>>      
>>>
>>>>Hence, this patch introduces a configuration parameter
>>>>	/sys/kernel/taskstats_tgid_exit
>>>>through which a privileged user can turn on/off sending of per-tgid stats on
>>>>task exit.
>>>>        
>>>>
>>>That seems a bit clumsy.  What happens if one consumer wants the per-tgid
>>>stats and another does not?
>>>      
>>>
Then the tgid stat sending on exit will need to be turned on for everyone.

>>For all subsystems that re-use the taskstats structure from the exit path,
>>we have the issue that you mentioned. Thats because several statistics co-exist
>>in the same structure. These subsystems can keep their tgid-stats empty by not
>>filling up anything in fill_tgid() or using this patch to selectively enable/disable
>>tgid stats.
>>    
>>
>>For other subsystems, they could pass tgidstats as NULL to taskstats_exit_send().
>>
>>    
>>
>
>I don't understand.  If a subsystem exists then it fills in its slots in
>the taskstats structure, doesn't it?
>  
>
It can choose not to, by not inserting its "fill my fields" function 
inside the do..while_each_thread
loop within fill_tgid. So while they would still necessarily receive the 
per-tgid taskstats struct on exit
(because some other subsystem needs it), they can at least save on 
filling up their part of the struct
if they don't need it.

>No other subsystem needs a global knob, does it?
>  
>
I didn't understand.

>You see the problem - if one userspace package wants the tgid-stats and
>another concurrently-running one does now, what do we do?  Just leave it
>enabled and run a bit slower?
>  
>
Yes, that's what will have to be done. If one user wants them, all users will 
need to get the stats. They can
limit their impact by not processing the parts of the netlink message 
that correspond to the per-tgid stats
(since the per-tgid stats are sent as a separate attribute, that's easy 
to do).

This patch covers the use case where someone like CSA is the only user 
(delay accounting is turned off)
and wants to reduce the performance impact of the kernel allocating, 
sending and userspace discarding
the per-tgid stats.

>If so, how much slower?  Your changelog says some potential users don't
>need the tgid-stats, but so what?  I assume this patch is a performance
>thing?  
>
Yes, it's a performance optimization.

>If so, has it been quantified?
>  
>
No :-(
Will try to get some numbers.
Jay/Chris, if you can try to do that too, for the kind of usage that is 
typical of CSA,
that would be great.

--Shailabh


^ permalink raw reply	[flat|nested] 134+ messages in thread

* Re: [Patch][RFC]  Disabling per-tgid stats on task exit in taskstats
  2006-06-09 11:21     ` Andrew Morton
  2006-06-09 13:20       ` Shailabh Nagar
@ 2006-06-09 15:36       ` Balbir Singh
  2006-06-09 18:35         ` Jay Lan
  2006-06-09 21:56       ` Shailabh Nagar
  2 siblings, 1 reply; 134+ messages in thread
From: Balbir Singh @ 2006-06-09 15:36 UTC (permalink / raw)
  To: Andrew Morton; +Cc: nagar, jlan, csturtiv, linux-kernel

Andrew Morton wrote:
> On Fri, 09 Jun 2006 16:21:46 +0530
> Balbir Singh <balbir@in.ibm.com> wrote:
> 
> 
>>Andrew Morton wrote:
>>
>>>On Fri, 09 Jun 2006 03:41:04 -0400
>>>Shailabh Nagar <nagar@watson.ibm.com> wrote:
>>>
>>>
>>>
>>>>Hence, this patch introduces a configuration parameter
>>>>	/sys/kernel/taskstats_tgid_exit
>>>>through which a privileged user can turn on/off sending of per-tgid stats on
>>>>task exit.
>>>
>>>
>>>That seems a bit clumsy.  What happens if one consumer wants the per-tgid
>>>stats and another does not?
>>
>>For all subsystems that re-use the taskstats structure from the exit path,
>>we have the issue that you mentioned. Thats because several statistics co-exist
>>in the same structure. These subsystems can keep their tgid-stats empty by not
>>filling up anything in fill_tgid() or using this patch to selectively enable/disable
>>tgid stats.
>>
>>For other subsystems, they could pass tgidstats as NULL to taskstats_exit_send().
>>
> 
> 
> I don't understand.  If a subsystem exists then it fills in its slots in
> the taskstats structure, doesn't it?
> 
> No other subsystem needs a global knob, does it?
> 
> You see the problem - if one userspace package wants the tgid-stats and
> another concurrently-running one does now, what do we do?  Just leave it
> enabled and run a bit slower?

Another option is to get the package to define their own taskstats genetlink 
attribute and fill it up in taskstats_exit_send(). This would be similar to
TASKSTATS_TYPE_AGGR_PID/TGID.

They can make this attribute independent of the taskstats structure and fill
it based on their policy (per-pid or per-tgid). But the current interface
users like CSA want to build on top of the taskstats structure.

> 
> If so, how much slower?  Your changelog says some potential users don't
> need the tgid-stats, but so what?  I assume this patch is a performance
> thing?  If so, has it been quantified?
> 


-- 

	Balbir Singh,
	Linux Technology Center,
	IBM Software Labs

^ permalink raw reply	[flat|nested] 134+ messages in thread

* Re: [Patch][RFC]  Disabling per-tgid stats on task exit in taskstats
  2006-06-09  7:41 [Patch][RFC] Disabling per-tgid stats on task exit in taskstats Shailabh Nagar
  2006-06-09  8:00 ` Andrew Morton
@ 2006-06-09 15:55 ` Chris Sturtivant
  1 sibling, 0 replies; 134+ messages in thread
From: Chris Sturtivant @ 2006-06-09 15:55 UTC (permalink / raw)
  To: Shailabh Nagar; +Cc: Jay Lan, Balbir Singh, linux-kernel


Shailabh Nagar wrote:
> Jay, Chris, Could you check if this patch does the needful ?
> Its tested and runs fine for me. A quick response would be appreciated 
> so that it can be included in -mm before the 2.6.18 merge window begins.
>
> I decided against adding the configuration to the taskstats interface 
> directly (as another command) since the sysfs solution
> is much simpler and the configuration operation is infrequent.
>
> Balbir, all, comments welcome.
>
> --Shailabh
>
>
Unfortunately, I'm currently battling some build problems, so hopefully 
Jay will be able to take a look through it today.

Best regards,


--Chris

-- 
-----------------------------------------------------------------
Chris Sturtivant, PhD,
Linux System Software,
SGI
(650) 933-1703
-----------------------------------------------------------------


^ permalink raw reply	[flat|nested] 134+ messages in thread

* Re: [Patch][RFC]  Disabling per-tgid stats on task exit in taskstats
  2006-06-09 13:20       ` Shailabh Nagar
@ 2006-06-09 18:25         ` Jay Lan
  2006-06-09 19:12           ` Shailabh Nagar
  0 siblings, 1 reply; 134+ messages in thread
From: Jay Lan @ 2006-06-09 18:25 UTC (permalink / raw)
  To: Shailabh Nagar; +Cc: Andrew Morton, balbir, jlan, csturtiv, linux-kernel

Shailabh Nagar wrote:
> Andrew Morton wrote:
>
>> On Fri, 09 Jun 2006 16:21:46 +0530
>> Balbir Singh <balbir@in.ibm.com> wrote:
>>
>>  
>>
>>> Andrew Morton wrote:
>>>   
>>>> On Fri, 09 Jun 2006 03:41:04 -0400
>>>> Shailabh Nagar <nagar@watson.ibm.com> wrote:
>>>>
>>>>
>>>>     
>>>>> Hence, this patch introduces a configuration parameter
>>>>>     /sys/kernel/taskstats_tgid_exit
>>>>> through which a privileged user can turn on/off sending of
>>>>> per-tgid stats on
>>>>> task exit.
>>>>>       
>>>> That seems a bit clumsy.  What happens if one consumer wants the
>>>> per-tgid
>>>> stats and another does not?
>>>>     
> Then the tgid stat sending on exit will need to be turned on for
> everyone.

I guess that is the limitation of taskstats. One multicast socket for
all listeners.

>
>>> For all subsystems that re-use the taskstats structure from the exit
>>> path,
>>> we have the issue that you mentioned. Thats because several
>>> statistics co-exist
>>> in the same structure. These subsystems can keep their tgid-stats
>>> empty by not
>>> filling up anything in fill_tgid() or using this patch to
>>> selectively enable/disable
>>> tgid stats.
>>>   
>>> For other subsystems, they could pass tgidstats as NULL to
>>> taskstats_exit_send().
>>>
>>>   
>>
>> I don't understand.  If a subsystem exists then it fills in its slots in
>> the taskstats structure, doesn't it?
>>  
>>
> It can choose not to, by not inserting its "fill my fields" function
> inside the do..while_each_thread
> loop within fill_tgid. So while they would still necessarily receive
> the per-tgid taskstats struct on exit
> (because some other subsystem needs it), they can atleast save on
> filling up their part of the struct
> if they don't need it.
>
>> No other subsystem needs a global knob, does it?
>>  
>>
> I didn't understand.
>
>> You see the problem - if one userspace package wants the tgid-stats and
>> another concurrently-running one does now, what do we do?  Just leave it
>> enabled and run a bit slower?
>>  
>>
> Yes, thats what will have to be done. If one user wants, all users
> will need to get the stats. They can
> limit their impact by not processing the parts of the netlink message
> that correspond to the per-tgid stats
> (since the per-tgid stats are sent as a separate attribute, thats easy
> to do).
>
> This patch covers the use case where someone like CSA is the only user
> (delay accounting is turned off)
> and wants to reduce the performance impact of the kernel allocating,
> sending and userspace discarding
> the per-tgid stats.
>
>> If so, how much slower?  Your changelog says some potential users don't
>> need the tgid-stats, but so what?  I assume this patch is a performance
>> thing? 
> Yes, its a performance optimization.

Well, for every task that exits, two sets of data (of struct taskstats) would
be sent from the kernel: one is the stats for the pid, the other is the
up-to-current stats for the thread group (tgid).

Strictly speaking, the second set of data is not per-task stats. For
accounting
subsystems that do not aggregate by thread group, 50% of the data from
the kernel is useless. The option of not sending thread group data is very
important.
Of course we are betting that a customer site does not run two different
applications on the same system.

>
>> If so, has it been quantified?
>>  
>>
> No :-(
> Will try to get some numbers.
> Jay/Chris, if you can try to do that too, for the kind of usage that
> is typical of CSA,
> that would be great.

Probably not until some time next week. But as I pointed out, 50% of
the traffic is
not useful to CSA.

Thanks,
 - jay

>
> --Shailabh
>


^ permalink raw reply	[flat|nested] 134+ messages in thread

* Re: [Patch][RFC]  Disabling per-tgid stats on task exit in taskstats
  2006-06-09 15:36       ` Balbir Singh
@ 2006-06-09 18:35         ` Jay Lan
  2006-06-09 19:31           ` Shailabh Nagar
  0 siblings, 1 reply; 134+ messages in thread
From: Jay Lan @ 2006-06-09 18:35 UTC (permalink / raw)
  To: balbir; +Cc: Andrew Morton, nagar, jlan, csturtiv, linux-kernel

Balbir Singh wrote:
> Andrew Morton wrote:
>> On Fri, 09 Jun 2006 16:21:46 +0530
>> Balbir Singh <balbir@in.ibm.com> wrote:
>>
>>
>>> Andrew Morton wrote:
>>>
>>>> On Fri, 09 Jun 2006 03:41:04 -0400
>>>> Shailabh Nagar <nagar@watson.ibm.com> wrote:
>>>>
>>>>
>>>>
>>>>> Hence, this patch introduces a configuration parameter
>>>>>     /sys/kernel/taskstats_tgid_exit
>>>>> through which a privileged user can turn on/off sending of
>>>>> per-tgid stats on
>>>>> task exit.
>>>>
>>>>
>>>> That seems a bit clumsy.  What happens if one consumer wants the
>>>> per-tgid
>>>> stats and another does not?
>>>
>>> For all subsystems that re-use the taskstats structure from the exit
>>> path,
>>> we have the issue that you mentioned. Thats because several
>>> statistics co-exist
>>> in the same structure. These subsystems can keep their tgid-stats
>>> empty by not
>>> filling up anything in fill_tgid() or using this patch to
>>> selectively enable/disable
>>> tgid stats.
>>>
>>> For other subsystems, they could pass tgidstats as NULL to
>>> taskstats_exit_send().
>>>
>>
>>
>> I don't understand.  If a subsystem exists then it fills in its slots in
>> the taskstats structure, doesn't it?
>>
>> No other subsystem needs a global knob, does it?
>>
>> You see the problem - if one userspace package wants the tgid-stats and
>> another concurrently-running one does now, what do we do?  Just leave it
>> enabled and run a bit slower?
>
> Another option is to get the package to define their own taskstats
> genetlink attribute and fill it up in taskstats_exit_send(). This
> would be similar to
> TASKSTATS_TYPE_AGGR_PID/TGID.
>
> They can make this attribute independent of the taskstats structure
> and fill
> it based on their policy (per-pid or per-tgid). But the current interface
> users like CSA want to build on top of the taskstats structure.

That was my question to you from the beginning: do you propose a common
interface based on taskstats or genetlink?

If CSA defines its own taskstats genetlink attribute, does it listen to
the same socket as delayacct? If yes, then the socket will be jammed with
duplicate information before long.

Is it an option to make per-tgid data a unicast? I.e., your daemon would
periodically
poll the per-tgid stats?

Thanks,
 - jay


>
>>
>> If so, how much slower?  Your changelog says some potential users don't
>> need the tgid-stats, but so what?  I assume this patch is a performance
>> thing?  If so, has it been quantified?
>>
>
>


^ permalink raw reply	[flat|nested] 134+ messages in thread

* Re: [Patch][RFC]  Disabling per-tgid stats on task exit in taskstats
  2006-06-09 18:25         ` Jay Lan
@ 2006-06-09 19:12           ` Shailabh Nagar
  0 siblings, 0 replies; 134+ messages in thread
From: Shailabh Nagar @ 2006-06-09 19:12 UTC (permalink / raw)
  To: Jay Lan; +Cc: Andrew Morton, balbir, jlan, csturtiv, linux-kernel

Jay Lan wrote:

>Shailabh Nagar wrote:
>  
>
>>Andrew Morton wrote:
>>
>>    
>>
>>>On Fri, 09 Jun 2006 16:21:46 +0530
>>>Balbir Singh <balbir@in.ibm.com> wrote:
>>>
>>> 
>>>
>>>      
>>>
>>>>Andrew Morton wrote:
>>>>  
>>>>        
>>>>
>>>>>On Fri, 09 Jun 2006 03:41:04 -0400
>>>>>Shailabh Nagar <nagar@watson.ibm.com> wrote:
>>>>>
>>>>>
>>>>>    
>>>>>          
>>>>>
>>>>>>Hence, this patch introduces a configuration parameter
>>>>>>    /sys/kernel/taskstats_tgid_exit
>>>>>>through which a privileged user can turn on/off sending of
>>>>>>per-tgid stats on
>>>>>>task exit.
>>>>>>      
>>>>>>            
>>>>>>
>>>>>That seems a bit clumsy.  What happens if one consumer wants the
>>>>>per-tgid
>>>>>stats and another does not?
>>>>>    
>>>>>          
>>>>>
>>Then the tgid stat sending on exit will need to be turned on for
>>everyone.
>>    
>>
>
>I guess that is the limitation of taskstats. One multicast socket for
>every listeners.
>
>  
>
>>>>For all subsystems that re-use the taskstats structure from the exit
>>>>path,
>>>>we have the issue that you mentioned. Thats because several
>>>>statistics co-exist
>>>>in the same structure. These subsystems can keep their tgid-stats
>>>>empty by not
>>>>filling up anything in fill_tgid() or using this patch to
>>>>selectively enable/disable
>>>>tgid stats.
>>>>  
>>>>For other subsystems, they could pass tgidstats as NULL to
>>>>taskstats_exit_send().
>>>>
>>>>  
>>>>        
>>>>
>>>I don't understand.  If a subsystem exists then it fills in its slots in
>>>the taskstats structure, doesn't it?
>>> 
>>>
>>>      
>>>
>>It can choose not to, by not inserting its "fill my fields" function
>>inside the do..while_each_thread
>>loop within fill_tgid. So while they would still necessarily receive
>>the per-tgid taskstats struct on exit
>>(because some other subsystem needs it), they can atleast save on
>>filling up their part of the struct
>>if they don't need it.
>>
>>    
>>
>>>No other subsystem needs a global knob, does it?
>>> 
>>>
>>>      
>>>
>>I didn't understand.
>>
>>    
>>
>>>You see the problem - if one userspace package wants the tgid-stats and
>>>another concurrently-running one does now, what do we do?  Just leave it
>>>enabled and run a bit slower?
>>> 
>>>
>>>      
>>>
>>Yes, thats what will have to be done. If one user wants, all users
>>will need to get the stats. They can
>>limit their impact by not processing the parts of the netlink message
>>that correspond to the per-tgid stats
>>(since the per-tgid stats are sent as a separate attribute, thats easy
>>to do).
>>
>>This patch covers the use case where someone like CSA is the only user
>>(delay accounting is turned off)
>>and wants to reduce the performance impact of the kernel allocating,
>>sending and userspace discarding
>>the per-tgid stats.
>>
>>    
>>
>>>If so, how much slower?  Your changelog says some potential users don't
>>>need the tgid-stats, but so what?  I assume this patch is a performance
>>>thing? 
>>>      
>>>
>>Yes, its a performance optimization.
>>    
>>
>
>Well, for every task exists, two sets of  data (of struct taskstats) would
>be sent from kernel: one is the stats for the pid, the other is the
>up-to-current stats for the thread (tgid).
>
>Strictly speakly,  the second set of data is not per-task stats. For
>accounting
>subsystems that do not use thread as aggregation, 50% of the data from
>the kernel is useless. The option to not sending thread data is very
>important.
>Of course we are betting a customer site does not run two different
>application on the same system.
>
>  
>
>>>If so, has it been quantified?
>>> 
>>>
>>>      
>>>
>>No :-(
>>Will try to get some numbers.
>>Jay/Chris, if you can try to do that too, for the kind of usage that
>>is typical of CSA,
>>that would be great.
>>    
>>
>
>Probably not until some time next week. But as i point out, 50% of
>traffic is
>not useful to CSA.
>  
>
Jay,

There is one optimization that is already in the current code that is 
relevant here:

when a task is the only thread in its thread group, we only send per-pid 
stats, not the per-tgid
stats (which would be exactly the same).

So the net volume of data going out is not 2x for the whole machine.
It is 2x only when the thread that exits belongs to a thread group that 
currently has other members.

I don't know how common this case is ? I would think most processes are 
single-threaded. The apps
which are heavily multithreaded might also use them in a "pooled" model 
(i.e. fewer exits even though there
is a lot of multithreading). Java apps might be different 
though....don't they also operate using pools of threads ?

Note this is quite apart from the issue of how much impact an extra tgid 
stat has even if such exits are frequent.
I'm trying to get a handle on that number as we speak.

--Shailabh


>Thanks,
> - jay
>
>  
>
>>--Shailabh
>>
>>    
>>
>
>  
>


^ permalink raw reply	[flat|nested] 134+ messages in thread

* Re: [Patch][RFC]  Disabling per-tgid stats on task exit in taskstats
  2006-06-09 18:35         ` Jay Lan
@ 2006-06-09 19:31           ` Shailabh Nagar
  0 siblings, 0 replies; 134+ messages in thread
From: Shailabh Nagar @ 2006-06-09 19:31 UTC (permalink / raw)
  To: Jay Lan; +Cc: balbir, Andrew Morton, jlan, csturtiv, linux-kernel

Jay Lan wrote:

>Balbir Singh wrote:
>  
>
>>Andrew Morton wrote:
>>    
>>
>>>On Fri, 09 Jun 2006 16:21:46 +0530
>>>Balbir Singh <balbir@in.ibm.com> wrote:
>>>
>>>
>>>      
>>>
>>>>Andrew Morton wrote:
>>>>
>>>>        
>>>>
>>>>>On Fri, 09 Jun 2006 03:41:04 -0400
>>>>>Shailabh Nagar <nagar@watson.ibm.com> wrote:
>>>>>
>>>>>
>>>>>
>>>>>          
>>>>>
>>>>>>Hence, this patch introduces a configuration parameter
>>>>>>    /sys/kernel/taskstats_tgid_exit
>>>>>>through which a privileged user can turn on/off sending of
>>>>>>per-tgid stats on
>>>>>>task exit.
>>>>>>            
>>>>>>
>>>>>That seems a bit clumsy.  What happens if one consumer wants the
>>>>>per-tgid
>>>>>stats and another does not?
>>>>>          
>>>>>
>>>>For all subsystems that re-use the taskstats structure from the exit
>>>>path,
>>>>we have the issue that you mentioned. Thats because several
>>>>statistics co-exist
>>>>in the same structure. These subsystems can keep their tgid-stats
>>>>empty by not
>>>>filling up anything in fill_tgid() or using this patch to
>>>>selectively enable/disable
>>>>tgid stats.
>>>>
>>>>For other subsystems, they could pass tgidstats as NULL to
>>>>taskstats_exit_send().
>>>>
>>>>        
>>>>
>>>I don't understand.  If a subsystem exists then it fills in its slots in
>>>the taskstats structure, doesn't it?
>>>
>>>No other subsystem needs a global knob, does it?
>>>
>>>You see the problem - if one userspace package wants the tgid-stats and
>>>another concurrently-running one does now, what do we do?  Just leave it
>>>enabled and run a bit slower?
>>>      
>>>
>>Another option is to get the package to define their own taskstats
>>genetlink attribute and fill it up in taskstats_exit_send(). This
>>would be similar to
>>TASKSTATS_TYPE_AGGR_PID/TGID.
>>
>>They can make this attribute independent of the taskstats structure
>>and fill
>>it based on their policy (per-pid or per-tgid). But the current interface
>>users like CSA want to build on top of the taskstats structure.
>>    
>>
>
>That was my question to you from the beginning: do you propose a common
>interface based on taskstats or genetlink?
>  
>
Actually its both, at this time. Preference is for packages to use 
taskstats unless they have absolutely
nothing in common with the stats already in struct taskstats...at which 
point they could choose to use
a different structure and ship it (alongwith the taskstats structure) 
using a different netlink attribute.
Since the processing of data happens via attributes, existing users 
(like the daemons of CSA, delay accounting
etc.) can simply ignore that extra attribute coming along.

>If CSA defines its own taskstats genetlink attirbute, does it listen to
>the same socket as delayacct? If yes, then the socket will be jammed with
>duplicate information before long.
>  
>
Very true. The use of a common taskstats ensures that no duplication 
needs to occur and as long
as performance is not an issue, extending taskstats is the preferable way.

What Balbir was pointing out is that the current taskstats interface is 
flexible enough, on account of its
use of netlink attributes, to allow other users of the interface to 
define their own attributes. But this is not
something that should be pursued at this stage.

>Is it an option to make per-tgid data a unicast? Ie, your daemon
>periodically
>polling the per-tgid stats?
>  
>
That option already exists (daemon can do a GET of per-tgid stats).
However, the reason it won't work is because the stats accumulated 
between the last poll and the
exit of the task will get lost. That was the motivation behind the 
"push" of stats from the kernel in the
first place.

As I noted in the other mail, since the per-tgid stats are actually sent 
out very few times in practice
(because of the optimization to not send it when the exiting thread is 
the only one in its thread group),
the extra data/overhead is unlikely to be an issue.

--Shailabh

>Thanks,
> - jay
>
>
>  
>
>>>If so, how much slower?  Your changelog says some potential users don't
>>>need the tgid-stats, but so what?  I assume this patch is a performance
>>>thing?  If so, has it been quantified?
>>>
>>>      
>>>
>>    
>>
>
>  
>


^ permalink raw reply	[flat|nested] 134+ messages in thread

* Re: [Patch][RFC]  Disabling per-tgid stats on task exit in taskstats
  2006-06-09 11:21     ` Andrew Morton
  2006-06-09 13:20       ` Shailabh Nagar
  2006-06-09 15:36       ` Balbir Singh
@ 2006-06-09 21:56       ` Shailabh Nagar
  2006-06-09 22:42         ` Jay Lan
  2006-06-21 19:11         ` Jay Lan
  2 siblings, 2 replies; 134+ messages in thread
From: Shailabh Nagar @ 2006-06-09 21:56 UTC (permalink / raw)
  To: Andrew Morton; +Cc: balbir, jlan, csturtiv, linux-kernel

Andrew Morton wrote:
> 
> You see the problem - if one userspace package wants the tgid-stats and
> another concurrently-running one does now, what do we do?  Just leave it
> enabled and run a bit slower?
> 
> If so, how much slower?  Your changelog says some potential users don't
> need the tgid-stats, but so what?  I assume this patch is a performance
> thing?  If so, has it been quantified?


Here are some results from running a simple program (source below) that does
10 iterations of creating and then destroying 1000 threads. On the side, another utility
kept reading the pid (+tgid if present) stats from exiting tasks.


	Yes	No	Ovhd
user	0.14	0.15	-6%
system	1.61	1.54	+5%
elapsed	2.01	1.94	+3%

Yes = tgid stats printed on exit
No = not printed
Ovhd = (Yes-No)/No * 100

So even in this extreme case where the per-tgid stats are indeed
half of the total data, the overhead is not very significant.

As pointed out earlier, more representative cases are
- single threaded apps (e.g. make -jX) where the current
taskstats interface already optimizes by not sending redundant per-tgid stats, or
- server-type multithreaded apps where the exits are going to be relatively infrequent (due to
reuse of thread pools) so the extra per-tgid output is not going to have much impact.

I'd suggest we drop the idea of including this patch until we have data showing that
the overhead is an issue.

--Shailabh



#include <errno.h>
#include <limits.h>
#include <pthread.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/types.h>
#include <unistd.h>

/*
 * Number of threads created per iteration.  main() sets it (default 5,
 * or the value of argv[1]) and slow_exit() reads it to compute each
 * thread's sleep time.
 */
int n;

/*
 * slow_exit - thread start routine: sleep briefly, then terminate.
 * @arg: the thread's creation index, carried by value inside the pointer.
 *
 * Sleeps for (n - index) * 2 microseconds, so the sleep shrinks as the
 * creation index grows.
 *
 * Returns NULL.  The original fell off the end of a non-void function,
 * which leaves the thread's exit value indeterminate.
 */
void *slow_exit(void *arg)
{
	/*
	 * Go through intptr_t: casting a pointer directly to int truncates
	 * on LP64 targets and triggers a pointer-to-int-cast warning.
	 */
	int i = (int)(intptr_t)arg;

	usleep((n - i) * 2);
	return NULL;
}

/*
 * parse_count - convert a decimal command-line argument to a bounded count.
 * @text: NUL-terminated argument string.
 * @max:  largest value accepted.
 * @out:  receives the parsed value; written only on success.
 *
 * Returns 0 on success, -1 if @text is empty, has trailing garbage, is
 * negative, or exceeds @max.  atoi() cannot report any of these, and a
 * negative count would otherwise become a huge allocation size or a
 * loop that never reaches zero.
 */
static int parse_count(const char *text, int max, int *out)
{
	char *end;
	long val;

	errno = 0;
	val = strtol(text, &end, 10);
	if (end == text || *end != '\0' || errno == ERANGE)
		return -1;
	if (val < 0 || val > max)
		return -1;

	*out = (int)val;
	return 0;
}

/*
 * main - repeatedly create and join a batch of short-lived threads.
 *
 * Usage: prog [nthreads [repetitions]]
 *   nthreads     threads created per iteration (default 5)
 *   repetitions  number of create/join iterations (default 10)
 *
 * Each iteration starts n threads running slow_exit() and then joins
 * all of them.  Returns 0 on success and EXIT_FAILURE on bad arguments,
 * allocation failure, or a pthread error.
 */
int main(int argc, char *argv[])
{
	int i, rc, rep;
	pthread_t *ppthread;

	n = 5;
	/*
	 * slow_exit() computes (n - i) * 2 in int arithmetic, so cap n at
	 * INT_MAX / 2 to keep that product from overflowing.
	 */
	if (argc > 1 && parse_count(argv[1], INT_MAX / 2, &n)) {
		fprintf(stderr, "Invalid thread count: %s\n", argv[1]);
		return EXIT_FAILURE;
	}

	rep = 10;
	if (argc > 2 && parse_count(argv[2], INT_MAX, &rep)) {
		fprintf(stderr, "Invalid repetition count: %s\n", argv[2]);
		return EXIT_FAILURE;
	}

	/* calloc() checks the n * size product; a zero-size request may
	 * legitimately return NULL, so only treat NULL as failure if n > 0. */
	ppthread = calloc((size_t)n, sizeof(*ppthread));
	if (ppthread == NULL && n > 0) {
		fprintf(stderr, "Memory allocation failure\n");
		return EXIT_FAILURE;
	}

	for (; rep > 0; rep--) {
		for (i = 0; i < n; i++) {
			/* Pass the index by value; intptr_t makes the
			 * int -> pointer conversion size-correct. */
			rc = pthread_create(&ppthread[i], NULL,
					    slow_exit, (void *)(intptr_t)i);
			if (rc) {
				fprintf(stderr, "Error creating thread %d\n", i);
				exit(EXIT_FAILURE);
			}
		}
		for (i = 0; i < n; i++) {
			rc = pthread_join(ppthread[i], NULL);
			if (rc) {
				fprintf(stderr, "Error joining thread %d\n", i);
				exit(EXIT_FAILURE);
			}
		}
	}

	free(ppthread);
	return 0;
}




^ permalink raw reply	[flat|nested] 134+ messages in thread

* Re: [Patch][RFC]  Disabling per-tgid stats on task exit in taskstats
  2006-06-09 21:56       ` Shailabh Nagar
@ 2006-06-09 22:42         ` Jay Lan
  2006-06-09 23:22           ` Andrew Morton
  2006-06-21 19:11         ` Jay Lan
  1 sibling, 1 reply; 134+ messages in thread
From: Jay Lan @ 2006-06-09 22:42 UTC (permalink / raw)
  To: Shailabh Nagar; +Cc: Andrew Morton, balbir, jlan, csturtiv, linux-kernel

Shailabh Nagar wrote:
>Andrew Morton wrote:
>  
>>You see the problem - if one userspace package wants the tgid-stats and
>>another concurrently-running one does now, what do we do?  Just leave it
>>enabled and run a bit slower?
>>
>>If so, how much slower?  Your changelog says some potential users don't
>>need the tgid-stats, but so what?  I assume this patch is a performance
>>thing?  If so, has it been quantified?
>>    
>
>
>Here are some results from running a simple program (source below) that does
>10 iterations of creating and then destroying 1000 threads. On the side, another utility
>kept reading the pid (+tgid if present) stats from exiting tasks.
>
>
>	Yes	No	Ovhd
>user	0.14	0.15	-6%
>system	1.61	1.54	+5%
>elapsed	2.01	1.94	+3%
>
>Yes = tgid stats printed on exit
>No = not printed
>Ovhd = (Yes-No)/No * 100
>
>So even in this extreme case where the per-tgid stats are indeed
>half of the total data, the overhead is not very significant.
>
>As pointed out earlier, more representative cases are
>- single threaded apps (e.g. make -jX) where the current
>taskstats interface already optimizes by not sending redundant per-tgid stats, or
>  

How is it done?

In do_exit(), you have taskstats_exit_send(tsk, tidstats, tgidstats).
The tgid data would not be sent if
    not is_thread_group, or tgidstats is NULL.

The tgidstats is allocated within do_exit() also in
taskstats_exit_alloc(&tidstats, &tgidstats) and it seems to me
there is no flag to fail the memory allocation. Since tgidstats pointer
is valid, the data will be sent always.

Not filling up the tgid data fields would end up sending bunch of 0's
down to the userland.

>- server-type multithreaded apps where the exits are going to be relatively infrequent (due to
>reuse of thread pools) so the extra per-tgid output is not going to have much impact.
>  

I can expect to see our customers running highly multithreaded apps,
although i do not know whether those applications use the thread pools.

>I'd suggest we drop the idea of including this patch until we have data showing that
>the overhead is an issue.
>  

I do not have CSA kernel patch to test until some time next week to run
some tests. If we agree that it is a good idea to provide such an option, we
should proceed with that.

I found we should not mix tgid stats data and various subsystem stats
data defined in taskstats struct together in our discussion. These are two
different things. On exit of each task, we would send one taskstats struct
data for the pid and also another taskstats struct for tgid. Within
the taskstats struct, we have data for delayacct, essential common
accting data (such as utime, stime, start_time, etc both BSD and CSA
would need), and data mostly used by CSA.
If we (including Andrew) decide to adopt taskstats as the common
accounting interface, we all need to live with taskstats struct. But, that
does not mean we need to have both per-pid and per-tgid stats on every
process exit.

If you can show me how to not sending per-tgid with current patchset,
i would be very happy to drop this request.

Thanks!
 - jay

>--Shailabh
>
>
>
>#include <stdio.h>
>#include <stdlib.h>
>#include <sys/types.h>
>#include <unistd.h>
>#include <pthread.h>
>
>int n;
>
>void *slow_exit(void *arg)
>{
>	int i = (int) arg;
>	usleep((n-i)*2);
>}
>
>int main(int argc, char *argv[])
>{
>	int i,rc, rep;
>	pthread_t *ppthread;
>	
>	n = 5 ;
>	if (argc > 1)
>		n = atoi(argv[1]);
>
>	rep = 10;
>	if (argc > 2)
>		rep = atoi(argv[2]);
>
>	ppthread = malloc(n * sizeof(pthread_t));
>	if (ppthread == NULL) {
>		printf("Memory allocation failure\n");
>		exit(-1);
>	}
>
>	while (rep) {
>		for (i=0; i<n; i++) {
>			rc = pthread_create(&ppthread[i], NULL,
>					    slow_exit, (void *)i);
>			if (rc) {
>				printf("Error creating thread %d\n", i);
>				exit(-1);
>			}
>		}
>		for (i=0; i<n; i++) {
>			rc = pthread_join(ppthread[i], NULL);
>			if (rc) {
>				printf("Error joining thread %d\n", i);
>				exit(-1);
>			}
>		}
>		rep--;
>	}
>}
>
>
>
>  


^ permalink raw reply	[flat|nested] 134+ messages in thread

* Re: [Patch][RFC]  Disabling per-tgid stats on task exit in taskstats
  2006-06-09 22:42         ` Jay Lan
@ 2006-06-09 23:22           ` Andrew Morton
  2006-06-09 23:47             ` Jay Lan
  0 siblings, 1 reply; 134+ messages in thread
From: Andrew Morton @ 2006-06-09 23:22 UTC (permalink / raw)
  To: Jay Lan; +Cc: nagar, balbir, jlan, csturtiv, linux-kernel

Jay Lan <jlan@engr.sgi.com> wrote:
>
> If you can show me how to not sending per-tgid with current patchset,
> i would be very happy to drop this request.

pleeeze, not a global sysctl.  It should be some per-client subscription thing.

But the overhead at present is awfully low.  If we don't need this ability
at present (and I don't think we do) then a paper design would be
sufficient at this time.  As long as we know we can do this in the future
without breaking existing APIs then OK.


^ permalink raw reply	[flat|nested] 134+ messages in thread

* Re: [Patch][RFC]  Disabling per-tgid stats on task exit in taskstats
  2006-06-09 23:22           ` Andrew Morton
@ 2006-06-09 23:47             ` Jay Lan
  2006-06-09 23:56               ` Andrew Morton
                                 ` (2 more replies)
  0 siblings, 3 replies; 134+ messages in thread
From: Jay Lan @ 2006-06-09 23:47 UTC (permalink / raw)
  To: Andrew Morton; +Cc: nagar, balbir, jlan, csturtiv, linux-kernel

Andrew Morton wrote:
>Jay Lan <jlan@engr.sgi.com> wrote:
>  
>>If you can show me how to not sending per-tgid with current patchset,
>>i would be very happy to drop this request.
>>    
>
>pleeeze, not a global sysctl.  It should be some per-client subscription thing.
>  

Per-client subscription is not possible since it is the push (multicast)
model we
talk about and delayacct needs tgid.

>But the overhead at present is awfully low.  If we don't need this ability
>at present (and I don't think we do) then a paper design would be
>sufficient at this time.  As long as we know we can do this in the future
>without breaking existing APIs then OK.
>  
i can see if an exiting process is the only process in the thread group,
the (not is_thread_group) condition would be true. So, that leaves
multi-threaded applications that are not interested in tgid-data still
receive 2x taskstats data.

Is a system-wide switch that bad? A site  that needs tgid stats can live
with the performance consequence while those do not need tgid can
enjoy a pure per-task stats data. (I would argue that a thread group
is some sort of task aggregate.)

How about sending tgid stats when the last process in the group exits?
But do not send it if it is not the last in the thread group?

Thanks,
 - jay


^ permalink raw reply	[flat|nested] 134+ messages in thread

* Re: [Patch][RFC]  Disabling per-tgid stats on task exit in taskstats
  2006-06-09 23:47             ` Jay Lan
@ 2006-06-09 23:56               ` Andrew Morton
  2006-06-10 12:21               ` Shailabh Nagar
  2006-06-10 13:05               ` Shailabh Nagar
  2 siblings, 0 replies; 134+ messages in thread
From: Andrew Morton @ 2006-06-09 23:56 UTC (permalink / raw)
  To: Jay Lan; +Cc: nagar, balbir, jlan, csturtiv, linux-kernel

Jay Lan <jlan@engr.sgi.com> wrote:
>
> Is a system-wide switch that bad?

Yes, it's awful.  OK, we might band-aid something like that onto an
existing feature which had compatibility requirements, but for brand-new
code, no.  Let's get it right.

> A site  that needs tgid stats can live
> with the performance consequence while those do not need tgid can
> enjoy a pure per-task stats data. (I would argue that a thread group
> is some sort of task aggregate.)

But the performance impact was negligible.  A few percent on a workload
which just sat in a fork/exit busyloop.

> How about sending tgid stats when the last process in the group exist?
> But do not send it if not the last in the thread?

That'd be one for Balbir to think about.

^ permalink raw reply	[flat|nested] 134+ messages in thread

* Re: [Patch][RFC]  Disabling per-tgid stats on task exit in taskstats
  2006-06-09 23:47             ` Jay Lan
  2006-06-09 23:56               ` Andrew Morton
@ 2006-06-10 12:21               ` Shailabh Nagar
  2006-06-12 18:31                 ` Jay Lan
  2006-06-10 13:05               ` Shailabh Nagar
  2 siblings, 1 reply; 134+ messages in thread
From: Shailabh Nagar @ 2006-06-10 12:21 UTC (permalink / raw)
  To: Jay Lan; +Cc: Andrew Morton, balbir, jlan, csturtiv, linux-kernel

Jay Lan wrote:

>Andrew Morton wrote:
>  
>
>

>>But the overhead at present is awfully low.  If we don't need this ability
>>at present (and I don't think we do) then a paper design would be
>>sufficient at this time.  As long as we know we can do this in the future
>>without breaking existing APIs then OK.
>> 
>>    
>>
>i can see if an exiting process is the only process in the thread group,
>the (not is_thread_group) condition would be true. So, that leaves
>multi-threaded applications that are not interested in tgid-data still
>receive 2x taskstats data.
>  
>
Jay,

Why is the 2x taskstats data for the multithreaded app a real problem ?
When different clients agree to use a common taskstats structure, they 
also incur the potential
overhead of receiving extra data they don't really care about (in CSA's 
case, that would be all the
delay accounting fields of struct taskstats). Isn't that, in some sense, 
the "price" of sharing a structure
or delivery mechanism ?

Of course, if this overhead becomes too much, we need to find 
alternatives. But, as already shown,
even in the extreme case where app does nothing but fork/exit, there is very
little performance impact. So I don't see how in the common case of 
multithreaded apps, where exits
are going to be at a far lesser rate, the extra per-tgid data is a real 
issue.

So, are we trying to solve a real problem ?

I'll address the alternatives in a separate mail but lets address this 
point first please.

--Shailabh

^ permalink raw reply	[flat|nested] 134+ messages in thread

* Re: [Patch][RFC]  Disabling per-tgid stats on task exit in taskstats
  2006-06-09 23:47             ` Jay Lan
  2006-06-09 23:56               ` Andrew Morton
  2006-06-10 12:21               ` Shailabh Nagar
@ 2006-06-10 13:05               ` Shailabh Nagar
  2006-06-12 18:54                 ` Jay Lan
  2 siblings, 1 reply; 134+ messages in thread
From: Shailabh Nagar @ 2006-06-10 13:05 UTC (permalink / raw)
  To: Jay Lan; +Cc: Andrew Morton, balbir, jlan, csturtiv, linux-kernel

Jay Lan wrote:

>Andrew Morton wrote:
>  
>
>>Jay Lan <jlan@engr.sgi.com> wrote:
>> 
>>    
>>
>>>If you can show me how to not sending per-tgid with current patchset,
>>>i would be very happy to drop this request.
>>>   
>>>      
>>>
>>pleeeze, not a global sysctl.  It should be some per-client subscription thing.
>> 
>>    
>>
>
>Per-client subscription is not possible since it is the push (multicast)
>model we
>talk about and delayacct needs tgid.
>  
>
One way to do per-client subscription that Balbir brought up
is to have separate multicast groups for the clients wanting to receive 
per-pid stats and per-tgid stats.

However, this does change the current API since a separate connect to 
the per-tgid multicast group is needed.
So it's not an option that can be tacked on later but needs to be done now.

>How about sending tgid stats when the last process in the group exist?
>But do not send it if not the last in the thread?
>
>  
>
This is doable if we have a place where the per-tgid data can be 
accumulated.
One choice that was explored and discarded was to have a struct 
taskstats allocated as part of mm struct,
and keep accumulating per-pid stats into that struct (i.e. while filling 
the per-pid stat struct, accumulate into the
per-tgid struct too) which obviously doubles the collection overhead. 
Instead we chose to collect the per-tgid
stats dynamically.

However, we can consider allocating a per-tgid struct as part of the 
exit routine (when we notice a thread exiting
that is part of a thread group) and accumulate stats from each exiting 
thread of that group into the per-tgid stat and
output it along with the last exiting thread.

This would also save on the cost of collecting the entire per-tgid data 
each time a thread exits (as is being done now).

This solution is also a bit of an API change since the kind of data 
being received on the common multicast channel
will be different from what it is now. Also looks a little involved.


So we have solutions for the problem going forward, but not without 
changing the API.
Question is: does this really need to be done even in future ? If so, 
then we should perhaps do the change rightaway.

One more point to consider here - if a third or fourth subsystem were to 
come along to use the taskstats
interface and did not want to use the taskstats structure (since they 
have no field in common)...their clients
would still need to be able to accept getting data they don't care about 
(whether they have one or two multicast
groups). So the model for dealing with unwanted data will still need to 
be "don't process the netlink attributes
you don't care about". But thats farther into the future...


--Shailabh


^ permalink raw reply	[flat|nested] 134+ messages in thread

* Re: [Patch][RFC]  Disabling per-tgid stats on task exit in taskstats
  2006-06-10 12:21               ` Shailabh Nagar
@ 2006-06-12 18:31                 ` Jay Lan
  2006-06-12 21:57                   ` Shailabh Nagar
  0 siblings, 1 reply; 134+ messages in thread
From: Jay Lan @ 2006-06-12 18:31 UTC (permalink / raw)
  To: Shailabh Nagar; +Cc: Jay Lan, Andrew Morton, balbir, csturtiv, linux-kernel

Shailabh Nagar wrote:
> Jay Lan wrote:
> 
>> Andrew Morton wrote:
>>  
>>
>>
> 
>>> But the overhead at present is awfully low.  If we don't need this 
>>> ability
>>> at present (and I don't think we do) then a paper design would be
>>> sufficient at this time.  As long as we know we can do this in the 
>>> future
>>> without breaking existing APIs then OK.
>>>
>>>   
>>
>> i can see if an exiting process is the only process in the thread group,
>> the (not is_thread_group) condition would be true. So, that leaves
>> multi-threaded applications that are not interested in tgid-data still
>> receive 2x taskstats data.
>>  
>>
> Jay,
> 
> Why is the 2x taskstats data for the multithreaded app a real problem ?
> When differnt clients agree to use a common taskstats structure, they 
> also incur the potential
> overhead of receiving extra data they don't really care about (in CSA's 
> case, that would be all the
> delay accounting fields of struct taskstats). Isn't that, in some sense, 
> the "price" of sharing a structure
> or delivery mechanism ?

You are mixing the two types of overhead: 1) overhead due to tgid,
2) overhead due to extra fields of struct taskstats they don't care
about.

The type 2 overhead for CSA is very small, but is bigger for you. In our
discussion earlier, i told you (and you accepted) that i will insert
128 bytes of data into taskstat struct. I have not finalized the CSA
work yet, but it can be 168 additional bytes or close to that number:

         /* Common Accounting Fields start */
         u32     ac_uid;                 /* User ID */
         u32     ac_gid;                 /* Group ID */
         u32     ac_pid;                 /* Process ID */
         u32     ac_ppid;                /* Parent process ID */
         struct timespec start_time;     /* Start time */
         struct timespec exit_time;      /* Exit time */
         u64     ac_utime;               /* User CPU time [usec] */
         u64     ac_stime;               /* SYstem CPU time [usec] */
         /* Common Accounting Fields end */

         /* CSA accounting fields start */
         u64     ac_sbu;                 /* System billing units */
         u16     csa_revision;           /* CSA Revision */
         u8      csa_type;               /* Record types */
         u8      csa_flag;               /* Record flags */
         u8      ac_stat;                /* Exit status */
         u8      ac_nice;                /* Nice value */
         u8      ac_sched;               /* Scheduling discipline */
         u8      pad0;                   /* Unused */
         u64     acct_rss_mem1;          /* accumulated rss usage */
         u64     acct_vm_mem1;           /* accumulated virtual memory 
usage */
         u64     hiwater_rss;            /* High-watermark of RSS usage 
*/
         u64     hiwater_vm;             /* High-water virtual memory 
usage */
         u64     ac_minflt;              /* Minor Page Fault */
         u64     ac_majflt;              /* Major Page Fault */
         u64     ac_chr;                 /* bytes read */
         u64     ac_chw;                 /* bytes written */
         u64     ac_scr;                 /* read syscalls */
         u64     ac_scw;                 /* write syscalls */
         u64     ac_jid;                 /* Job ID */
         /* CSA accounting fields end */

This is type 2 overhead. The bigger the type 2 overhead, the bigger
the impact of sending tgid data.

> 
> Of course, if this overhead becomes too much, we need to find 
> alternatives. But, as already shown,
> even in the extreme case where app does nothing but fork/exit, there is 
> very
> little performance impact. So I don't see how in the common case of 
> multithreaded apps, where exits
> are going to be at a far lesser rate, the extra per-tgid data is a real 
> issue.

Yes, application handles "real" work between fork and exit. But,
each task within a thread group still trigger do_exit on termination,
right?

> 
> So, are we trying to solve a real problem ?

I do not know, but i am concerned. I will run some testing with the
taskstats struct above and get some data.

Thanks,
  - jay


> 
> I'll address the alternatives in a separate mail but lets address this 
> point first please.
> 
> --Shailabh


^ permalink raw reply	[flat|nested] 134+ messages in thread

* Re: [Patch][RFC]  Disabling per-tgid stats on task exit in taskstats
  2006-06-10 13:05               ` Shailabh Nagar
@ 2006-06-12 18:54                 ` Jay Lan
  0 siblings, 0 replies; 134+ messages in thread
From: Jay Lan @ 2006-06-12 18:54 UTC (permalink / raw)
  To: Shailabh Nagar; +Cc: Jay Lan, Andrew Morton, balbir, csturtiv, linux-kernel

Shailabh Nagar wrote:
> Jay Lan wrote:
> 
>> Andrew Morton wrote:
>>  
>>
>>> Jay Lan <jlan@engr.sgi.com> wrote:
>>>
>>>   
>>>
>>>> If you can show me how to not sending per-tgid with current patchset,
>>>> i would be very happy to drop this request.
>>>>       
>>>
>>> pleeeze, not a global sysctl.  It should be some per-client 
>>> subscription thing.
>>>
>>>   
>>
>>
>> Per-client subscription is not possible since it is the push (multicast)
>> model we
>> talk about and delayacct needs tgid.
>>  
>>
> One way to do per-client subscription that Balbir brought up
> is to have separate multicast groups for the clients wanting to receive 
> per-pid stats and per-tgid stats.
> 
> However, this does change the current API since a separate connect to 
> the per-tgid multicast group is needed.
> So its not a option that can be tagged on later but needs to be done now.
> 
>> How about sending tgid stats when the last process in the group exist?
>> But do not send it if not the last in the thread?
>>
>>  
>>
> This is doable if we have a place where the per-tgid data can be 
> accumalated.
> One choice that was explored and discarded was to have a struct 
> taskstats allocated as part of mm struct,
> and keep accumalating per-pid stats into that struct (ie. while filling 
> the per-pid stat struct, accumalate into the
> per-tgid struct too) which obviously doubles the collection overhead. 
> Instead we chose to collect the per-tgid
> stats dynamically.
> 
> However, we can consider allocating a per-tgid struct as part of the 
> exit routine (when we notice a thread exiting
> that is part of a thread group) and accumalate stats from each exiting 
> thread of that group into the per-tgid stat and
> output it alongwith the last exiting thread.

This sounds a good plan. You do allocating a per-tgid struct only once
per thread group, right?

> 
> This would also save on the cost of collecting the entire per-tgid data 
> each time a thread exits (as is being done now).
> 
> This solution is also a bit of an API change since the kind of data 
> being received on the common multicast channel
> will be different from what it is now. Also looks a little involved.

I am confused. Wouldn't it simply be a change in the test of when to
process and write the tgid data? The API seems to me unchanged? Do
i miss something?

Regards,
  - jay


> 
> 
> So we have solutions for the problem going forward, but not without 
> changing the API.
> Question is: does this really need to be done even in future ? If so, 
> then we should perhaps do the change rightaway.
> 
> One more point to consider here - if a third or fourth subsystem were to 
> come along to use the taskstats
> interface and did not want to use the taskstats structure (since they 
> have no field in common)...their clients
> would still need to be able to accept getting data they don't care about 
> (whether they have one or two multicast
> groups). So the model for dealing with unwanted data will still need to 
> be "don't process the netlink attributes
> you don't care about". But thats farther into the future...
> 
> 
> --Shailabh
> 


^ permalink raw reply	[flat|nested] 134+ messages in thread

* Re: [Patch][RFC]  Disabling per-tgid stats on task exit in taskstats
  2006-06-12 18:31                 ` Jay Lan
@ 2006-06-12 21:57                   ` Shailabh Nagar
  0 siblings, 0 replies; 134+ messages in thread
From: Shailabh Nagar @ 2006-06-12 21:57 UTC (permalink / raw)
  To: Jay Lan; +Cc: Jay Lan, Andrew Morton, balbir, csturtiv, linux-kernel

Jay Lan wrote:

> Shailabh Nagar wrote:
>
>> Jay Lan wrote:
>>
>>> Andrew Morton wrote:
>>>  
>>>
>>>
>>
>>>> But the overhead at present is awfully low.  If we don't need this 
>>>> ability
>>>> at present (and I don't think we do) then a paper design would be
>>>> sufficient at this time.  As long as we know we can do this in the 
>>>> future
>>>> without breaking existing APIs then OK.
>>>>
>>>>   
>>>
>>>
>>> i can see if an exiting process is the only process in the thread 
>>> group,
>>> the (not is_thread_group) condition would be true. So, that leaves
>>> multi-threaded applications that are not interested in tgid-data still
>>> receive 2x taskstats data.
>>>  
>>>
>> Jay,
>>
>> Why is the 2x taskstats data for the multithreaded app a real problem ?
>> When differnt clients agree to use a common taskstats structure, they 
>> also incur the potential
>> overhead of receiving extra data they don't really care about (in 
>> CSA's case, that would be all the
>> delay accounting fields of struct taskstats). Isn't that, in some 
>> sense, the "price" of sharing a structure
>> or delivery mechanism ?
>
>
> You are mixing the two types of overhead: 1) overhead due to tgid,
> 2) overhead due to extra fields of struct taskstats they don't care
> about.

You're right..I am mixing the two..but only to make the point that
clients have to deal with extra data they don't care about anyway. As 
long as the performance overhead
of that isn't significant, it's not an issue.
Also, unlike the shared taskstats structure, discarding the excess per-tgid 
data is even easier
because it comes in its own netlink attribute.

>
> The type 2 overhead for CSA is very small, but is bigger for you. In our
> discussion earlier, i told you (and you accpeted) that i will insert
> 128 bytes of data into taskstat struct. I have not finalized the CSA
> work yet, but it can be 168 additional bytes or close to that number:
>
>         /* Common Accounting Fields start */
>         u32     ac_uid;                 /* User ID */
>         u32     ac_gid;                 /* Group ID */
>         u32     ac_pid;                 /* Process ID */
>         u32     ac_ppid;                /* Parent process ID */
>         struct timespec start_time;     /* Start time */
>         struct timespec exit_time;      /* Exit time */
>         u64     ac_utime;               /* User CPU time [usec] */
>         u64     ac_stime;               /* SYstem CPU time [usec] */
>         /* Common Accounting Fields end */
>
>         /* CSA accounting fields start */
>         u64     ac_sbu;                 /* System billing units */
>         u16     csa_revision;           /* CSA Revision */
>         u8      csa_type;               /* Record types */
>         u8      csa_flag;               /* Record flags */
>         u8      ac_stat;                /* Exit status */
>         u8      ac_nice;                /* Nice value */
>         u8      ac_sched;               /* Scheduling discipline */
>         u8      pad0;                   /* Unused */
>         u64     acct_rss_mem1;          /* accumulated rss usage */
>         u64     acct_vm_mem1;           /* accumulated virtual memory 
> usage */
>         u64     hiwater_rss;            /* High-watermark of RSS usage */
>         u64     hiwater_vm;             /* High-water virtual memory 
> usage */
>         u64     ac_minflt;              /* Minor Page Fault */
>         u64     ac_majflt;              /* Major Page Fault */
>         u64     ac_chr;                 /* bytes read */
>         u64     ac_chw;                 /* bytes written */
>         u64     ac_scr;                 /* read syscalls */
>         u64     ac_scw;                 /* write syscalls */
>         u64     ac_jid;                 /* Job ID */
>         /* CSA accounting fields end */
>
> This is type 2 overhead. The bigger overhead in type 2, the bigger
> impact of sending tgid data is bigger.

Fair enough.   So lets see what the excess 168 bytes does in terms of 
perf and make a determination
based on that ?

>>
>> Of course, if this overhead becomes too much, we need to find 
>> alternatives. But, as already shown,
>> even in the extreme case where app does nothing but fork/exit, there 
>> is very
>> little performance impact. So I don't see how in the common case of 
>> multithreaded apps, where exits
>> are going to be at a far lesser rate, the extra per-tgid data is a 
>> real issue.
>
>
> Yes, application handles "real" work between fork and exit. But,
> each task within a thread group still trigger do_exit on termination,
> right?

Yes...but I don't see the point ? If exits happen at a very slow rate, 
then the performance impact will drop
compared to if they happen at the insane rate in the toy program. So 
rate of exit is a factor..or did I not
get your point ?

>
>>
>> So, are we trying to solve a real problem ?
>
>
> I do not know, but i am concerned. I will run some testing with the
> taskstats struct above and get some data.


Sounds good. Please share asap so that 2.6.18 acceptance isn't held up.

Regards,
Shailabh


^ permalink raw reply	[flat|nested] 134+ messages in thread

* Re: [Patch][RFC]  Disabling per-tgid stats on task exit in taskstats
  2006-06-09 21:56       ` Shailabh Nagar
  2006-06-09 22:42         ` Jay Lan
@ 2006-06-21 19:11         ` Jay Lan
  2006-06-21 19:14           ` Jay Lan
  2006-06-21 20:38           ` Andrew Morton
  1 sibling, 2 replies; 134+ messages in thread
From: Jay Lan @ 2006-06-21 19:11 UTC (permalink / raw)
  To: Shailabh Nagar; +Cc: Andrew Morton, balbir, csturtiv, linux-kernel

Shailabh Nagar wrote:
>Andrew Morton wrote:
>  
>>You see the problem - if one userspace package wants the tgid-stats and
>>another concurrently-running one does now, what do we do?  Just leave it
>>enabled and run a bit slower?
>>
>>If so, how much slower?  Your changelog says some potential users don't
>>need the tgid-stats, but so what?  I assume this patch is a performance
>>thing?  If so, has it been quantified?
>>    
>
>
>Here are some results from running a simple program (source below) that does
>10 iterations of creating and then destroying 1000 threads. On the side, another utility
>kept reading the pid (+tgid if present) stats from exiting tasks.
>  

I ran my testing using the same program posted by Shailabh, attached in his
posting.

System: SGI a350, a two cpus IA64 machine.
Kernel:  2.6.17-rc3 + delay-acct-taskstats patch set
       + tgid-disable_patch_shailabh + exit race patch_balbir +
csa_patch_jlan

I also modified Documentation/accounting/getdelays.c:
   - it repeatedly does recv() to retrieve data from the kernel
   - instead of using printf() to display the data received, i simply write
it to
     disk as an accounting daemon would. Note that currently
both the
     BSD (or GNU) accounting and CSA write accounting data from the kernel.
     As part of the effort to move the accounting system to userspace, the raw data
needs
     to be written to a raw file first before further processing.

In Shailabh's testing, he ran his 'mkthreads' for 10 iterations of creating and
destroying 1000 threads.  I had to increase my test to 5000 iterations
in order
to receive meaningful data: 'mkthreads 1000 5000'.

I used Shailabh's per-tgid-disable patch to run my tests with per-tgid
enabled and disabled. I used 'sa' command of 'acct' package to report
results of 5 runs of 'mkthreads 1000 5000'.

>
>
>	Yes	No	Ovhd
>user	0.14	0.15	-6%
>system	1.61	1.54	+5%
>elapsed	2.01	1.94	+3%
>
>Yes = tgid stats printed on exit
>No = not printed
>Ovhd = (Yes-No)/No * 100
>  

Here are test results:

             Yes      No     Ovhd
user         1.77    0.44    302.27%
system       0.06    0.06      0.00%
elapsed    794.60  316.40    151.14%

Also, the results of five runs of per-tgid-disabled were very
consistent (3 runs of 0.44 seconds and 2 runs of 0.45), while
those of per-tgid-enabled varied (1.56, 1.99, 1.54, 2.21, 1.57).

The impact of per-tgid stats is too significant to ignore for
those who do not need the per-tgid stats data.

Another observation that i considered bad news is that all
10 runs produced 1 to 5 recv() errors with errno=105 (ENOBUFS).

Here I attach my csa_taskstats patch and my modified version of
exit_recv.c.

Regards,
 - jay


>So even in this extreme case where the per-tgid stats are indeed
>half of the total data, the overhead is not very significant.
>
>As pointed out earlier, more representative cases are
>- single threaded apps (e.g. make -jX) where the current
>taskstats interface already optimizes by not sending redundant per-tgid stats, or
>- server-type multithreaded apps where the exits are going to be relatively infrequent (due to
>reuse of thread pools) so the extra per-tgid output is not going to have much impact.
>
>I'd suggest we drop the idea of including this patch until we have data showing that
>the overhead is an issue.
>
>--Shailabh
>
>
>
>#include <stdio.h>
>#include <stdlib.h>
>#include <sys/types.h>
>#include <unistd.h>
>#include <pthread.h>
>
>int n;
>
>void *slow_exit(void *arg)
>{
>	int i = (int) arg;
>	usleep((n-i)*2);
>}
>
>int main(int argc, char *argv[])
>{
>	int i,rc, rep;
>	pthread_t *ppthread;
>	
>	n = 5 ;
>	if (argc > 1)
>		n = atoi(argv[1]);
>
>	rep = 10;
>	if (argc > 2)
>		rep = atoi(argv[2]);
>
>	ppthread = malloc(n * sizeof(pthread_t));
>	if (ppthread == NULL) {
>		printf("Memory allocation failure\n");
>		exit(-1);
>	}
>
>	while (rep) {
>		for (i=0; i<n; i++) {
>			rc = pthread_create(&ppthread[i], NULL,
>					    slow_exit, (void *)i);
>			if (rc) {
>				printf("Error creating thread %d\n", i);
>				exit(-1);
>			}
>		}
>		for (i=0; i<n; i++) {
>			rc = pthread_join(ppthread[i], NULL);
>			if (rc) {
>				printf("Error joining thread %d\n", i);
>				exit(-1);
>			}
>		}
>		rep--;
>	}
>}
>
>
>
>  


^ permalink raw reply	[flat|nested] 134+ messages in thread

* Re: [Patch][RFC]  Disabling per-tgid stats on task exit in taskstats
  2006-06-21 19:11         ` Jay Lan
@ 2006-06-21 19:14           ` Jay Lan
  2006-06-21 19:34             ` Shailabh Nagar
  2006-06-21 20:38           ` Andrew Morton
  1 sibling, 1 reply; 134+ messages in thread
From: Jay Lan @ 2006-06-21 19:14 UTC (permalink / raw)
  To: Shailabh Nagar; +Cc: Andrew Morton, balbir, csturtiv, linux-kernel

[-- Attachment #1: Type: text/plain, Size: 4536 bytes --]

Jay Lan wrote:
>Shailabh Nagar wrote:
>  
>>Andrew Morton wrote:
>> 
>>    
>>>You see the problem - if one userspace package wants the tgid-stats and
>>>another concurrently-running one does now, what do we do?  Just leave it
>>>enabled and run a bit slower?
>>>
>>>If so, how much slower?  Your changelog says some potential users don't
>>>need the tgid-stats, but so what?  I assume this patch is a performance
>>>thing?  If so, has it been quantified?
>>>   
>>>      
>>Here are some results from running a simple program (source below) that does
>>10 iterations of creating and then destroying 1000 threads. On the side, another utility
>>kept reading the pid (+tgid if present) stats from exiting tasks.
>> 
>>    
>
>I ran my testing using the same program posted by Shalilabh attached in his
>posting.
>
>System: SGI a350, a two cpus IA64 machine.
>Kernel:  2.6.17-rc3 + delay-acct-taskstats patch set
>       + tgid-disable_patch_shailabh + exit race patch_balbir +
>csa_patch_jlan
>
>I also modified the Decumentation/accounting/getdelay.c:
>   - it repeatedly does recv() to retrieve data from kernel
>   - instead of using printf() to display data received, i simply write
>it to
>     disk as it would be for an accounting daemon. Note that currently
>both the
>     BSD (or GNU) accounting and the CSA writes accounting data from kernel.
>     As an effort of moving accounting system to userspace, the raw data
>needs
>     to be written to a raw file first before further processing.
>
>In Shailabh's testing, he ran his 'mkthreads' 10 iterations of creating and
>distroying 1000 threads.  I had to increase my test to 5000 iterations
>in order
>to receive meaningful data: 'mkthreads 1000 5000'.
>
>I used Shailabh's per-tgid-disable patch to run my tests with per-tgid
>enabled and disabled. I used 'sa' command of 'acct' package to report
>results of 5 runs of 'mkthreads 1000 5000'.
>
>  
>>	Yes	No	Ovhd
>>user	0.14	0.15	-6%
>>system	1.61	1.54	+5%
>>elapsed	2.01	1.94	+3%
>>
>>Yes = tgid stats printed on exit
>>No = not printed
>>Ovhd = (Yes-No)/No * 100
>> 
>>    
>
>Here are test results:
>
>             Yes      No     Ovhd
>user         1.77    0.44    302.27%
>system       0.06    0.06      0.00%
>  

Please swap "user" label with "system" label. Sorry.

Also i forgot to attach the two files.

- jay

>elapsed    794.60  316.40    151.14%
>
>Also, the results of five runs of per-tgid-disabled were very
>consistent (3 runs of 0,44 seconds and 2 runs of 0.45), while
>those of per-tgid-enabled varies (1.56, 1.99, 1.54, 2.21, 1.57).
>
>The impact of per-tgid stats is too significant to ignore for
>those who do not need the per-tgid stats data.
>
>Another observation that i considered bad news is that all
>10 runs produced 1 to 5 recv() error with errno=105 (ENOBUF).
>
>Here I attach my csa_taskstats patch and my modified version of
>exit_recv.c.
>
>Regards,
> - jay
>
>
>  
>>So even in this extreme case where the per-tgid stats are indeed
>>half of the total data, the overhead is not very significant.
>>
>>As pointed out earlier, more representative cases are
>>- single threaded apps (e.g. make -jX) where the current
>>taskstats interface already optimizes by not sending redundant per-tgid stats, or
>>- server-type multithreaded apps where the exits are going to be relatively infrequent (due to
>>reuse of thread pools) so the extra per-tgid output is not going to have much impact.
>>
>>I'd suggest we drop the idea of including this patch until we have data showing that
>>the overhead is an issue.
>>
>>--Shailabh
>>
>>
>>
>>#include <stdio.h>
>>#include <stdlib.h>
>>#include <sys/types.h>
>>#include <unistd.h>
>>#include <pthread.h>
>>
>>int n;
>>
>>void *slow_exit(void *arg)
>>{
>>	int i = (int) arg;
>>	usleep((n-i)*2);
>>}
>>
>>int main(int argc, char *argv[])
>>{
>>	int i,rc, rep;
>>	pthread_t *ppthread;
>>	
>>	n = 5 ;
>>	if (argc > 1)
>>		n = atoi(argv[1]);
>>
>>	rep = 10;
>>	if (argc > 2)
>>		rep = atoi(argv[2]);
>>
>>	ppthread = malloc(n * sizeof(pthread_t));
>>	if (ppthread == NULL) {
>>		printf("Memory allocation failure\n");
>>		exit(-1);
>>	}
>>
>>	while (rep) {
>>		for (i=0; i<n; i++) {
>>			rc = pthread_create(&ppthread[i], NULL,
>>					    slow_exit, (void *)i);
>>			if (rc) {
>>				printf("Error creating thread %d\n", i);
>>				exit(-1);
>>			}
>>		}
>>		for (i=0; i<n; i++) {
>>			rc = pthread_join(ppthread[i], NULL);
>>			if (rc) {
>>				printf("Error joining thread %d\n", i);
>>				exit(-1);
>>			}
>>		}
>>		rep--;
>>	}
>>}
>>
>>
>>
>> 
>>    
>
>  


[-- Attachment #2: csa_taskstats.patch --]
[-- Type: text/plain, Size: 10542 bytes --]

Index: linux/include/linux/taskstats.h
===================================================================
--- linux.orig/include/linux/taskstats.h	2006-06-19 18:27:38.881105605 -0700
+++ linux/include/linux/taskstats.h	2006-06-20 11:37:51.278901513 -0700
@@ -15,6 +15,11 @@
 #ifndef _LINUX_TASKSTATS_H
 #define _LINUX_TASKSTATS_H
 
+#ifdef __KERNEL__
+#include <linux/time.h>
+#include <linux/sched.h>
+#endif
+
 /* Format for per-task data returned to userland when
  *	- a task exits
  *	- listener requests stats for a task
@@ -84,6 +89,40 @@ struct taskstats {
 
 	/* version of taskstats */
 	__u64	version;
+
+ 	/* Common Accounting Fields start */
+ 	__u32	ac_uid;			/* User ID */
+ 	__u32	ac_gid;			/* Group ID */
+ 	__u32	ac_pid;			/* Process ID */
+ 	__u32	ac_ppid;		/* Parent process ID */
+ 	struct timespec	start_time;	/* Start time */
+ 	struct timespec exit_time;	/* Exit time */
+ 	__u64	ac_utime;		/* User CPU time [usec] */
+ 	__u64	ac_stime;		/* SYstem CPU time [usec] */
+ 	char	ac_comm[TASK_COMM_LEN];	/* Command name */
+ 	/* Common Accounting Fields end */
+
+ 	/* CSA accounting fields start */
+ 	__u64	ac_sbu;			/* System billing units */
+ 	__u16	csa_revision;		/* CSA Revision */
+ 	__u8	csa_type;		/* Record types */
+ 	__u8	csa_flag;		/* Record flags */
+ 	__u8	ac_stat;		/* Exit status */
+ 	__u8	ac_nice;		/* Nice value */
+ 	__u8	ac_sched;		/* Scheduling discipline */
+ 	__u8	pad0;			/* Unused */
+ 	__u64	acct_rss_mem1;		/* accumulated rss usage */
+ 	__u64	acct_vm_mem1;		/* accumulated virtual memory usage */
+ 	__u64	hiwater_rss;		/* High-watermark of RSS usage */
+ 	__u64	hiwater_vm;		/* High-water virtual memory usage */
+ 	__u64	ac_minflt;		/* Minor Page Fault */
+ 	__u64	ac_majflt;		/* Major Page Fault */
+ 	__u64	ac_chr;			/* bytes read */
+ 	__u64	ac_chw;			/* bytes written */
+ 	__u64	ac_scr;			/* read syscalls */
+ 	__u64	ac_scw;			/* write syscalls */
+ 	__u64	ac_jid;			/* Job ID */
+ 	/* CSA accounting fields end */
 
 };
 
Index: linux/init/Kconfig
===================================================================
--- linux.orig/init/Kconfig	2006-06-19 18:27:38.913105990 -0700
+++ linux/init/Kconfig	2006-06-20 11:37:51.290901649 -0700
@@ -173,6 +173,31 @@ config TASK_DELAY_ACCT
 
 	  Say N if unsure.
 
+config CSA_ACCT
+	bool "Enable CSA Job Accounting (EXPERIMENTAL)"
+	depends on TASKSTATS
+	help
+	  Comprehensive System Accounting (CSA) provides job level
+	  accounting of resource usage.  The accounting records are
+	  written by the kernel into a file.  CSA user level scripts
+	  and commands process the binary accounting records and
+	  combine them by job identifier within system boot uptime
+	  periods.  These accounting records are then used to produce
+	  reports and charge fees to users.
+
+	  Say Y here if you want job level accounting to be compiled
+	  into the kernel.  Say M here if you want the writing of
+	  accounting records portion of this feature to be a loadable
+	  module.  Say N here if you do not want job level accounting
+	  (the default).
+
+	  The CSA commands and scripts package needs to be installed
+	  to process the CSA accounting records.  See
+	  http://oss.sgi.com/projects/csa for further information
+	  about CSA and download instructions for the CSA commands
+	  package and documentation.
+
+
 config SYSCTL
 	bool "Sysctl support"
 	---help---
Index: linux/kernel/Makefile
===================================================================
--- linux.orig/kernel/Makefile	2006-06-19 18:27:38.929106183 -0700
+++ linux/kernel/Makefile	2006-06-20 11:37:51.290901649 -0700
@@ -40,6 +40,7 @@ obj-$(CONFIG_RCU_TORTURE_TEST) += rcutor
 obj-$(CONFIG_RELAY) += relay.o
 obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o
 obj-$(CONFIG_TASKSTATS) += taskstats.o
+obj-$(CONFIG_CSA_ACCT) += csa.o
 
 ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y)
 # According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
Index: linux/kernel/csa.c
===================================================================
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ linux/kernel/csa.c	2006-06-20 11:37:51.294901694 -0700
@@ -0,0 +1,66 @@
+/*
+ * Copyright (c) 2006 Silicon Graphics, Inc All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of
+ * the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free Software Foundation, Inc.,
+ * 59 Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information:  Silicon Graphics, Inc., 1500 Crittenden Lane,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ */
+
+#include <linux/taskstats.h>
+#include <linux/csa_kern.h>
+
+int csa_add_tsk(struct taskstats *stats, struct task_struct *p)
+{
+	stats->version  = 0x3132333435363738;
+	stats->ac_uid	= 0x39393939;	/* p->uid; */
+	stats->ac_gid	= 0x38383838;	/* p->gid; */
+	stats->ac_pid	= p->pid;
+	stats->ac_ppid	= (p->parent) ? p->parent->pid : 0;
+	stats->ac_utime	= p->utime * USEC_PER_TICK;
+	stats->ac_stime	= p->stime * USEC_PER_TICK;
+	/* Each process gets a minimum of a half tick cpu time */
+	if ((stats->ac_utime == 0) && (stats->ac_stime == 0)) {
+		stats->ac_stime = USEC_PER_TICK/2;
+	}
+
+	stats->start_time = p->start_time;
+	do_posix_clock_monotonic_gettime(&stats->exit_time);
+	strncpy(stats->ac_comm, p->comm, sizeof(stats->ac_comm));
+
+	stats->ac_sbu = 0;
+	stats->csa_revision = REV_CSA;
+	stats->csa_type = 0;
+	stats->csa_flag = 0;
+	stats->ac_stat  = p->exit_code;
+	stats->ac_nice  = task_nice(p);
+	stats->ac_sched = p->policy;
+	stats->acct_rss_mem1 = p->acct_rss_mem1;
+	stats->acct_vm_mem1  = p->acct_vm_mem1;
+	if (p->mm) {
+		stats->hiwater_rss   = p->mm->hiwater_rss;
+		stats->hiwater_vm    = p->mm->hiwater_vm;
+	}
+	stats->ac_minflt = p->min_flt;
+	stats->ac_majflt = p->maj_flt;
+	stats->ac_chr	= p->rchar;
+	stats->ac_chw	= p->wchar;
+	stats->ac_scr	= p->syscr;
+	stats->ac_scw	= p->syscw;
+	stats->ac_jid	= 0xffffffffffffffff;
+	return 0;
+}
Index: linux/kernel/taskstats.c
===================================================================
--- linux.orig/kernel/taskstats.c	2006-06-20 11:34:51.652867983 -0700
+++ linux/kernel/taskstats.c	2006-06-20 11:37:51.298901739 -0700
@@ -19,6 +19,7 @@
 #include <linux/kernel.h>
 #include <linux/taskstats_kern.h>
 #include <linux/delayacct.h>
+#include <linux/csa_kern.h>
 #include <net/genetlink.h>
 #include <asm/atomic.h>
 
@@ -123,8 +124,16 @@ static int fill_pid(pid_t pid, struct ta
 	 */
 
 	rc = delayacct_add_tsk(stats, tsk);
+/*
+	if (rc)
+		goto err;
+ */
+	rc = csa_add_tsk(stats, tsk);
+	if (rc) {
+		goto err;
+	}
 
-	/* Define err: label here if needed */
+err:	/* Define err: label here if needed */
 	put_task_struct(tsk);
 	return rc;
 
@@ -269,12 +278,14 @@ void taskstats_exit_send(struct task_str
 		size = 2 * size;	/* PID + STATS + TGID + STATS */
 
 	rc = prepare_reply(NULL, TASKSTATS_CMD_NEW, &rep_skb, &reply, size);
-	if (rc < 0)
+	if (rc < 0) {
 		goto ret;
+	}
 
 	rc = fill_pid(tsk->pid, tsk, tidstats);
-	if (rc < 0)
+	if (rc < 0) {
 		goto err_skb;
+	}
 
 	tidstats->version = TASKSTATS_VERSION;
 
Index: linux/include/linux/csa_kern.h
===================================================================
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ linux/include/linux/csa_kern.h	2006-06-20 11:37:51.314901921 -0700
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2006 Silicon Graphics, Inc All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of
+ * the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free Software Foundation, Inc.,
+ * 59 Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information:  Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ */
+/*
+ *  CSA (Comprehensive System Accounting)
+ *  Job Accounting for Linux
+ *
+ *  This header file contains the definitions needed for job
+ *  accounting. The kernel CSA accounting module code and all
+ *  user-level programs that try to write or process the binary job
+ *  accounting data must include this file.
+ *
+ *  This kernel header file and the csa.h in the csa userland source
+ *  rpm share same data struct declaration and #define's. Do not modify
+ *  one without modify the other one as well. The compatibility between
+ *  userland and the kernel is ensured by using the 'ah_revision' field
+ *  of struct achead.
+ *
+ */
+
+#ifndef _CSA_KERN_H
+#define _CSA_KERN_H
+
+#include <linux/time.h>
+
+extern int csa_add_tsk(struct taskstats *, struct task_struct *);
+
+/*
+ * Record revision levels.
+ *
+ * These are incremented to indicate that a record's format has changed since
+ * a previous release.
+ *
+ * History:     05000   The first rev in Linux
+ *              06000   Major rework to clean up unused fields and features.
+ *                      No binary compatibility with earlier rev.
+ *		07000	Convert to taskstats interface
+ *
+ * NOTE: The header revision number was defined as 02400 in earlier version.
+ *       However, since ah_revision was defined as 15-bit field (ah_magic
+ *       takes up 17 bits), the revision number is read as twice the value in
+ *       new code. So, define it to be 05000 accordingly.
+ */
+#define REV_CSA		07000	/* Kernel: CSA base record */
+
+
+/* this defines can be removed once they're available in kernel header files */
+/* #define USEC_PER_SEC 1000000L */	/* number of usecs for 1 second */
+#define USEC_PER_TICK	(USEC_PER_SEC/HZ)
+
+
+#endif	/* _CSA_KERN_H */

[-- Attachment #3: exit_recv.c --]
[-- Type: text/plain, Size: 9110 bytes --]

/* getdelays.c
 *
 * Utility to get per-pid and per-tgid delay accounting statistics
 * Also illustrates usage of the taskstats interface
 *
 * Copyright (C) Shailabh Nagar, IBM Corp. 2005
 * Copyright (C) Balbir Singh, IBM Corp. 2006
 * Copyright (c) Jay Lan, SGI. 2006
 *
 */

#include <stdio.h>
#include <stdlib.h>
#include <errno.h>
#include <unistd.h>
#include <poll.h>
#include <string.h>
#include <fcntl.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/socket.h>
#include <sys/types.h>
#include <signal.h>

#include <linux/genetlink.h>
#include <taskstats.h>

/*
 * Generic macros for dealing with netlink sockets. Might be duplicated
 * elsewhere. It is recommended that commercial grade applications use
 * libnl or libnetlink and use the interfaces provided by the library
 *
 * GENLMSG_DATA(glh)    - address GENL_HDRLEN bytes past NLMSG_DATA(glh)
 * GENLMSG_PAYLOAD(glh) - NLMSG_PAYLOAD(glh, 0) minus GENL_HDRLEN
 * NLA_DATA(na)         - address NLA_HDRLEN bytes past the attribute header
 * NLA_PAYLOAD(len)     - len minus NLA_HDRLEN
 *
 * Pointer offsets are computed on char * (arithmetic on void * is a GNU
 * extension) and every macro argument is parenthesized so that compound
 * expressions expand with the intended precedence.
 */
#define GENLMSG_DATA(glh)	((void *)((char *)NLMSG_DATA(glh) + GENL_HDRLEN))
#define GENLMSG_PAYLOAD(glh)	(NLMSG_PAYLOAD(glh, 0) - GENL_HDRLEN)
#define NLA_DATA(na)		((void *)((char *)(na) + NLA_HDRLEN))
#define NLA_PAYLOAD(len)	((len) - NLA_HDRLEN)

/* Print a printf-style message to stdout, then exit the process with status `code`. */
#define err(code, fmt, arg...) do { printf(fmt, ##arg); exit(code); } while (0)
/* Set to 1 by the sigchld() signal handler. */
int done = 0;

/* dbg gates the PRINTF() debug output; Delayacct is set by the -D option. */
int dbg=0, Delayacct=0;
/* Running totals accumulated by print_csa() from ac_stime / ac_utime. */
__u64 stime, utime;
/*
 * Debug printf: forwards to printf() only while dbg is non-zero.
 * Wrapped in do { } while (0) so it expands to a single statement and
 * stays correct in an unbraced if/else.
 */
#define PRINTF(fmt, arg...) do {		\
	    if (dbg) {				\
		printf(fmt, ##arg);		\
	    }					\
	} while (0)
/*
 * Open a raw AF_NETLINK socket for `protocol` and bind it to a local
 * address whose nl_groups field is `groups`.
 *
 * Returns the bound descriptor, or -1 when socket() or bind() fails
 * (the descriptor is closed before returning on a bind() failure).
 */
static int create_nl_socket(int protocol, int groups)
{
    struct sockaddr_nl local;
    int sock = socket(AF_NETLINK, SOCK_RAW, protocol);

    if (sock < 0)
	return -1;

    memset(&local, 0, sizeof(local));
    local.nl_family = AF_NETLINK;
    local.nl_groups = groups;

    if (bind(sock, (struct sockaddr *) &local, sizeof(local)) >= 0)
	return sock;

    /* bind failed: release the descriptor before reporting the error */
    close(sock);
    return -1;
}

/*
 * Send bufLen bytes starting at buf on socket s, addressed to a
 * zero-filled AF_NETLINK sockaddr.
 *
 * After a short send the remaining bytes are retried; a failed send is
 * retried only while errno is EAGAIN.
 *
 * Returns 0 once a sendto() call has accepted everything that was left,
 * -1 on any other failure.
 */
int sendto_fd(int s, const char *buf, int bufLen)
{
    struct sockaddr_nl dest;
    int sent;

    memset(&dest, 0, sizeof(dest));
    dest.nl_family = AF_NETLINK;

    for (;;) {
	sent = sendto(s, buf, bufLen, 0, (struct sockaddr *) &dest,
		      sizeof(dest));
	if (sent >= bufLen)
	    break;

	if (sent > 0) {
	    /* partial send: advance past what was accepted and retry */
	    buf += sent;
	    bufLen -= sent;
	    continue;
	}

	if (errno != EAGAIN)
	    return -1;
    }
    return 0;
}

/*
 * Probe the controller in genetlink to find the family id
 * for the TASKSTATS family
 *
 * sd: netlink socket used to send the CTRL_CMD_GETFAMILY request and to
 *     receive the reply.
 *
 * Returns the 16-bit value of the second attribute of the reply when that
 * attribute is CTRL_ATTR_FAMILY_ID, and 0 otherwise.  A send or receive
 * failure, an invalid reply or an NLMSG_ERROR reply terminates the process
 * with exit status 1.
 */
int get_family_id(int sd)
{
    struct {
	struct nlmsghdr n;
	struct genlmsghdr g;
	char buf[256];
    } family_req;
    struct {
	struct nlmsghdr n;
	struct genlmsghdr g;
	char buf[256];
    } ans;

    int id = 0;		/* stays 0 when no family id attribute is found */
    struct nlattr *na;
    int rep_len;

    /*
     * Zero both messages so that header fields not assigned below and any
     * attribute padding never carry indeterminate stack contents.
     */
    memset(&family_req, 0, sizeof(family_req));
    memset(&ans, 0, sizeof(ans));

    /* Get family name */
    family_req.n.nlmsg_type = GENL_ID_CTRL;
    family_req.n.nlmsg_flags = NLM_F_REQUEST;
    family_req.n.nlmsg_seq = 0;
    family_req.n.nlmsg_pid = getpid();
    family_req.n.nlmsg_len = NLMSG_LENGTH(GENL_HDRLEN);
    family_req.g.cmd = CTRL_CMD_GETFAMILY;
    family_req.g.version = 0x1;
    na = (struct nlattr *) GENLMSG_DATA(&family_req);
    na->nla_type = CTRL_ATTR_FAMILY_NAME;
    na->nla_len = strlen(TASKSTATS_GENL_NAME) + 1 + NLA_HDRLEN;
    strcpy(NLA_DATA(na), TASKSTATS_GENL_NAME);
    family_req.n.nlmsg_len += NLMSG_ALIGN(na->nla_len);

    if (sendto_fd(sd, (char *) &family_req, family_req.n.nlmsg_len) < 0)
	err(1, "error sending message via Netlink\n");

    rep_len = recv(sd, &ans, sizeof(ans), 0);

    if (rep_len < 0)
	err(1, "error receiving reply message via Netlink\n");


    /* Validate response message */
    if (!NLMSG_OK((&ans.n), rep_len))
	err(1, "invalid reply message received via Netlink\n");

    if (ans.n.nlmsg_type == NLMSG_ERROR) {	/* error */
	printf("error received NACK - leaving\n");
	exit(1);
    }


    /* Step over the first attribute of the reply and inspect the second one */
    na = (struct nlattr *) GENLMSG_DATA(&ans);
    na = (struct nlattr *) ((char *) na + NLA_ALIGN(na->nla_len));
    if (na->nla_type == CTRL_ATTR_FAMILY_ID) {
	id = *(__u16 *) NLA_DATA(na);
    }
    return id;
}

/*
 * Print the CPU, IO and MEM delay-accounting counters of *t to stdout as
 * a small table with a heading row above each row of values.
 *
 * Every counter is cast to unsigned long long so the argument type matches
 * the %llu conversion on platforms where __u64 is defined as unsigned long.
 */
void print_taskstats(struct taskstats *t)
{
    printf("\n\nCPU   %15s%15s%15s%15s\n"
	   "      %15llu%15llu%15llu%15llu\n"
	   "IO    %15s%15s\n"
	   "      %15llu%15llu\n"
	   "MEM   %15s%15s\n"
	   "      %15llu%15llu\n\n",
	   "count", "real total", "virtual total", "delay total",
	   (unsigned long long) t->cpu_count,
	   (unsigned long long) t->cpu_run_real_total,
	   (unsigned long long) t->cpu_run_virtual_total,
	   (unsigned long long) t->cpu_delay_total,
	   "count", "delay total",
	   (unsigned long long) t->blkio_count,
	   (unsigned long long) t->blkio_delay_total,
	   "count", "delay total",
	   (unsigned long long) t->swapin_count,
	   (unsigned long long) t->swapin_delay_total);
}

/*
 * Print the command name, system/user CPU time and elapsed time
 * (exit_time - start_time, in milliseconds) of *t to stdout, and add the
 * record's ac_stime / ac_utime to the global stime / utime totals.
 *
 * The elapsed time is computed in long long and every numeric argument is
 * cast to unsigned long long so that it matches its %llu conversion; the
 * previous code passed an int for %llu, which is undefined behaviour.
 */
void print_csa(struct taskstats *t)
{
    long long sec, nsec, elapsed_msec;

    sec = (long long) t->exit_time.tv_sec - (long long) t->start_time.tv_sec;
    nsec = (long long) t->exit_time.tv_nsec - (long long) t->start_time.tv_nsec;
    elapsed_msec = sec * 1000 + nsec / 1000000;

    printf("Command='%s'\n stime=%15llu, utime=%15llu, elapsed=%15llu msec\n",
    	t->ac_comm,
	(unsigned long long) t->ac_stime,
	(unsigned long long) t->ac_utime,
	(unsigned long long) elapsed_msec);
    stime += t->ac_stime;
    utime += t->ac_utime;
}

/*
 * Signal handler: records that the signal arrived by setting the global
 * `done` flag to 1.  The signal number itself is not used.
 */
void sigchld(int sig)
{
    (void) sig;
    done = 1;
}

int main(int argc, char *argv[])
{
    int rc;
    int sk_nl;
    struct nlmsghdr *nlh;
    struct genlmsghdr *genlhdr;
    __u16 id;
    struct nlattr *na;
    struct {
        struct nlmsghdr n;
	struct genlmsghdr g;
	char buf[800];
    } exitmsg;
    int	fd;

    /* For receiving */
    struct sockaddr_nl kern_nla, from_nla;
    socklen_t from_nla_len;
    int recv_len;

    int nl_sd = -1;
    int rep_len;
    int len = 0;
    int aggr_len, len2;
    struct sockaddr_nl nladdr;
    pid_t tid = 0;
    pid_t rtid = 0;
    int c;
    int count = 0, csa_summary=0;
    int write_file = 1;
    struct sigaction act = {
	.sa_handler = SIG_IGN,
    };
    struct sigaction tact = {
	.sa_handler = sigchld,
    };

    if (sigaction(SIGCHLD, &tact, NULL) < 0)
	err(1, "sigaction failed for SIGCHLD\n");

    while (1) {
    	c = getopt(argc, argv, "cdDw");
	if (c < 0)
	    break;

	switch (c) {
	    case 'c':
	    	printf("display csa\n");
		csa_summary = 1;
		break;

	    case 'd':
	    	printf("exit_recv: debug on\n");
	    	dbg = 1;
		break;

	    case 'D':
	    	printf("Delayacct summary ON\n");
		Delayacct = 1;
		break;

	    case 'w':		/* DON'T write data to a file */
	    	printf("exit_recv: write_file OFF\n");
	    	write_file = 0;
	    	break;

	    default: {
	    	printf("Unknown option %d\n", c);
		exit(-1);
	    }
	}
    }

    if (write_file)
	if ((fd = open("/var/csa/acct", O_WRONLY | O_CREAT | O_TRUNC, 
		S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH)) == -1) {
	    perror("Cannot open output file\n"); exit(1);
	}

    /* Open a NETLINK_GENERIC socket with TASKSTATS_LISTEN_GROUP */

    if ((nl_sd =
	 create_nl_socket(NETLINK_GENERIC, TASKSTATS_LISTEN_GROUP)) < 0)
	err(1, "error creating Netlink socket\n");

    if (sigaction(SIGINT, &act, NULL) < 0)
        err(1, "sigaction failed for SIGINT\n");

    do {
	int i;

	rep_len = recv(nl_sd, &exitmsg, sizeof(exitmsg), 0);
	PRINTF("\n\treceived %d bytes\n", rep_len);
	nladdr.nl_family = AF_NETLINK;
	nladdr.nl_groups = TASKSTATS_LISTEN_GROUP;

	if (exitmsg.n.nlmsg_type == NLMSG_ERROR) {	/* error */
	    printf("error received NACK - leaving\n");
	    exit(1);
	}

	if (rep_len < 0) {
	    printf("error receiving reply message via Netlink, rep_len=%d, errno=%d\n",
	    	rep_len, errno);
	    continue;
	}

	PRINTF("nlmsghdr size=%d, nlmsg_len=%d, rep_len=%d\n",
		sizeof(struct nlmsghdr), exitmsg.n.nlmsg_len, rep_len);
	/* Validate response message */
	if (!NLMSG_OK((&exitmsg.n), rep_len))
	    err(1, "invalid reply message received via Netlink\n");
/* #define NLMSG_OK(nlh,len) ((len) >= (int)sizeof(struct nlmsghdr) && \
                           (nlh)->nlmsg_len >= sizeof(struct nlmsghdr) && \
			                              (nlh)->nlmsg_len <= (len))
 */

	rep_len = GENLMSG_PAYLOAD(&exitmsg.n);

	na = (struct nlattr *) GENLMSG_DATA(&exitmsg);
	len = 0;
	i = 0;
	while (len < rep_len) {
	    len += NLA_ALIGN(na->nla_len);
	    switch (na->nla_type) {
	    case TASKSTATS_TYPE_AGGR_PID:
		/* Fall through */
	    case TASKSTATS_TYPE_AGGR_TGID:
		aggr_len = NLA_PAYLOAD(na->nla_len);
		len2 = 0;
		/* For nested attributes, na follows */
		na = (struct nlattr *) NLA_DATA(na);
		done = 0;
		while (len2 < aggr_len) {
		    switch (na->nla_type) {
		    case TASKSTATS_TYPE_PID:
			rtid = *(int *) NLA_DATA(na);
			PRINTF("PID\t%d\n", rtid);
			break;
		    case TASKSTATS_TYPE_TGID:
			rtid = *(int *) NLA_DATA(na);
			PRINTF("TGID\t%d\n", rtid);
			break;
		    case TASKSTATS_TYPE_STATS:
		    	count++;
		    	if (Delayacct)
			    print_taskstats((struct taskstats *) NLA_DATA(na));
			if (fd > 0) {
			    if (write(fd, NLA_DATA(na), na->nla_len) < 0) {
			    	err(1,"write error\n");
			    }
			}
			if (csa_summary)
			    print_csa((struct taskstats *)NLA_DATA(na));
			break;
		    default:
		    	printf("Unknown nested nla_type %d\n", na->nla_type);
			break;
		    }
		    len2 += NLA_ALIGN(na->nla_len);
		    na = (struct nlattr *) ((char *) na + len2);
		    if (done)
			break;
		}
		break;

	    default:
	    	printf("Unknown nla_type %d\n", na->nla_type);
		break;
	    }
	    na = (struct nlattr *) (GENLMSG_DATA(&exitmsg) + len);
	    if (done)
		break;
	}
	if (done)
	    break;
    }
    while (1);

    printf("Total taskstats STATS read %d, stime=%llu, utime=%llu\n",
    	count, stime, utime);
    close(nl_sd);
    if (fd > 0)
    	close(fd);
    return 0;
}

^ permalink raw reply	[flat|nested] 134+ messages in thread

* Re: [Patch][RFC]  Disabling per-tgid stats on task exit in taskstats
  2006-06-21 19:14           ` Jay Lan
@ 2006-06-21 19:34             ` Shailabh Nagar
  2006-06-21 23:35               ` Jay Lan
  0 siblings, 1 reply; 134+ messages in thread
From: Shailabh Nagar @ 2006-06-21 19:34 UTC (permalink / raw)
  To: Jay Lan; +Cc: Andrew Morton, balbir, csturtiv, linux-kernel

Jay Lan wrote:

>Jay Lan wrote:
>  
>
>>Shailabh Nagar wrote:
>> 
>>    
>>
>>>Andrew Morton wrote:
>>>
>>>   
>>>      
>>>
>>>>You see the problem - if one userspace package wants the tgid-stats and
>>>>another concurrently-running one does not, what do we do?  Just leave it
>>>>enabled and run a bit slower?
>>>>
>>>>If so, how much slower?  Your changelog says some potential users don't
>>>>need the tgid-stats, but so what?  I assume this patch is a performance
>>>>thing?  If so, has it been quantified?
>>>>  
>>>>     
>>>>        
>>>>
>>>Here are some results from running a simple program (source below) that does
>>>10 iterations of creating and then destroying 1000 threads. On the side, another utility
>>>kept reading the pid (+tgid if present) stats from exiting tasks.
>>>
>>>   
>>>      
>>>
>>I ran my tests using the same program that Shailabh attached to his
>>posting.
>>    
>>
Thanks for running this. The results look interesting.

>>System: SGI a350, a two cpus IA64 machine.
>>Kernel:  2.6.17-rc3 + delay-acct-taskstats patch set
>>      + tgid-disable_patch_shailabh + exit race patch_balbir +
>>csa_patch_jlan
>>
>>I also modified the Documentation/accounting/getdelay.c:
>>  - it repeatedly does recv() to retrieve data from kernel
>>  - instead of using printf() to display data received, i simply write
>>it to
>>    disk as it would be for an accounting daemon. Note that currently
>>both the
>>    BSD (or GNU) accounting and the CSA writes accounting data from kernel.
>>    As an effort of moving accounting system to userspace, the raw data
>>needs
>>    to be written to a raw file first before further processing.
>>    
>>
In exit_recv.c, you appear to be dumping the per-tgid data  received to 
disk too ?
If the accounting daemon isn't interested in per-tgid, shouldn't it be 
discarding the data immediately after
doing the recv() and only write to disk the data it wants ?
Perhaps I'm missing something.


<snip>

>>Another observation that i considered bad news is that all
>>10 runs produced 1 to 5 recv() error with errno=105 (ENOBUF).
>>    
>>
Wonder if this has to do with userspace not being able to keep up with 
the data flow because
of the pathological rate at which exits happen.

Anyway, let's look at the overhead part first, perhaps.

--Shailabh

^ permalink raw reply	[flat|nested] 134+ messages in thread

* Re: [Patch][RFC]  Disabling per-tgid stats on task exit in taskstats
  2006-06-21 19:11         ` Jay Lan
  2006-06-21 19:14           ` Jay Lan
@ 2006-06-21 20:38           ` Andrew Morton
  2006-06-21 21:31             ` Shailabh Nagar
  1 sibling, 1 reply; 134+ messages in thread
From: Andrew Morton @ 2006-06-21 20:38 UTC (permalink / raw)
  To: Jay Lan; +Cc: nagar, balbir, csturtiv, linux-kernel

On Wed, 21 Jun 2006 12:11:13 -0700
Jay Lan <jlan@engr.sgi.com> wrote:

> Another observation that i considered bad news is that all
> 10 runs produced 1 to 5 recv() error with errno=105 (ENOBUF).

Well that's rather bad.  AFAICT most of the allocations in there are
GFP_KERNEL, so why is this happening?

Because the kernel is producing netlink messages faster than userspace can
consume them, perhaps?  If so, the sender needs to block, which means we
need to make reception of these stats a privileged operation?

^ permalink raw reply	[flat|nested] 134+ messages in thread

* Re: [Patch][RFC]  Disabling per-tgid stats on task exit in taskstats
  2006-06-21 20:38           ` Andrew Morton
@ 2006-06-21 21:31             ` Shailabh Nagar
  2006-06-21 21:45               ` Jay Lan
  0 siblings, 1 reply; 134+ messages in thread
From: Shailabh Nagar @ 2006-06-21 21:31 UTC (permalink / raw)
  To: Andrew Morton; +Cc: Jay Lan, balbir, csturtiv, linux-kernel

Andrew Morton wrote:

>On Wed, 21 Jun 2006 12:11:13 -0700
>Jay Lan <jlan@engr.sgi.com> wrote:
>
>  
>
>>Another observation that i considered bad news is that all
>>10 runs produced 1 to 5 recv() error with errno=105 (ENOBUF).
>>    
>>
>
>Well that's rather bad.  AFAICT most of the allocations in there are
>GFP_KERNEL, so why is this happening?
>  
>

Need to trace the cause.

>Because the kernel is producing netlink messages faster than userspace can
>consume them, perhaps? 
>
Hmm...possible. A quick check would be to reduce the frequency of exits 
and see.

> If so, the sender needs to block, which means we
>need to make reception of these stats a privileged operation?
>  
>
Won't it suffice to make delivery of these stats best effort, with 
userspace dealing with missing data,
rather than risk delaying exits ? The cases where exits are so frequent 
as in this program should be
very few. Making the reception privileged would kind of constrain the 
utilization of stats and I'm not
sure if it's warranted.


--Shailabh



^ permalink raw reply	[flat|nested] 134+ messages in thread

* Re: [Patch][RFC]  Disabling per-tgid stats on task exit in taskstats
  2006-06-21 21:31             ` Shailabh Nagar
@ 2006-06-21 21:45               ` Jay Lan
  2006-06-21 21:54                 ` Andrew Morton
  2006-06-21 21:59                 ` Shailabh Nagar
  0 siblings, 2 replies; 134+ messages in thread
From: Jay Lan @ 2006-06-21 21:45 UTC (permalink / raw)
  To: Shailabh Nagar; +Cc: Andrew Morton, balbir, csturtiv, linux-kernel

Shailabh Nagar wrote:
> Andrew Morton wrote:
>
>> On Wed, 21 Jun 2006 12:11:13 -0700
>> Jay Lan <jlan@engr.sgi.com> wrote:
>>
>>  
>>
>>> Another observation that i considered bad news is that all
>>> 10 runs produced 1 to 5 recv() error with errno=105 (ENOBUF).
>>>   
>>
>> Well that's rather bad.  AFAICT most of the allocations in there are
>> GFP_KERNEL, so why is this happening?
>>  
>>
>
> Need to trace the cause.
>
>> Because the kernel is producing netlink messages faster than
>> userspace can
>> consume them, perhaps?
> Hmm...possible. A quick check would be to reduce the frequency of
> exits and see.
>
>> If so, the sender needs to block, which means we
>> need to make reception of these stats a privileged operation?
>>  
>>
> Won't it suffice to make delivery of these stats best effort, with
> userspace dealing with missing data,

How do you recover the missed data?

> rather than risk delaying exits ? The cases where exits are so
> frequent as in this program should be

This is very true. However, it was a 2p IA64 machine. I am too frightened to
speak "512p"...

Regards,
 - jay

> very few. Making the reception privileged would kind of constrain the
> utilization of stats and I'm not
> sure if  its warranted.
>
>
> --Shailabh
>
>


^ permalink raw reply	[flat|nested] 134+ messages in thread

* Re: [Patch][RFC]  Disabling per-tgid stats on task exit in taskstats
  2006-06-21 21:45               ` Jay Lan
@ 2006-06-21 21:54                 ` Andrew Morton
  2006-06-21 22:19                   ` Jay Lan
  2006-06-21 21:59                 ` Shailabh Nagar
  1 sibling, 1 reply; 134+ messages in thread
From: Andrew Morton @ 2006-06-21 21:54 UTC (permalink / raw)
  To: Jay Lan; +Cc: nagar, balbir, csturtiv, linux-kernel

On Wed, 21 Jun 2006 14:45:01 -0700
Jay Lan <jlan@engr.sgi.com> wrote:

> > Won't it suffice to make delivery of these stats best effort, with
> > userspace dealing with missing data,
> 
> How do you recover the missed data?

I suspect the best we can do is to let userspace know that data was lost. 
Is the -ENOBUFS reliable?

> > rather than risk delaying exits ? The cases where exits are so
> > frequent as in this program should be
> 
> This is very true. However, it was a 2p IA64 machine. I am too frightened to
> speak "512p"...

If we have 511 CPUs generating data faster than one CPU can handle it,
something bad will happen.  We either throttle the 511 CPUs or drop data.


^ permalink raw reply	[flat|nested] 134+ messages in thread

* Re: [Patch][RFC]  Disabling per-tgid stats on task exit in taskstats
  2006-06-21 21:45               ` Jay Lan
  2006-06-21 21:54                 ` Andrew Morton
@ 2006-06-21 21:59                 ` Shailabh Nagar
  1 sibling, 0 replies; 134+ messages in thread
From: Shailabh Nagar @ 2006-06-21 21:59 UTC (permalink / raw)
  To: Jay Lan; +Cc: Andrew Morton, balbir, csturtiv, linux-kernel

Jay Lan wrote:

>Shailabh Nagar wrote:
>  
>
>>Andrew Morton wrote:
>>
>>    
>>
>>>On Wed, 21 Jun 2006 12:11:13 -0700
>>>Jay Lan <jlan@engr.sgi.com> wrote:
>>>
>>> 
>>>
>>>      
>>>
>>>>Another observation that i considered bad news is that all
>>>>10 runs produced 1 to 5 recv() error with errno=105 (ENOBUF).
>>>>  
>>>>        
>>>>
>>>Well that's rather bad.  AFAICT most of the allocations in there are
>>>GFP_KERNEL, so why is this happening?
>>> 
>>>
>>>      
>>>
>>Need to trace the cause.
>>
>>    
>>
>>>Because the kernel is producing netlink messages faster than
>>>userspace can
>>>consume them, perhaps?
>>>      
>>>
>>Hmm...possible. A quick check would be to reduce the frequency of
>>exits and see.
>>
>>    
>>
>>>If so, the sender needs to block, which means we
>>>need to make reception of these stats a privileged operation?
>>> 
>>>
>>>      
>>>
>>Won't it suffice to make delivery of these stats best effort, with
>>userspace dealing with missing data,
>>    
>>
>
>How do you recover the missed data?
>  
>
Not recover as such but just let userspace know data was dropped so it 
can work around it.

>  
>
>>rather than risk delaying exits ? The cases where exits are so
>>frequent as in this program should be
>>    
>>
>
>This is very true. However, it was a 2p IA64 machine. I am too frightened to
>speak "512p"...
>  
>
True, but then you should presumably have more receivers or some other 
strategy to consume the output
faster ? Blocking is an even
worse idea if that many CPUs will be waiting around for stats data to be 
written out....


--Shailabh


^ permalink raw reply	[flat|nested] 134+ messages in thread

* Re: [Patch][RFC]  Disabling per-tgid stats on task exit in taskstats
  2006-06-21 21:54                 ` Andrew Morton
@ 2006-06-21 22:19                   ` Jay Lan
  0 siblings, 0 replies; 134+ messages in thread
From: Jay Lan @ 2006-06-21 22:19 UTC (permalink / raw)
  To: Andrew Morton; +Cc: nagar, balbir, csturtiv, linux-kernel

Andrew Morton wrote:
>On Wed, 21 Jun 2006 14:45:01 -0700
>Jay Lan <jlan@engr.sgi.com> wrote:
>
>  
>>>Won't it suffice to make delivery of these stats best effort, with
>>>userspace dealing with missing data,
>>>      
>>How do you recover the missed data?
>>    
>
>I suspect the best we can do is to let userspace know that data was lost. 
>Is the -ENOBUFS reliable?
>  

We need to reduce that to an acceptable rate. In real life, the rate
should be
much less. Under this test, I have one drop every < 5 minutes. I will
talk to
our daemon expert to see how we can improve it... and get a better definition of
"acceptable rate".

- jay

>  
>>>rather than risk delaying exits ? The cases where exits are so
>>>frequent as in this program should be
>>>      
>>This is very true. However, it was a 2p IA64 machine. I am too frightened to
>>speak "512p"...
>>    
>
>If we have 511 CPUs generating data faster than one CPU can handle it,
>something bad will happen.  We either throttle the 511 CPUs or drop data.
>
>  


^ permalink raw reply	[flat|nested] 134+ messages in thread

* Re: [Patch][RFC]  Disabling per-tgid stats on task exit in taskstats
  2006-06-21 19:34             ` Shailabh Nagar
@ 2006-06-21 23:35               ` Jay Lan
  2006-06-21 23:45                 ` Shailabh Nagar
  2006-06-23 17:14                 ` Shailabh Nagar
  0 siblings, 2 replies; 134+ messages in thread
From: Jay Lan @ 2006-06-21 23:35 UTC (permalink / raw)
  To: Shailabh Nagar; +Cc: Andrew Morton, balbir, csturtiv, linux-kernel


>>> System: SGI a350, a two cpus IA64 machine.
>>> Kernel:  2.6.17-rc3 + delay-acct-taskstats patch set
>>>      + tgid-disable_patch_shailabh + exit race patch_balbir +
>>> csa_patch_jlan
>>>
>>> I also modified the Decumentation/accounting/getdelay.c:
>>>  - it repeatedly does recv() to retrieve data from kernel
>>>  - instead of using printf() to display data received, i simply write
>>> it to
>>>    disk as it would be for an accounting daemon. Note that currently
>>> both the
>>>    BSD (or GNU) accounting and the CSA writes accounting data from
>>> kernel.
>>>    As an effort of moving accounting system to userspace, the raw data
>>> needs
>>>    to be written to a raw file first before further processing.
>>>   
> In exit_recv.c, you appear to be dumping the per-tgid data  received
> to disk too ?
> If the accounting daemon isn't interested in per-tgid, shouldn't it be
> discarding the data immediately after
> doing the recv() and only write to disk the data it wants ?
> Perhaps I'm missing something.
>
I modified my exit_recv.c so that
1) I can totally skip data marked TASKSTATS_TYPE_AGGR_TGID
2) I can optionally drop data after receipt without writing to disk

The first case produced a system time of 1.34 second and the second
case produced a system time of 1.25 sec.  Big improvement over 1.74
sec, but still too high compared to 0.34 sec when we disable tgid
completely.

Shailabh and I are now looking at the lock patch that fixed an exit race
crash I reported. The global lock was held too long while scanning threads.
Shailabh is working on a new patch.

- jay


^ permalink raw reply	[flat|nested] 134+ messages in thread

* Re: [Patch][RFC]  Disabling per-tgid stats on task exit in taskstats
  2006-06-21 23:35               ` Jay Lan
@ 2006-06-21 23:45                 ` Shailabh Nagar
  2006-06-23 17:14                 ` Shailabh Nagar
  1 sibling, 0 replies; 134+ messages in thread
From: Shailabh Nagar @ 2006-06-21 23:45 UTC (permalink / raw)
  To: Jay Lan; +Cc: Andrew Morton, balbir, csturtiv, linux-kernel

Jay Lan wrote:

> 
> Shailabh and me now eye on the lock patch that fixed an exit race
> crash i reported. The global lock was held too long in scanning threads.
> Shailabh is working on a new patch.

To clarify further,

when I ran the same benchmark as Jay (same set of patches, on a 2.6.17 kernel)
on a uniprocessor, I see the same kind of low differential between
tgid stat sending on and off as I was seeing before.

Using /usr/bin/time ./mkthread 1000 10
		yes	no	%Ovhd
system		1.63	1.55	+5%
elapsed		1.96	1.88	+4%

(similar differences whether data is written to file or not, only
total times change)

Since his system is SMP, one suspect is the
lock hold time of taskstats_exit_mutex. Since fill_tgid() is done
under this mutex, which serializes all task exits, and there'll be contention on
SMP, it's possible that fill_tgid()'s overhead is exacerbating the lock contention.

So I'm trying to see if a patch that uses only per-task locking will help.
Will work it out and post when patch is stable or if it helps.

--Shailabh

> 
> - jay
> 


^ permalink raw reply	[flat|nested] 134+ messages in thread

* Re: [Patch][RFC]  Disabling per-tgid stats on task exit in taskstats
  2006-06-21 23:35               ` Jay Lan
  2006-06-21 23:45                 ` Shailabh Nagar
@ 2006-06-23 17:14                 ` Shailabh Nagar
  2006-06-23 18:19                   ` Jay Lan
  2006-06-23 21:19                   ` Andrew Morton
  1 sibling, 2 replies; 134+ messages in thread
From: Shailabh Nagar @ 2006-06-23 17:14 UTC (permalink / raw)
  To: Andrew Morton; +Cc: Jay Lan, balbir, csturtiv, linux-kernel

Hi Andrew,

Two developments on the tgid overhead issue:

1. The latest results show that overhead is significant
only when the exit rate exceeds roughly 1000 threads/second.

2. A new patch that modifies the locking used within taskstats,
brings down the overhead of the extreme case quite a bit.
I'll submit the patch along shortly in a separate mail.

To get back to the effect of exit rate, I modified the fork+exit
benchmark to vary the rate at which exits happened and
ran tests on a 4-way 1.4 GHz x86_64 box. The kernel was 2.6.17,
uses the delay accounting/taskstat patches in 2.6.17-mm1 + the new
locking patch mentioned in 2. above.

The results show that differential between tgid on and off
starts becoming significant once the exit rate crosses roughly 1000
threads/second. Below that exit rate, the difference is negligible.
Above it, the difference starts climbing rapidly.

So I guess the question is whether this rate of exit is representative
enough of real life to warrant making any more changes to the existing
patchset, beyond the locking changes in 2. above.

From my limited experience, I think this is too high an exit rate
to be worrying about overhead.


        %ovhd of tgid on over off
        (higher is worse)

Exit     User     Sys     Elapsed
Rate     Time     Time    Time

2283      25.76  649.41   -0.14
1193     -10.53   88.81   -0.12
963      -11.90    3.28   -0.10
806       -8.54   -0.84    0.16
694       -4.41    2.38    0.03

Exit Rate: units are threads exiting per second.
Calculated by (#threads_forked+exited)/(elapsed_time)/2
Since app pretty much does only thread create and exit for 10000
threads (1000 threads, 10 iterations), this is a good measure
for exit rate.

%diff in user, sys, elapsed times calculated using
(tgid_on - tgid_off)/tgid_off * 100
where tgid_on/off times are reported by /usr/bin/time as before.

Each data point for tgid_on and tgid_off was an average
of 10 runs of the fork+exit benchmark.
The rate of exits was controlled by delaying the individual
threads through a usleep before being allowed to exit.

Machine was 4-way 1.6GHz x86_64 Opteron.

"exit_recv -w", the user program consuming the stats, was running
on the side, reading the stats but not writing to a file or
printing to screen.

^ permalink raw reply	[flat|nested] 134+ messages in thread

* Re: [Patch][RFC]  Disabling per-tgid stats on task exit in taskstats
  2006-06-23 17:14                 ` Shailabh Nagar
@ 2006-06-23 18:19                   ` Jay Lan
  2006-06-23 18:53                     ` Shailabh Nagar
  2006-06-23 21:19                   ` Andrew Morton
  1 sibling, 1 reply; 134+ messages in thread
From: Jay Lan @ 2006-06-23 18:19 UTC (permalink / raw)
  To: Shailabh Nagar; +Cc: Andrew Morton, balbir, csturtiv, linux-kernel

Shailabh Nagar wrote:
>Hi Andrew,
>
>Two developments on the tgid overhead issue:
>
>1. The latest results show that overhead is significant
>only when the exit rate exceeds roughly 1000 threads/second.
>  

I worked with Shailabh this week to run various tests and
debugging as he requested. I was pulled off onto an urgent
task yesterday and was surprised to see this come in this morning...

Let's slow it down please. My last testing (after your fix in
#2 below) still showed 109% overhead at system time. And, the
per-thread group processing also increase the rate of ENOBUFS
at the receiver.

I need to check with other guys to find out if 1000 threads/sec
is indeed unrealistic in our customers' environments. A good
design should allow a mechanism to turn off the penalty due to
a feature that is not common to everybody. I do not understand
your objection.

Regards,
 - jay

>2. A new patch that modifies the locking used within taskstats,
>brings down the overhead of the extreme case quite a bit.
>I'll submit the patch along shortly in a separate mail.
>  
>To get back to the effect of exit rate, I modified the fork+exit
>benchmark to vary the rate at which exits happened and
>ran tests on a 4-way 1.4 GHz x86_64 box. The kernel was 2.6.17,
>uses the delay accounting/taskstat patches in 2.6.17-mm1 + the new
>locking patch mentioned in 2. above.
>
>The results show that differential between tgid on and off
>starts becoming significant once the exit rate crosses roughly 1000
>threads/second. Below that exit rate, the difference is negligible.
>Above it, the difference starts climbing rapidly.
>
>So I guess the question is whether this rate of exit is representative
>enough of real life to warrant making any more changes to the existing
>patchset, beyond the locking changes in 2. above.
>
>>From my limited experience, I think this is too high an exit rate
>to be worrying about overhead.
>
>
>        %ovhd of tgid on over off
>        (higher is worse)
>
>Exit     User     Sys     Elapsed
>Rate     Time     Time    Time
>
>2283      25.76  649.41   -0.14
>1193     -10.53   88.81   -0.12
>963      -11.90    3.28   -0.10
>806       -8.54   -0.84    0.16
>694       -4.41    2.38    0.03
>
>Exit Rate: units are threads exiting per second.
>Calculated by (#threads_forked+exited)/(elapsed_time)/2
>Since app pretty much does only thread create and exit for 10000
>threads (1000 threads, 10 iterations), this is a good measure
>for exit rate.
>
>%diff in user, sys, elapsed times calculated using
>(tgid_on - tgid_off)/tgid_off * 100
>where tgid_on/off times are reported by /usr/bin/time as before.
>
>Each data point for tgid_on and tgid_off was an average
>of 10 runs of the fork+exit benchmark.
>The rate of exits was controlled by delaying the individual
>threads through a usleep before being allowed to exit.
>
>Machine was 4-way 1.6GHz x86_64 Opteron.
>
>"exit_recv -w", the user program consuming the stats, was running
>on the side, reading the stats but not writing to a file or
>printing to screen.
>  


^ permalink raw reply	[flat|nested] 134+ messages in thread

* Re: [Patch][RFC]  Disabling per-tgid stats on task exit in taskstats
  2006-06-23 18:19                   ` Jay Lan
@ 2006-06-23 18:53                     ` Shailabh Nagar
  2006-06-23 20:00                       ` Jay Lan
  0 siblings, 1 reply; 134+ messages in thread
From: Shailabh Nagar @ 2006-06-23 18:53 UTC (permalink / raw)
  To: Jay Lan; +Cc: Andrew Morton, balbir, csturtiv, linux-kernel

Jay Lan wrote:
> Shailabh Nagar wrote:
> 
>>Hi Andrew,
>>
>>Two developments on the tgid overhead issue:
>>
>>1. The latest results show that overhead is significant
>>only when the exit rate exceeds roughly 1000 threads/second.
>> 
> 
> 
> I worked with Shailabh this week to run various testing and
> debugging as he requested. I was pulled off to some urgent
> task yesterday and surprising saw this coming this morning...

Sorry...didn't mean to surprise. I sent you the data last night
privately with request for comments.

Your testing and help has been very valuable and helped uncover
two issues: the locking patch (sent separately) and also a
dependency between taskstats and delay accounting (for which another
patch is being sent out shortly).

> Let's slow it down please. My last testing (after your fix in
> #2 below) still showed 109% overhead at system time. 

True, but my point is that the overhead is at an extremely
high exit rate. I think the test in which you saw 109% overhead
ran 5000 iterations of 1000 threads and had an elapsed time of
294 seconds (with tgid turned off) giving an exit rate of roughly
8500 exits/second, right ?

My results confirm the high overhead at these exit rates. In fact,
on the system I used, I see 649% overhead for the 2200 exits/second case
(even higher than yours), but the point is whether that exit rate
is a valid design criterion.

> And, the per-thread group processing also increase the rate of ENOBUFS
> at the receiver.

Could you quantify please ? Also, pls list the exit rate at which
this happens.

> I need to check with other guys to find out if 1000 threads/sec
> indeed unrealistic at our customers' environments. A good
> design should allow a mechanism to turn off the penalty due to
> a feature that is not common to everybody. I do not understand
> your objection.

Only objection is that design shouldn't cater to a case that is
extremely unlikely in practice. In most situations, there is no
or insignificant penalty.

Perhaps others on the list can also chip in whether this kind of exit
rate is realistic in some scenarios and where the performance
penalty matters (i.e. not system shutdown etc.)

Please note that the exits have to be for multithreaded apps, not
single-threaded ones for which tgid sending is already turned off.

Thanks,
Shailabh

> 
> Regards,
>  - jay
> 
> 
>>2. A new patch that modifies the locking used within taskstats,
>>brings down the overhead of the extreme case quite a bit.
>>I'll submit the patch along shortly in a separate mail.
>> 
>>To get back to the effect of exit rate, I modified the fork+exit
>>benchmark to vary the rate at which exits happened and
>>ran tests on a 4-way 1.4 GHz x86_64 box. The kernel was 2.6.17,
>>uses the delay accounting/taskstat patches in 2.6.17-mm1 + the new
>>locking patch mentioned in 2. above.
>>
>>The results show that differential between tgid on and off
>>starts becoming significant once the exit rate crosses roughly 1000
>>threads/second. Below that exit rate, the difference is negligible.
>>Above it, the difference starts climbing rapidly.
>>
>>So I guess the question is whether this rate of exit is representative
>>enough of real life to warrant making any more changes to the existing
>>patchset, beyond the locking changes in 2. above.
>>
>>>From my limited experience, I think this is too high an exit rate
>>to be worrying about overhead.
>>
>>
>>       %ovhd of tgid on over off
>>       (higher is worse)
>>
>>Exit     User     Sys     Elapsed
>>Rate     Time     Time    Time
>>
>>2283      25.76  649.41   -0.14
>>1193     -10.53   88.81   -0.12
>>963      -11.90    3.28   -0.10
>>806       -8.54   -0.84    0.16
>>694       -4.41    2.38    0.03
>>
>>Exit Rate: units are threads exiting per second.
>>Calculated by (#threads_forked+exited)/(elapsed_time)/2
>>Since app pretty much does only thread create and exit for 10000
>>threads (1000 threads, 10 iterations), this is a good measure
>>for exit rate.
>>
>>%diff in user, sys, elapsed times calculated using
>>(tgid_on - tgid_off)/tgid_off * 100
>>where tgid_on/off times are reported by /usr/bin/time as before.
>>
>>Each data point for tgid_on and tgid_off was an average
>>of 10 runs of the fork+exit benchmark.
>>The rate of exits was controlled by delaying the individual
>>threads through a usleep before being allowed to exit.
>>
>>Machine was 4-way 1.6GHz x86_64 Opteron.
>>
>>"exit_recv -w", the user program consuming the stats, was running
>>on the side, reading the stats but not writing to a file or
>>printing to screen.
>> 
> 
> 


^ permalink raw reply	[flat|nested] 134+ messages in thread

* Re: [Patch][RFC]  Disabling per-tgid stats on task exit in taskstats
  2006-06-23 18:53                     ` Shailabh Nagar
@ 2006-06-23 20:00                       ` Jay Lan
  2006-06-23 20:16                         ` Shailabh Nagar
  0 siblings, 1 reply; 134+ messages in thread
From: Jay Lan @ 2006-06-23 20:00 UTC (permalink / raw)
  To: Shailabh Nagar; +Cc: Andrew Morton, balbir, csturtiv, linux-kernel

Shailabh Nagar wrote:
>Jay Lan wrote:
>  
>>Shailabh Nagar wrote:
>>
>>    
>>>Hi Andrew,
>>>
>>>Two developments on the tgid overhead issue:
>>>
>>>1. The latest results show that overhead is significant
>>>only when the exit rate exceeds roughly 1000 threads/second.
>>>
>>>      
>>I worked with Shailabh this week to run various testing and
>>debugging as he requested. I was pulled off to some urgent
>>task yesterday and surprising saw this coming this morning...
>>    
>
>Sorry...didn't mean to surprise. I sent you the data last night
>privately with request for comments.
>  

Yeah, i saw it, but did not have time to respond before your posting.

>Your testing and help has been very valuable and helped uncover
>two issues: the locking patch (sent separately) and also a
>dependency between taskstats and delay accounting (for which another
>patch is being sent out shortly).
>
>  
>>Let's slow it down please. My last testing (after your fix in
>>#2 below) still showed 109% overhead at system time. 
>>    
>
>True, but my point is that the overhead is at an extremely
>high exit rate. I think the test in which you saw 109% overhead
>ran 5000 iterations of 1000 threads and had an elapsed time of
>294 seconds (with tgid turned off) giving an exit rate of roughly
>8500 exits/second, right ?
>
>My results confirm the high overhead at these exit rates. In fact,
>on the system I used, I see the 649% overhead for the 2200 exits/second case
>even higher than yours) but the point is whether that exit rate
>is a valid design criteria.
>  

Agreed. That is indeed the deciding factor. The exit rate in the labs
does not help answer this question. I need input from the field.

>  
>>And, the per-thread group processing also increase the rate of ENOBUFS
>>at the receiver.
>>    
>
>Could you quantify please ? Also, pls list the exit rate at which
>this happens.
>  

I have neither posted nor quantified it, because I must bring down the error
count, or we (CSA) will have to explore a different approach. So any comparison
of these numbers at this point does not really help. Again, if the exit rate
is unrealistic, then I need to run a different set of tests. What
sleep_factor did you use? Are those printf() calls in your new test program
essential?

>  
>>I need to check with other guys to find out if 1000 threads/sec
>>indeed unrealistic at our customers' environments. A good
>>design should allow a mechanism to turn off the penalty due to
>>a feature that is not common to everybody. I do not understand
>>your objection.
>>    
>
>Only objection is that design shouldn't cater to a case that is
>extremely unlikely in practice. In most situations, there is no
>or insignificant penalty.
>  

If this type of exit rate can happen even once a day, the surge may cause
loss of accounting data of other processes. Again, i do not have data
to say either way yet. But i would rather spend time on working on
the ENOBUFS error than running all different tests to argue on the
per-TG switch.

Regards,
 - jay

>Perhaps others on the list can also chip in whether this kind of exit
>rate is realistic in some scenarios and where the peformance
>penalty matters (i.e. not system shutdown etc.)
>
>Please note that the exits have to be for multithreaded apps, not
>single-threaded ones for which tgid sending is already turned off.
>
>Thanks,
>Shailabh
>
>  
>>Regards,
>> - jay
>>
>>
>>    
>>>2. A new patch that modifies the locking used within taskstats,
>>>brings down the overhead of the extreme case quite a bit.
>>>I'll submit the patch along shortly in a separate mail.
>>>
>>>To get back to the effect of exit rate, I modified the fork+exit
>>>benchmark to vary the rate at which exits happened and
>>>ran tests on a 4-way 1.4 GHz x86_64 box. The kernel was 2.6.17,
>>>uses the delay accounting/taskstat patches in 2.6.17-mm1 + the new
>>>locking patch mentioned in 2. above.
>>>
>>>The results show that differential between tgid on and off
>>>starts becoming significant once the exit rate crosses roughly 1000
>>>threads/second. Below that exit rate, the difference is negligible.
>>>Above it, the difference starts climbing rapidly.
>>>
>>>So I guess the question is whether this rate of exit is representative
>>>enough of real life to warrant making any more changes to the existing
>>>patchset, beyond the locking changes in 2. above.
>>>
>>>>From my limited experience, I think this is too high an exit rate
>>>to be worrying about overhead.
>>>
>>>
>>>      %ovhd of tgid on over off
>>>      (higher is worse)
>>>
>>>Exit     User     Sys     Elapsed
>>>Rate     Time     Time    Time
>>>
>>>2283      25.76  649.41   -0.14
>>>1193     -10.53   88.81   -0.12
>>>963      -11.90    3.28   -0.10
>>>806       -8.54   -0.84    0.16
>>>694       -4.41    2.38    0.03
>>>
>>>Exit Rate: units are threads exiting per second.
>>>Calculated by (#threads_forked+exited)/(elapsed_time)/2
>>>Since app pretty much does only thread create and exit for 10000
>>>threads (1000 threads, 10 iterations), this is a good measure
>>>for exit rate.
>>>
>>>%diff in user, sys, elapsed times calculated using
>>>(tgid_on - tgid_off)/tgid_off * 100
>>>where tgid_on/off times are reported by /usr/bin/time as before.
>>>
>>>Each data point for tgid_on and tgid_off was an average
>>>of 10 runs of the fork+exit benchmark.
>>>The rate of exits was controlled by delaying the individual
>>>threads through a usleep before being allowed to exit.
>>>
>>>Machine was 4-way 1.6GHz x86_64 Opteron.
>>>
>>>"exit_recv -w", the user program consuming the stats, was running
>>>on the side, reading the stats but not writing to a file or
>>>printing to screen.
>>>
>>>      
>>    
>
>  


^ permalink raw reply	[flat|nested] 134+ messages in thread

* Re: [Patch][RFC]  Disabling per-tgid stats on task exit in taskstats
  2006-06-23 20:00                       ` Jay Lan
@ 2006-06-23 20:16                         ` Shailabh Nagar
  2006-06-23 20:36                           ` Jay Lan
  0 siblings, 1 reply; 134+ messages in thread
From: Shailabh Nagar @ 2006-06-23 20:16 UTC (permalink / raw)
  To: Jay Lan; +Cc: Andrew Morton, balbir, csturtiv, linux-kernel

Jay Lan wrote:

>>
>>My results confirm the high overhead at these exit rates. In fact,
>>on the system I used, I see the 649% overhead for the 2200 exits/second case
>>even higher than yours) but the point is whether that exit rate
>>is a valid design criteria.
>> 
> 
> 
> Agreed. The indeed the deciding factor. The exit rate in the labs
> does not help answer this question. I need input from our fields.
> 

FWIW, I just spoke to some of the IBM folks working on Websphere (the
J2EE platform) and they've said that the exit rate is quite low since a thread pool
is used to reuse threads rather than have them exit. Also, I'm waiting to
hear from our db2 folks, though I suspect it's the same story there.

>>
>>>And, the per-thread group processing also increase the rate of ENOBUFS
>>>at the receiver.
>>>   
>>
>>Could you quantify please ? Also, pls list the exit rate at which
>>this happens.
>> 
> 
> 
> I have not posted it nor quantify it because i must bring down the errors
> count, or we (CSA) have to explore a different way. So any comparison
> on these number at this point does not really help. Again, if the exit rate
> is unrealistic, then i need to run a different set of testings. 


> What
> sleep_factor did you use? 

Each thread executed the following code:

/*
 * Thread body: sleep for a time scaled by (n - index), then exit.
 * arg carries the thread's integer index packed into the void * argument.
 */
void *slow_exit(void *arg)
{
        /* Go through intptr_t: casting a pointer straight to int truncates
         * on 64-bit targets. */
        int i = (int)(intptr_t) arg;
        usleep((n-i)*200);
        /* A void * function must return a value; nothing is reported back. */
        return NULL;
}

and I varied the multiplier passed to usleep() (200 above) between
700 (resulting in an exit rate of 694 in my data)
and 100 (resulting in the 2283 exit rate).


> Are those printf() in your new test program
> essential?

No. I dropped them.
The test program used is appended below. There were no
printfs on the non-failure paths.


> 
> If this type of exit rate can happen even once a day, the surge may cause
> loss of accounting data of other processes. Again, i do not have data
> to say either way yet. But i would rather spend time on working on
> the ENOBUFS error than running all different tests to argue on the
> per-TG switch.
> 

I suppose the ENOBUFS case has to be handled at userspace anyway
since it can potentially happen for high thread exit rate cases even if
only pid data is sent.

> Regards,
>  - jay
> 
> 
>


#include <errno.h>
#include <limits.h>
#include <pthread.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/types.h>
#include <unistd.h>

/* Number of threads created per iteration. main() sets it (5 by default,
 * or argv[1]); slow_exit() reads it to scale each thread's delay. */
int n;
/* Not referenced anywhere else in the program as posted. */
int barrier=1;


/*
 * Thread entry point: delay this thread's exit, then terminate.
 *
 * arg carries the thread's creation index, an integer packed into the
 * void * start-routine argument.  The thread sleeps for
 * (n - index) * 600 microseconds, so lower-indexed threads sleep longer.
 *
 * Returns NULL; no result is reported to the joiner.
 */
void *slow_exit(void *arg)
{
    /* Recover the index through intptr_t: a direct (int) cast of a
     * pointer truncates on 64-bit targets and draws a diagnostic. */
    long i = (long)(intptr_t)arg;

    usleep((n-i)*600);

    /* The original fell off the end of a non-void function. */
    return NULL;
}

/*
 * Parse a decimal count from a command-line argument.
 *
 * min must be >= 0 so that -1 is an unambiguous error value.
 * Returns the parsed value, or -1 if s is not a whole number in
 * [min, INT_MAX].
 */
static int parse_count(const char *s, int min)
{
    char *end;
    long v;

    errno = 0;
    v = strtol(s, &end, 10);
    if (end == s || *end != '\0' || errno == ERANGE || v < min || v > INT_MAX)
        return -1;
    return (int)v;
}

/*
 * Thread create/exit benchmark driver.
 *
 * Usage: prog [nthreads [iterations]]
 *   nthreads   threads created and joined per iteration (default 5, >= 1)
 *   iterations number of create/join rounds (default 10, >= 0)
 *
 * Each round starts nthreads threads running slow_exit() and then joins
 * them all.  Returns 0 on success and EXIT_FAILURE on bad arguments,
 * allocation failure, or a pthread error.
 */
int main(int argc, char *argv[])
{
    int i, rc, rep;
    int status = EXIT_FAILURE;
    pthread_t *ppthread;

    n = 5;
    if (argc > 1)
        n = parse_count(argv[1], 1);

    rep = 10;
    if (argc > 2)
        rep = parse_count(argv[2], 0);

    /* atoi() accepted garbage and negative values; a negative n reached
     * malloc() and a negative rep made the round loop effectively endless. */
    if (n < 0 || rep < 0) {
        fprintf(stderr, "Usage: %s [nthreads >= 1] [iterations >= 0]\n",
                argv[0]);
        return EXIT_FAILURE;
    }

    ppthread = calloc((size_t)n, sizeof *ppthread);
    if (ppthread == NULL) {
        fprintf(stderr, "Memory allocation failure\n");
        return EXIT_FAILURE;
    }

    for (; rep > 0; rep--) {
        for (i = 0; i < n; i++) {
            /* Pass the index by value inside the pointer; intptr_t keeps
             * the int -> pointer conversion well defined. */
            rc = pthread_create(&ppthread[i], NULL,
                        slow_exit, (void *)(intptr_t)i);
            if (rc) {
                fprintf(stderr, "Error creating thread %d %d\n", i, rc);
                goto out;
            }
        }
        for (i = 0; i < n; i++) {
            rc = pthread_join(ppthread[i], NULL);
            if (rc) {
                fprintf(stderr, "Error joining thread %d\n", i);
                goto out;
            }
        }
    }
    status = 0;

out:
    /* Returning from main() ends the process, including any threads
     * still running after a failure. */
    free(ppthread);
    return status;
}


^ permalink raw reply	[flat|nested] 134+ messages in thread

* Re: [Patch][RFC]  Disabling per-tgid stats on task exit in taskstats
  2006-06-23 20:16                         ` Shailabh Nagar
@ 2006-06-23 20:36                           ` Jay Lan
  0 siblings, 0 replies; 134+ messages in thread
From: Jay Lan @ 2006-06-23 20:36 UTC (permalink / raw)
  To: Shailabh Nagar; +Cc: Andrew Morton, balbir, csturtiv, linux-kernel

Shailabh Nagar wrote:
>Jay Lan wrote:
>
>  
>>>My results confirm the high overhead at these exit rates. In fact,
>>>on the system I used, I see the 649% overhead for the 2200 exits/second case
>>>even higher than yours) but the point is whether that exit rate
>>>is a valid design criteria.
>>>
>>>      
>>Agreed. The indeed the deciding factor. The exit rate in the labs
>>does not help answer this question. I need input from our fields.
>>
>>    
>
>FWIW, I just spoke to some of the IBM folks working on Websphere (the
>J2EE platform) and they've said that the exit rate is quite low since a thread pool
>is used to reuse threads rather than have them exit. Also, I'm waiting to
>hear from our db2 folks though I suspect its the same story there.
>  

Pardon me on my lack of knowledge on the 'thread pool' tool.
Is that a GPL tool? Sounds like a great tool.

>  
>>>>And, the per-thread group processing also increase the rate of ENOBUFS
>>>>at the receiver.
>>>>  
>>>>        
>>>Could you quantify please ? Also, pls list the exit rate at which
>>>this happens.
>>>
>>>      
>>I have not posted it nor quantify it because i must bring down the errors
>>count, or we (CSA) have to explore a different way. So any comparison
>>on these number at this point does not really help. Again, if the exit rate
>>is unrealistic, then i need to run a different set of testings. 
>>    
>
>
>  
>>What
>>sleep_factor did you use? 
>>    
>
>Each thread executed the following code:
>
>void *slow_exit(void *arg)
>{
>        int i = (int) arg;
>        usleep((n-i)*200);
>}
>
>and I varied the number within between
>700 (resulting in exit rate of 694 in my data)
>and 100 (resulting in the 2283 exit rate)
>
>
>  
>>Are those printf() in your new test program
>>essential?
>>    
>
>No. I dropped them.
>The test program used is appended below. There were no
>printfs on the non-failure paths.
>
>
>  
>>If this type of exit rate can happen even once a day, the surge may cause
>>loss of accounting data of other processes. Again, i do not have data
>>to say either way yet. But i would rather spend time on working on
>>the ENOBUFS error than running all different tests to argue on the
>>per-TG switch.
>>
>>    
>
>I suppose the ENOBUFS case has to be handled at userspace anyway
>since it can potentially happen for high thread exit rate cases even if
>only pid data is sent.
>  

I will rerun my tests trying to bring exit rate down around 800
and see what happens.

Thanks!
 - jay

>  
>>Regards,
>> - jay
>>
>>
>>
>>    
>
>
>#include <stdio.h>
>#include <stdlib.h>
>#include <sys/types.h>
>#include <unistd.h>
>#include <pthread.h>
>
>int n;
>int barrier=1;
>
>
>void *slow_exit(void *arg)
>{
>    long i = (int) arg;
>    usleep((n-i)*600);
>}
>
>int main(int argc, char *argv[])
>{
>    int i,rc, rep;
>    pthread_t *ppthread;
>
>    n = 5 ;
>    if (argc > 1)
>        n = atoi(argv[1]);
>
>    rep = 10;
>    if (argc > 2)
>        rep = atoi(argv[2]);
>
>    ppthread = malloc(n * sizeof(pthread_t));
>    if (ppthread == NULL) {
>        printf("Memory allocation failure\n");
>        exit(-1);
>    }
>
>    while (rep) {
>        for (i=0; i<n; i++) {
>            rc = pthread_create(&ppthread[i], NULL,
>                        slow_exit, (void *)i);
>            if (rc) {
>                printf("Error creating thread %d %d\n", i, rc);
>                exit(-1);
>            }
>        }
>        for (i=0; i<n; i++) {
>            rc = pthread_join(ppthread[i], NULL);
>            if (rc) {
>                printf("Error joining thread %d\n", i);
>                exit(-1);
>            }
>        }
>        rep--;
>    }
>}
>
>  


^ permalink raw reply	[flat|nested] 134+ messages in thread

* Re: [Patch][RFC]  Disabling per-tgid stats on task exit in taskstats
  2006-06-23 17:14                 ` Shailabh Nagar
  2006-06-23 18:19                   ` Jay Lan
@ 2006-06-23 21:19                   ` Andrew Morton
  2006-06-23 22:07                     ` Jay Lan
  2006-06-24  3:08                     ` Shailabh Nagar
  1 sibling, 2 replies; 134+ messages in thread
From: Andrew Morton @ 2006-06-23 21:19 UTC (permalink / raw)
  To: Shailabh Nagar; +Cc: jlan, balbir, csturtiv, linux-kernel

On Fri, 23 Jun 2006 13:14:41 -0400
Shailabh Nagar <nagar@watson.ibm.com> wrote:

> The results show that differential between tgid on and off
> starts becoming significant once the exit rate crosses roughly 1000
> threads/second. Below that exit rate, the difference is negligible.
> Above it, the difference starts climbing rapidly.
> 
> So I guess the question is whether this rate of exit is representative
> enough of real life to warrant making any more changes to the existing
> patchset, beyond the locking changes in 2. above.
> 
> >From my limited experience, I think this is too high an exit rate
> to be worrying about overhead.
> 

1000/sec isn't terribly high.  CGI servers, shell scripts.

And kernel development ;) A `pushpatch 1500' here does 992 fork/exec/exit
per second.

>         %ovhd of tgid on over off
>         (higher is worse)
> 
> Exit     User     Sys     Elapsed
> Rate     Time     Time    Time
> 
> 2283      25.76  649.41   -0.14
> 1193     -10.53   88.81   -0.12
> 963      -11.90    3.28   -0.10
> 806       -8.54   -0.84    0.16
> 694       -4.41    2.38    0.03

Oh wow.  Something's gone quadratic there.

^ permalink raw reply	[flat|nested] 134+ messages in thread

* Re: [Patch][RFC]  Disabling per-tgid stats on task exit in taskstats
  2006-06-23 21:19                   ` Andrew Morton
@ 2006-06-23 22:07                     ` Jay Lan
  2006-06-23 23:47                       ` Andrew Morton
  2006-06-24  3:08                     ` Shailabh Nagar
  1 sibling, 1 reply; 134+ messages in thread
From: Jay Lan @ 2006-06-23 22:07 UTC (permalink / raw)
  To: Andrew Morton; +Cc: Shailabh Nagar, balbir, csturtiv, linux-kernel

Andrew Morton wrote:
>On Fri, 23 Jun 2006 13:14:41 -0400
>Shailabh Nagar <nagar@watson.ibm.com> wrote:
>
>  
>>The results show that differential between tgid on and off
>>starts becoming significant once the exit rate crosses roughly 1000
>>threads/second. Below that exit rate, the difference is negligible.
>>Above it, the difference starts climbing rapidly.
>>
>>So I guess the question is whether this rate of exit is representative
>>enough of real life to warrant making any more changes to the existing
>>patchset, beyond the locking changes in 2. above.
>>
>>>From my limited experience, I think this is too high an exit rate
>>to be worrying about overhead.
>>
>>    
>
>1000/sec isn't terribly high.  CGI servers, shell scripts.
>
>And kernel development ;) A `pushpatch 1500' here does 992 fork/exec/exit
>per second.
>
>  
>>        %ovhd of tgid on over off
>>        (higher is worse)
>>
>>Exit     User     Sys     Elapsed
>>Rate     Time     Time    Time
>>
>>2283      25.76  649.41   -0.14
>>1193     -10.53   88.81   -0.12
>>963      -11.90    3.28   -0.10
>>806       -8.54   -0.84    0.16
>>694       -4.41    2.38    0.03
>>    
>
>Oh wow.  Something's gone quadratic there.
>  
It was due to a loop in fill_tgid() when per-TG stats
data are assembled for netlink:
        do {
                rc = delayacct_add_tsk(stats, tsk);
                if (rc)
                        break;

        } while_each_thread(first, tsk);

and it is executed inside a lock.
Fortunately single threaded appls do not hit this code.

- jay


^ permalink raw reply	[flat|nested] 134+ messages in thread

* Re: [Patch][RFC]  Disabling per-tgid stats on task exit in taskstats
  2006-06-23 22:07                     ` Jay Lan
@ 2006-06-23 23:47                       ` Andrew Morton
  2006-06-24  2:59                         ` Shailabh Nagar
  0 siblings, 1 reply; 134+ messages in thread
From: Andrew Morton @ 2006-06-23 23:47 UTC (permalink / raw)
  To: Jay Lan; +Cc: nagar, balbir, csturtiv, linux-kernel

On Fri, 23 Jun 2006 15:07:28 -0700
Jay Lan <jlan@engr.sgi.com> wrote:

> >>        %ovhd of tgid on over off
> >>        (higher is worse)
> >>
> >>Exit     User     Sys     Elapsed
> >>Rate     Time     Time    Time
> >>
> >>2283      25.76  649.41   -0.14
> >>1193     -10.53   88.81   -0.12
> >>963      -11.90    3.28   -0.10
> >>806       -8.54   -0.84    0.16
> >>694       -4.41    2.38    0.03
> >>    
> >
> >Oh wow.  Something's gone quadratic there.
> >  
> It was due to a loop in fill_tgid() when per-TG stats
> data are assembled for netlink:
>         do {
>                 rc = delayacct_add_tsk(stats, tsk);
>                 if (rc)
>                         break;
> 
>         } while_each_thread(first, tsk);
> 
> and it is executed inside a lock.
> Fortunately single threaded appls do not hit this code.

Am I reading this right?  We do that loop when each thread within the
thread group exits?  How come?

Is there some better lock we can use in there?  It only has to be
threadgroup-wide rather than kernel-wide.

^ permalink raw reply	[flat|nested] 134+ messages in thread

* Re: [Patch][RFC]  Disabling per-tgid stats on task exit in taskstats
  2006-06-23 23:47                       ` Andrew Morton
@ 2006-06-24  2:59                         ` Shailabh Nagar
  2006-06-24  4:39                           ` Andrew Morton
  0 siblings, 1 reply; 134+ messages in thread
From: Shailabh Nagar @ 2006-06-24  2:59 UTC (permalink / raw)
  To: Andrew Morton; +Cc: Jay Lan, balbir, csturtiv, linux-kernel

Andrew Morton wrote:

>On Fri, 23 Jun 2006 15:07:28 -0700
>Jay Lan <jlan@engr.sgi.com> wrote:
>
>  
>
>>>>       %ovhd of tgid on over off
>>>>       (higher is worse)
>>>>
>>>>Exit     User     Sys     Elapsed
>>>>Rate     Time     Time    Time
>>>>
>>>>2283      25.76  649.41   -0.14
>>>>1193     -10.53   88.81   -0.12
>>>>963      -11.90    3.28   -0.10
>>>>806       -8.54   -0.84    0.16
>>>>694       -4.41    2.38    0.03
>>>>   
>>>>        
>>>>
>>>Oh wow.  Something's gone quadratic there.
>>> 
>>>      
>>>
>>It was due to a loop in fill_tgid() when per-TG stats
>>data are assembled for netlink:
>>        do {
>>                rc = delayacct_add_tsk(stats, tsk);
>>                if (rc)
>>                        break;
>>
>>        } while_each_thread(first, tsk);
>>
>>and it is executed inside a lock.
>>Fortunately single threaded appls do not hit this code.
>>    
>>
>
>Am I reading this right?  We do that loop when each thread within the
>thread group exits?
>
Yes.

> How come?
>  
>
To get the sum of all per-tid data for threads that are currently alive.
This is returned to userspace with each thread exit.

>Is there some better lock we can use in there?  It only has to be
>threadgroup-wide rather than kernel-wide.
>  
>
The lock we're holding is the tasklist_lock. To go through all the
threads of a thread group,
that's the only lock that can protect the integrity of while_each_thread afaics.

--Shailabh



^ permalink raw reply	[flat|nested] 134+ messages in thread

* Re: [Patch][RFC]  Disabling per-tgid stats on task exit in taskstats
  2006-06-23 21:19                   ` Andrew Morton
  2006-06-23 22:07                     ` Jay Lan
@ 2006-06-24  3:08                     ` Shailabh Nagar
  1 sibling, 0 replies; 134+ messages in thread
From: Shailabh Nagar @ 2006-06-24  3:08 UTC (permalink / raw)
  To: Andrew Morton; +Cc: jlan, balbir, csturtiv, linux-kernel

Andrew Morton wrote:

>On Fri, 23 Jun 2006 13:14:41 -0400
>Shailabh Nagar <nagar@watson.ibm.com> wrote:
>
>  
>
>>The results show that differential between tgid on and off
>>starts becoming significant once the exit rate crosses roughly 1000
>>threads/second. Below that exit rate, the difference is negligible.
>>Above it, the difference starts climbing rapidly.
>>
>>So I guess the question is whether this rate of exit is representative
>>enough of real life to warrant making any more changes to the existing
>>patchset, beyond the locking changes in 2. above.
>>
>>>From my limited experience, I think this is too high an exit rate
>>to be worrying about overhead.
>>
>>    
>>
>
>1000/sec isn't terribly high.  CGI servers, shell scripts.
>
>And kernel development ;) A `pushpatch 1500' here does 992 fork/exec/exit
>per second.
>  
>
Don't all of these create new tasks, not threads ?

Single-threaded tasks fork/exec/exit'ing is not a problem since per-tgid 
data is not sent in that
case (since it will be identical to the per-tid data).

>>        %ovhd of tgid on over off
>>        (higher is worse)
>>
>>Exit     User     Sys     Elapsed
>>Rate     Time     Time    Time
>>
>>2283      25.76  649.41   -0.14
>>1193     -10.53   88.81   -0.12
>>963      -11.90    3.28   -0.10
>>806       -8.54   -0.84    0.16
>>694       -4.41    2.38    0.03
>>    
>>
>
>Oh wow.  Something's gone quadratic there.
>  
>


^ permalink raw reply	[flat|nested] 134+ messages in thread

* Re: [Patch][RFC]  Disabling per-tgid stats on task exit in taskstats
  2006-06-24  2:59                         ` Shailabh Nagar
@ 2006-06-24  4:39                           ` Andrew Morton
  2006-06-24  5:59                             ` Shailabh Nagar
  0 siblings, 1 reply; 134+ messages in thread
From: Andrew Morton @ 2006-06-24  4:39 UTC (permalink / raw)
  To: Shailabh Nagar; +Cc: jlan, balbir, csturtiv, linux-kernel

On Fri, 23 Jun 2006 22:59:04 -0400
Shailabh Nagar <nagar@watson.ibm.com> wrote:

> >>It was due to a loop in fill_tgid() when per-TG stats
> >>data are assembled for netlink:
> >>        do {
> >>                rc = delayacct_add_tsk(stats, tsk);
> >>                if (rc)
> >>                        break;
> >>
> >>        } while_each_thread(first, tsk);
> >>
> >>and it is executed inside a lock.
> >>Fortunately single threaded appls do not hit this code.
> >>    
> >>
> >
> >Am I reading this right?  We do that loop when each thread within the
> >thread group exits?
> >
> Yes.
> 
> > How come?
> >  
> >
> To get the sum of all per-tid data for threads that are currently alive.
> This is returned to userspace with each thread exit.

I realise that.  How about we stop doing it?

When a thread exits it only makes sense to send up the stats for that
thread.  Why does the kernel assume that userspace is also interested in
the accumulated stats of its siblings?  And if userspace _is_ interested in
that info, it's still present in-kernel and can be queried for.

> >Is there some better lock we can use in there?  It only has to be
> >threadgroup-wide rather than kernel-wide.
> >  
> >
> The lock we're holding is the tasklist_lock. To go through all the 
> threads of a thread group
> thats the only lock that can protect integrity of while_each_thread afaics.

At present, yes.  That's presumably not impossible to fix.

^ permalink raw reply	[flat|nested] 134+ messages in thread

* Re: [Patch][RFC]  Disabling per-tgid stats on task exit in taskstats
  2006-06-24  4:39                           ` Andrew Morton
@ 2006-06-24  5:59                             ` Shailabh Nagar
  2006-06-26 17:33                               ` Jay Lan
  0 siblings, 1 reply; 134+ messages in thread
From: Shailabh Nagar @ 2006-06-24  5:59 UTC (permalink / raw)
  To: Andrew Morton; +Cc: jlan, balbir, csturtiv, linux-kernel

Andrew Morton wrote:

>On Fri, 23 Jun 2006 22:59:04 -0400
>Shailabh Nagar <nagar@watson.ibm.com> wrote:
>
>  
>
>>>>It was due to a loop in fill_tgid() when per-TG stats
>>>>data are assembled for netlink:
>>>>       do {
>>>>               rc = delayacct_add_tsk(stats, tsk);
>>>>               if (rc)
>>>>                       break;
>>>>
>>>>       } while_each_thread(first, tsk);
>>>>
>>>>and it is executed inside a lock.
>>>>Fortunately single threaded appls do not hit this code.
>>>>   
>>>>
>>>>        
>>>>
>>>Am I reading this right?  We do that loop when each thread within the
>>>thread group exits?
>>>
>>>      
>>>
>>Yes.
>>
>>    
>>
>>>How come?
>>> 
>>>
>>>      
>>>
>>To get the sum of all per-tid data for threads that are currently alive.
>>This is returned to userspace with each thread exit.
>>    
>>
>
>I realise that.  How about we stop doing it?
>
>When a thread exits it only makes sense to send up the stats for that
>thread.  
>

>Why does the kernel assume that userspace is also interested in
>the accumulated stats of its siblings?  And if userspace _is_ interested in
>that info, it's still present in-kernel and can be queried for.
>  
>
The reason for sending out the sum of the siblings' stats was as follows:
I didn't maintain a per-tgid data structure in-kernel where the exiting
threads' taskstats could be accumulated, erroneously thinking that this
would require such a structure to be *additionally* updated each time a
statistic was being collected, and that would be way too much overhead.
Also to save on space.
Thus if userspace wants to get the per-tgid stats for the thread group
when the last thread exits, then it cannot
do so by querying since such a query only returns the sum of currently
live threads (data from exited threads is lost).

So, the current design chooses to return the sum of all siblings + self 
when each thread exits. Using this userspace
can maintain the per-tgid data for all currently living threads of the 
group + previously exited threads.

But as pointed out in an earlier mail, it looks like this is an
unnecessarily elaborate way of trying to avoid maintaining
a separate per-tgid data structure in the kernel (in addition to the
per-tid ones we already have).

What can be done is to create a taskstats structure for a thread group
the moment the *second* thread gets created.
Then each exiting thread can accumulate its stats to this struct. If
userspace queries for per-tgid data, the sum of all
live threads + value in this struct can be returned. And when the last
thread of the thread group exits, the struct's
value can be output.

While this will mean an extra taskstats structure hanging around for the 
lifetime of a multithreaded app (not single threaded
ones), it should cut down on the overhead of running through all threads 
that we see in the current design.
More importantly, it will reduce the frequency of per-tgid data send to 
once for each thread group exit  instead of once
per thread exit.

Will that work for everyone ?

>>>Is there some better lock we can use in there?  It only has to be
>>>threadgroup-wide rather than kernel-wide.
>>> 
>>>
>>>      
>>>
>>The lock we're holding is the tasklist_lock. To go through all the 
>>threads of a thread group
>>thats the only lock that can protect integrity of while_each_thread afaics.
>>    
>>
>
>At present, yes.  That's persumably not impossible to fix.
>  
>
In the above design, if a userspace query for per-tgid data arrives,
then I'll still need to run through
all the threads of a thread group (to return their sum + that of already
exited threads accumulated in the
extra per-tgid taskstats struct).

So that could still benefit from such a thread group specific lock. 
Scope of change is a bit more of course
so will need to take a closer look.

--Shailabh

^ permalink raw reply	[flat|nested] 134+ messages in thread

* Re: [Patch][RFC]  Disabling per-tgid stats on task exit in taskstats
  2006-06-24  5:59                             ` Shailabh Nagar
@ 2006-06-26 17:33                               ` Jay Lan
  2006-06-26 17:52                                 ` Shailabh Nagar
  2006-06-26 17:55                                 ` Andrew Morton
  0 siblings, 2 replies; 134+ messages in thread
From: Jay Lan @ 2006-06-26 17:33 UTC (permalink / raw)
  To: Shailabh Nagar; +Cc: Andrew Morton, jlan, balbir, csturtiv, linux-kernel

Shailabh Nagar wrote:
> Andrew Morton wrote:
> 
>> On Fri, 23 Jun 2006 22:59:04 -0400
>> Shailabh Nagar <nagar@watson.ibm.com> wrote:
>>
>>  
>>
>>>>> It was due to a loop in fill_tgid() when per-TG stats
>>>>> data are assembled for netlink:
>>>>>       do {
>>>>>               rc = delayacct_add_tsk(stats, tsk);
>>>>>               if (rc)
>>>>>                       break;
>>>>>
>>>>>       } while_each_thread(first, tsk);
>>>>>
>>>>> and it is executed inside a lock.
>>>>> Fortunately single threaded appls do not hit this code.
>>>>>  
>>>>>       
>>>>
>>>> Am I reading this right?  We do that loop when each thread within the
>>>> thread group exits?
>>>>
>>>>     
>>>
>>> Yes.
>>>
>>>   
>>>
>>>> How come?
>>>>
>>>>
>>>>     
>>>
>>> To get the sum of all per-tid data for threads that are currently alive.
>>> This is returned to userspace with each thread exit.
>>>   
>>
>>
>> I realise that.  How about we stop doing it?
>>
>> When a thread exits it only makes sense to send up the stats for that
>> thread. 
> 
> 
>> Why does the kernel assume that userspace is also interested in
>> the accumulated stats of its siblings?  And if userspace _is_ 
>> interested in
>> that info, it's still present in-kernel and can be queried for.
>>  
>>
> The reason for sending out sum of siblings's stats was as follows:
> I didn't maintain a per-tgid data structure in-kernel where the exiting 
> threads taskstats could be accumalated
> , erroneously thinking that this would require such a structure to be 
> *additionally* updated each  time a statististic
> was being collected and that would be way too much overhead. Also to 
> save on space. Thus if userspace wants to get the per-tgid stats for the 
> thread group when the last thread exits, then it cannot
> do so by querying since such a query only returns the sum of currently 
> live threads (data from exited threads is lost).
> 
> So, the current design chooses to return the sum of all siblings + self 
> when each thread exits. Using this userspace
> can maintain the per-tgid data for all currently living threads of the 
> group + previously exited threads.
> 
> But as pointed out in an earlier mail, it looks like this is 
> unnecessarily elaborate way of trying to avoid maintaining
> a separate per-tgid data structure in the kernel (in addition to the 
> per-tid ones we already have).
> 
> What can be done is to create a taskstats structure for a thread group 
> the moment the *second* thread gets created.
> Then each exiting thread can accumalate its stats to this struct. If 
> userspace queries for per-tgid data, the sum of all
> live threads + value in this struct can be returned. And when the last 
> thread of the thread group exits, the struct's
> value can be output.
> 
> While this will mean an extra taskstats structure hanging around for the 
> lifetime of a multithreaded app (not single threaded
> ones), it should cut down on the overhead of running through all threads 
> that we see in the current design.
> More importantly, it will reduce the frequency of per-tgid data send to 
> once for each thread group exit  instead of once
> per thread exit.
> 
> Will that work for everyone ?

As long as the per-pid delayacct struct has a pointer to the per-tgid
data struct and does not need to go through the loop on every exit.

> 
>>>> Is there some better lock we can use in there?  It only has to be
>>>> threadgroup-wide rather than kernel-wide.
>>>>
>>>>
>>>>     
>>>
>>> The lock we're holding is the tasklist_lock. To go through all the 
>>> threads of a thread group
>>> thats the only lock that can protect integrity of while_each_thread 
>>> afaics.
>>>   
>>
>>
>> At present, yes.  That's persumably not impossible to fix.
>>  
>>
> In the above design, if a userspace query for per-tgid data arrives, 
> then I'll still need to run through
> all the threads of a thread group (to return their sum + that of already 
> exited threads accumalated in the
> extra per-tgid taskstats struct).

But, this query-reply logic can be separated from that executed at
exit.

Thanks,
  - jay


> 
> So that could still benefit from such a thread group specific lock. 
> Scope of change is a bit more of course
> so will need to take a closer look.
> 
> --Shailabh


^ permalink raw reply	[flat|nested] 134+ messages in thread

* Re: [Patch][RFC]  Disabling per-tgid stats on task exit in taskstats
  2006-06-26 17:33                               ` Jay Lan
@ 2006-06-26 17:52                                 ` Shailabh Nagar
  2006-06-26 17:55                                 ` Andrew Morton
  1 sibling, 0 replies; 134+ messages in thread
From: Shailabh Nagar @ 2006-06-26 17:52 UTC (permalink / raw)
  To: Jay Lan; +Cc: Andrew Morton, jlan, balbir, csturtiv, linux-kernel

Jay Lan wrote:

> Shailabh Nagar wrote:
>
>> Andrew Morton wrote:
>>
>>> On Fri, 23 Jun 2006 22:59:04 -0400
>>> Shailabh Nagar <nagar@watson.ibm.com> wrote:
>>>
>>>  
>>>
>>>>>> It was due to a loop in fill_tgid() when per-TG stats
>>>>>> data are assembled for netlink:
>>>>>>       do {
>>>>>>               rc = delayacct_add_tsk(stats, tsk);
>>>>>>               if (rc)
>>>>>>                       break;
>>>>>>
>>>>>>       } while_each_thread(first, tsk);
>>>>>>
>>>>>> and it is executed inside a lock.
>>>>>> Fortunately single threaded appls do not hit this code.
>>>>>>  
>>>>>>       
>>>>>
>>>>>
>>>>> Am I reading this right?  We do that loop when each thread within the
>>>>> thread group exits?
>>>>>
>>>>>     
>>>>
>>>>
>>>> Yes.
>>>>
>>>>  
>>>>
>>>>> How come?
>>>>>
>>>>>
>>>>>     
>>>>
>>>>
>>>> To get the sum of all per-tid data for threads that are currently 
>>>> alive.
>>>> This is returned to userspace with each thread exit.
>>>>   
>>>
>>>
>>>
>>> I realise that.  How about we stop doing it?
>>>
>>> When a thread exits it only makes sense to send up the stats for that
>>> thread. 
>>
>>
>>
>>> Why does the kernel assume that userspace is also interested in
>>> the accumulated stats of its siblings?  And if userspace _is_ 
>>> interested in
>>> that info, it's still present in-kernel and can be queried for.
>>>  
>>>
>> The reason for sending out sum of siblings's stats was as follows:
>> I didn't maintain a per-tgid data structure in-kernel where the 
>> exiting threads taskstats could be accumalated
>> , erroneously thinking that this would require such a structure to be 
>> *additionally* updated each  time a statististic
>> was being collected and that would be way too much overhead. Also to 
>> save on space. Thus if userspace wants to get the per-tgid stats for 
>> the thread group when the last thread exits, then it cannot
>> do so by querying since such a query only returns the sum of 
>> currently live threads (data from exited threads is lost).
>>
>> So, the current design chooses to return the sum of all siblings + 
>> self when each thread exits. Using this userspace
>> can maintain the per-tgid data for all currently living threads of 
>> the group + previously exited threads.
>>
>> But as pointed out in an earlier mail, it looks like this is 
>> unnecessarily elaborate way of trying to avoid maintaining
>> a separate per-tgid data structure in the kernel (in addition to the 
>> per-tid ones we already have).
>>
>> What can be done is to create a taskstats structure for a thread 
>> group the moment the *second* thread gets created.
>> Then each exiting thread can accumalate its stats to this struct. If 
>> userspace queries for per-tgid data, the sum of all
>> live threads + value in this struct can be returned. And when the 
>> last thread of the thread group exits, the struct's
>> value can be output.
>>
>> While this will mean an extra taskstats structure hanging around for 
>> the lifetime of a multithreaded app (not single threaded
>> ones), it should cut down on the overhead of running through all 
>> threads that we see in the current design.
>> More importantly, it will reduce the frequency of per-tgid data send 
>> to once for each thread group exit  instead of once
>> per thread exit.
>>
>> Will that work for everyone ?
>
>
> As long as the per-pid delayacct struct has a pointer to the per-tgid
> data struct 

It doesn't need to....the per-tgid thing is allocated inside tsk->signal.
Let me send the patch out and we can discuss the design/implementation.

> and deoes not need to go through the loop on every exit.

Yes. That's not needed anymore.

>
>>
>>>>> Is there some better lock we can use in there?  It only has to be
>>>>> threadgroup-wide rather than kernel-wide.
>>>>>
>>>>>
>>>>>     
>>>>
>>>>
>>>> The lock we're holding is the tasklist_lock. To go through all the 
>>>> threads of a thread group
>>>> thats the only lock that can protect integrity of while_each_thread 
>>>> afaics.
>>>>   
>>>
>>>
>>>
>>> At present, yes.  That's persumably not impossible to fix.
>>>  
>>>
>> In the above design, if a userspace query for per-tgid data arrives, 
>> then I'll still need to run through
>> all the threads of a thread group (to return their sum + that of 
>> already exited threads accumalated in the
>> extra per-tgid taskstats struct).
>
>
> But, this query-reply logic can be separated from that executed at
> exit.

Yes, it already is.

Thanks,
Shailabh


^ permalink raw reply	[flat|nested] 134+ messages in thread

* Re: [Patch][RFC]  Disabling per-tgid stats on task exit in taskstats
  2006-06-26 17:33                               ` Jay Lan
  2006-06-26 17:52                                 ` Shailabh Nagar
@ 2006-06-26 17:55                                 ` Andrew Morton
  2006-06-26 18:00                                   ` Shailabh Nagar
  1 sibling, 1 reply; 134+ messages in thread
From: Andrew Morton @ 2006-06-26 17:55 UTC (permalink / raw)
  To: Jay Lan; +Cc: nagar, jlan, balbir, csturtiv, linux-kernel

On Mon, 26 Jun 2006 10:33:04 -0700
Jay Lan <jlan@sgi.com> wrote:

> > Will that work for everyone ?
> 
> As long as the per-pid delayacct struct has a pointer to the per-tgid
> data struct and deoes not need to go through the loop on every exit.

My brain is wilting, and time is moving along.

Balbir, are you able to summarise where we stand wrt
per-task-delay-accounting-* now?

What problem have we identified?  How close are we to finding agreeable
solutions to them?

My general sense is that there's some rework needed, and that rework will
affect the userspace interfaces, which is a problem for a 2.6.18 merge.

^ permalink raw reply	[flat|nested] 134+ messages in thread

* Re: [Patch][RFC]  Disabling per-tgid stats on task exit in taskstats
  2006-06-26 17:55                                 ` Andrew Morton
@ 2006-06-26 18:00                                   ` Shailabh Nagar
  2006-06-26 18:12                                     ` Andrew Morton
  0 siblings, 1 reply; 134+ messages in thread
From: Shailabh Nagar @ 2006-06-26 18:00 UTC (permalink / raw)
  To: Andrew Morton; +Cc: Jay Lan, jlan, balbir, csturtiv, linux-kernel

Andrew Morton wrote:

>On Mon, 26 Jun 2006 10:33:04 -0700
>Jay Lan <jlan@sgi.com> wrote:
>
>  
>
>>>Will that work for everyone ?
>>>      
>>>
>>As long as the per-pid delayacct struct has a pointer to the per-tgid
>>data struct and deoes not need to go through the loop on every exit.
>>    
>>
>
>My brain is wilting, and time is moving along.
>
>Balbir, are you able to summarise where we stand wrt
>per-task-delay-accounting-* now?
>  
>
Andrew,

I'm maintaining per-task delay accounting and taskstats interface patches 
so I'll take the liberty to reply :-)

>What problem have we identified?  How close are we to finding agreeable
>solutions to them?
>  
>
The main problems identified are:

1. extra sending of per-tgid stats on every thread exit
2. unnecessary send of per-tgid stats when there are no listeners
3. unnecessary linkage of delayacct accumulation into per-tgid stats 
with sending out of taskstats

All three have an acceptable solution.
1. & 3. are going to be addressed in a patch I'm sending out shortly.
2. in a separate patch also being sent out shortly.

>My general sense is that there's some rework needed, and that rework will
>affect the userspace interfaces, which is a problem for a 2.6.18 merge.
>  
>
The rework will affect the number of per-tgid records that userspace 
sees (fewer), not the format or any of the
other details regarding the genetlink interface.
Will that be a problem for userspace ?

--Shailabh


^ permalink raw reply	[flat|nested] 134+ messages in thread

* Re: [Patch][RFC]  Disabling per-tgid stats on task exit in taskstats
  2006-06-26 18:00                                   ` Shailabh Nagar
@ 2006-06-26 18:12                                     ` Andrew Morton
  2006-06-26 18:26                                       ` Jay Lan
  0 siblings, 1 reply; 134+ messages in thread
From: Andrew Morton @ 2006-06-26 18:12 UTC (permalink / raw)
  To: Shailabh Nagar; +Cc: jlan, jlan, balbir, csturtiv, linux-kernel

On Mon, 26 Jun 2006 14:00:45 -0400
Shailabh Nagar <nagar@watson.ibm.com> wrote:

> >
> >Balbir, are you able to summarise where we stand wrt
> >per-task-delay-accounting-* now?
> >  
> >
> Andrew,
> 
> I'm maintaining per-task delay accouting and taskstats interface patches 
> so I'll take the liberty to reply :-)

Sorry, too many IBMers ;)

> >What problem have we identified?  How close are we to finding agreeable
> >solutions to them?
> >  
> >
> The main problems identified are:
> 
> 1. extra sending of per-tgid stats on every thread exit
> 2. unnecessary send of per-tgid stats when there are no listeners
> 3. unnecessary linkage of delayacct accumalation into per-tgid stats 
> with sending out of taskstats
> 
> All three have an acceptable solution.
> 1. & 3. are going to be addressed in a patch I'm sending out shortly.
> 2. in a separate patch also being sent out shortly.

Great.

> >My general sense is that there's some rework needed, and that rework will
> >affect the userspace interfaces, which is a problem for a 2.6.18 merge.
> >  
> >
> The rework will affect the number of per-tgid records that userspace 
> sees (fewer), not the format or any of the
> other details regarding the genetlink interface.
> Will that be a problem for userspace ?

Nope.

OK, please send the patch and I'll plan on sending this lot to Linus
Thursdayish.


^ permalink raw reply	[flat|nested] 134+ messages in thread

* Re: [Patch][RFC]  Disabling per-tgid stats on task exit in taskstats
  2006-06-26 18:12                                     ` Andrew Morton
@ 2006-06-26 18:26                                       ` Jay Lan
  2006-06-26 18:39                                         ` Andrew Morton
  0 siblings, 1 reply; 134+ messages in thread
From: Jay Lan @ 2006-06-26 18:26 UTC (permalink / raw)
  To: Andrew Morton; +Cc: Shailabh Nagar, jlan, balbir, csturtiv, linux-kernel

Andrew Morton wrote:
> On Mon, 26 Jun 2006 14:00:45 -0400
> Shailabh Nagar <nagar@watson.ibm.com> wrote:
> 
> 
>>>Balbir, are you able to summarise where we stand wrt
>>>per-task-delay-accounting-* now?
>>> 
>>>
>>
>>Andrew,
>>
>>I'm maintaining per-task delay accouting and taskstats interface patches 
>>so I'll take the liberty to reply :-)
> 
> 
> Sorry, too many IBMers ;)
> 
> 
>>>What problem have we identified?  How close are we to finding agreeable
>>>solutions to them?
>>> 
>>>
>>
>>The main problems identified are:
>>
>>1. extra sending of per-tgid stats on every thread exit
>>2. unnecessary send of per-tgid stats when there are no listeners
>>3. unnecessary linkage of delayacct accumalation into per-tgid stats 
>>with sending out of taskstats
>>
>>All three have an acceptable solution.
>>1. & 3. are going to be addressed in a patch I'm sending out shortly.
>>2. in a separate patch also being sent out shortly.
> 
> 
> Great.
> 
> 
>>>My general sense is that there's some rework needed, and that rework will
>>>affect the userspace interfaces, which is a problem for a 2.6.18 merge.
>>> 
>>>
>>
>>The rework will affect the number of per-tgid records that userspace 
>>sees (fewer), not the format or any of the
>>other details regarding the genetlink interface.
>>Will that be a problem for userspace ?
> 
> 
> Nope.
> 
> OK, please send the patch and I'll plan on sending this lot to Linus
> Thursdayish.

These new patches are fresh out of Shailabh's stove (well, i have
seen one, but not the other yet) and i have not had a chance to look
at them yet. There is no need to rush, is there?

- jay

> 


^ permalink raw reply	[flat|nested] 134+ messages in thread

* Re: [Patch][RFC]  Disabling per-tgid stats on task exit in taskstats
  2006-06-26 18:26                                       ` Jay Lan
@ 2006-06-26 18:39                                         ` Andrew Morton
  2006-06-26 18:49                                           ` Shailabh Nagar
                                                             ` (2 more replies)
  0 siblings, 3 replies; 134+ messages in thread
From: Andrew Morton @ 2006-06-26 18:39 UTC (permalink / raw)
  To: Jay Lan; +Cc: nagar, jlan, balbir, csturtiv, linux-kernel

On Mon, 26 Jun 2006 11:26:53 -0700
Jay Lan <jlan@sgi.com> wrote:

> > OK, please send the patch and I'll plan on sending this lot to Linus
> > Thursdayish.
> 
> These new patches are fresh out of Shailabh's stove (well, i have
> seen one, but not the other yet) and i have not had chance to look
> at them yet. No need to rush, does it?

Thursday's a long way off ;)

As long as we have a high level of confidence that any remaining issues
will be fixed within a few weeks, this code is OK for a merge.

There's a general agreement that the kernel needs this feature - people
have been mucking around with it for years.  Let's put the effort in and
make it happen.

^ permalink raw reply	[flat|nested] 134+ messages in thread

* Re: [Patch][RFC]  Disabling per-tgid stats on task exit in taskstats
  2006-06-26 18:39                                         ` Andrew Morton
@ 2006-06-26 18:49                                           ` Shailabh Nagar
  2006-06-26 19:00                                           ` Jay Lan
  2006-06-28 21:30                                           ` Jay Lan
  2 siblings, 0 replies; 134+ messages in thread
From: Shailabh Nagar @ 2006-06-26 18:49 UTC (permalink / raw)
  To: Andrew Morton; +Cc: Jay Lan, jlan, balbir, csturtiv, linux-kernel

Andrew Morton wrote:
> On Mon, 26 Jun 2006 11:26:53 -0700
> Jay Lan <jlan@sgi.com> wrote:
> 
> 
>>>OK, please send the patch and I'll plan on sending this lot to Linus
>>>Thursdayish.
>>
>>These new patches are fresh out of Shailabh's stove (well, i have
>>seen one, but not the other yet) and i have not had chance to look
>>at them yet. No need to rush, does it?
> 
> 
> Thursday's a long way off ;)

Yup !

I'll be working with Jay's CSA patches and with him to make sure any
remaining concerns are addressed.

> 
> As long as we have a high level of confidence that any remaining issues
> will be fixed within a few weeks, this code is OK for a merge.
> 
> There's a general agreement that the kernel needs this feature - people
> have been mucking around with it for years.  Let's put the effort in and
> make it happen.

Thanks for that ! Yes, will make this happen. If there are any concerns about
the code, pls let me know and we'll fix it asap.

Thanks,
Shailabh


^ permalink raw reply	[flat|nested] 134+ messages in thread

* Re: [Patch][RFC]  Disabling per-tgid stats on task exit in taskstats
  2006-06-26 18:39                                         ` Andrew Morton
  2006-06-26 18:49                                           ` Shailabh Nagar
@ 2006-06-26 19:00                                           ` Jay Lan
  2006-06-28 21:30                                           ` Jay Lan
  2 siblings, 0 replies; 134+ messages in thread
From: Jay Lan @ 2006-06-26 19:00 UTC (permalink / raw)
  To: Andrew Morton; +Cc: nagar, jlan, balbir, csturtiv, linux-kernel

Andrew Morton wrote:
> On Mon, 26 Jun 2006 11:26:53 -0700
> Jay Lan <jlan@sgi.com> wrote:
> 
> 
>>>OK, please send the patch and I'll plan on sending this lot to Linus
>>>Thursdayish.
>>
>>These new patches are fresh out of Shailabh's stove (well, i have
>>seen one, but not the other yet) and i have not had chance to look
>>at them yet. No need to rush, does it?
> 
> 
> Thursday's a long way off ;)
> 
> As long as we have a high level of confidence that any remaining issues
> will be fixed within a few weeks, this code is OK for a merge.
> 
> There's a general agreement that the kernel needs this feature - people
> have been mucking around with it for years.  Let's put the effort in and
> make it happen.

Yes, that is our intent! ;)

- jay



^ permalink raw reply	[flat|nested] 134+ messages in thread

* Re: [Patch][RFC]  Disabling per-tgid stats on task exit in taskstats
  2006-06-26 18:39                                         ` Andrew Morton
  2006-06-26 18:49                                           ` Shailabh Nagar
  2006-06-26 19:00                                           ` Jay Lan
@ 2006-06-28 21:30                                           ` Jay Lan
  2006-06-28 21:53                                             ` Andrew Morton
  2 siblings, 1 reply; 134+ messages in thread
From: Jay Lan @ 2006-06-28 21:30 UTC (permalink / raw)
  To: Andrew Morton; +Cc: nagar, balbir, csturtiv, linux-kernel

Hi Andrew,

Testing with all delay-accounting patches as they were in your 2.6.17-mm3
tree as of 6/26 afternoon (ie, including the send-tgid-once, and
avoid-sending-without-listeners patches), i do not see a measurable
performance difference with and without tgid processing when the
exit rate was controlled to around 1000 exits/sec.

As a result i am OK to not include a design of a system-wide
init-time configuration option (for per-thread group data processing)
in the taskstats interface.

The ENOBUFS i experienced in my testing would start to happen
when the exit rate reached around 14000 exits/sec. While our fields confirmed
that a rate of 1000 thread exits/sec was real, i have no reason to be
concerned about a 14000 exits/sec rate. ;)

Regards,
 - jay


Andrew Morton wrote:
> On Mon, 26 Jun 2006 11:26:53 -0700
> Jay Lan <jlan@sgi.com> wrote:
> 
> 
>>>OK, please send the patch and I'll plan on sending this lot to Linus
>>>Thursdayish.
>>
>>These new patches are fresh out of Shailabh's stove (well, i have
>>seen one, but not the other yet) and i have not had chance to look
>>at them yet. No need to rush, does it?
> 
> 
> Thursday's a long way off ;)
> 
> As long as we have a high level of confidence that any remaining issues
> will be fixed within a few weeks, this code is OK for a merge.
> 
> There's a general agreement that the kernel needs this feature - people
> have been mucking around with it for years.  Let's put the effort in and
> make it happen.



^ permalink raw reply	[flat|nested] 134+ messages in thread

* Re: [Patch][RFC]  Disabling per-tgid stats on task exit in taskstats
  2006-06-28 21:30                                           ` Jay Lan
@ 2006-06-28 21:53                                             ` Andrew Morton
  2006-06-28 22:02                                               ` Jay Lan
  0 siblings, 1 reply; 134+ messages in thread
From: Andrew Morton @ 2006-06-28 21:53 UTC (permalink / raw)
  To: Jay Lan; +Cc: nagar, balbir, csturtiv, linux-kernel

Jay Lan <jlan@engr.sgi.com> wrote:
>
> Testing with all delay-accounting patches as been in your 2.6.27-mm3
> tree as of 6/26 afternoon (ie, including the send-tgid-once, and
> avoid-sending-without-listeners patches), i do not see measurable
> performance difference with and without tgid processing when the
> exit rate was controlled to around 1000 exit/sec.
> 
> As a result i am OK to not include a design of a system-wise
> init-time configuration option (for per-thread group data processing)
> in the taskstats interface.

Sounds good, thanks.

> The ENOBUFS i experienced in my testing would start to happen
> when exit rate at around 14000 exits/sec. While our fields confirmed
> that a 1000 threads exit/sec was a real, i have no reason to be
> concerned of 14000 exits/sec rate. ;)

1000 exits/sec/CPU can happen.  How many CPUs did that machine have?

^ permalink raw reply	[flat|nested] 134+ messages in thread

* Re: [Patch][RFC]  Disabling per-tgid stats on task exit in taskstats
  2006-06-28 21:53                                             ` Andrew Morton
@ 2006-06-28 22:02                                               ` Jay Lan
  2006-06-29  8:40                                                 ` Paul Jackson
  2006-06-29 12:42                                                 ` Shailabh Nagar
  0 siblings, 2 replies; 134+ messages in thread
From: Jay Lan @ 2006-06-28 22:02 UTC (permalink / raw)
  To: Andrew Morton; +Cc: nagar, balbir, csturtiv, linux-kernel

Andrew Morton wrote:
>>The ENOBUFS i experienced in my testing would start to happen
>>when exit rate at around 14000 exits/sec. While our fields confirmed
>>that a 1000 threads exit/sec was a real, i have no reason to be
>>concerned of 14000 exits/sec rate. ;)
>>    
>
>1000 exits/sec/CPU can happen.  How many CPUs did that machine have?
>  

The test machine was a 2 CPU IA64.

- jay



^ permalink raw reply	[flat|nested] 134+ messages in thread

* Re: [Patch][RFC]  Disabling per-tgid stats on task exit in taskstats
  2006-06-28 22:02                                               ` Jay Lan
@ 2006-06-29  8:40                                                 ` Paul Jackson
  2006-06-29 12:30                                                   ` Valdis.Kletnieks
  2006-06-29 12:42                                                 ` Shailabh Nagar
  1 sibling, 1 reply; 134+ messages in thread
From: Paul Jackson @ 2006-06-29  8:40 UTC (permalink / raw)
  To: Jay Lan; +Cc: akpm, nagar, balbir, csturtiv, linux-kernel

Jay wrote:
> The ENOBUFS i experienced in my testing would start to happen
> when exit rate at around 14000 exits/sec. While our fields confirmed
> that a 1000 threads exit/sec was a real, i have no reason to be
> concerned of 14000 exits/sec rate. ;)

Andrew wrote:
>1000 exits/sec/CPU can happen.  How many CPUs did that machine have?

Jay - what happens if we have 1024 CPUs (the current default config
for ia64/sn2)?

My naive expectation would be that the rate of exits/sec would go up as
the number of CPUs. In other words, I'd expect the exits/sec/CPU to be
a rough constant, slowly increasing over the years as the CPU clock
rate goes up.

So if current CPU technology can generate 1000 exits/sec, and if we
have 1024 CPUs then doesn't that mean we'd like to handle a million
exits/sec?

-- 
                  I won't rest till it's the best ...
                  Programmer, Linux Scalability
                  Paul Jackson <pj@sgi.com> 1.925.600.0401

^ permalink raw reply	[flat|nested] 134+ messages in thread

* Re: [Patch][RFC] Disabling per-tgid stats on task exit in taskstats
  2006-06-29  8:40                                                 ` Paul Jackson
@ 2006-06-29 12:30                                                   ` Valdis.Kletnieks
  2006-06-29 16:44                                                     ` Paul Jackson
  0 siblings, 1 reply; 134+ messages in thread
From: Valdis.Kletnieks @ 2006-06-29 12:30 UTC (permalink / raw)
  To: Paul Jackson; +Cc: Jay Lan, akpm, nagar, balbir, csturtiv, linux-kernel

[-- Attachment #1: Type: text/plain, Size: 982 bytes --]

On Thu, 29 Jun 2006 01:40:50 PDT, Paul Jackson said:

> Jay - what happens if we have 1024 CPUs (the current default config
> for ia64/sn2)?
> 
> My naive expectation would be that the rate of exits/sec would go up as
> the number of CPUs. In other words, I'd expect the exits/sec/CPU to be
> a rough constant, slowly increasing over the years as the CPU clock
> rate goes up.

You're probably correct on that model. However, it all depends on the actual
workload. Are people who actually have large-CPU (>256) systems actually
running fork()-heavy things like webservers on them, or are they running things
like database servers and computations, which tend to have persistent
processes?

Of course, I'm biased by my environment - the big Mac cluster and 2 larger SGI
boxes we have quite likely spend hours at a time where the exit/sec for the
entire image is in the single and low double digits, and the per-cpu value
is down in the noise.  But they're pure machoflops boxes....


[-- Attachment #2: Type: application/pgp-signature, Size: 226 bytes --]

^ permalink raw reply	[flat|nested] 134+ messages in thread

* Re: [Patch][RFC]  Disabling per-tgid stats on task exit in taskstats
  2006-06-28 22:02                                               ` Jay Lan
  2006-06-29  8:40                                                 ` Paul Jackson
@ 2006-06-29 12:42                                                 ` Shailabh Nagar
  1 sibling, 0 replies; 134+ messages in thread
From: Shailabh Nagar @ 2006-06-29 12:42 UTC (permalink / raw)
  To: Jay Lan; +Cc: Andrew Morton, balbir, csturtiv, linux-kernel

Jay Lan wrote:

>Andrew Morton wrote:
>  
>
>>>The ENOBUFS i experienced in my testing would start to happen
>>>when exit rate at around 14000 exits/sec. While our fields confirmed
>>>that a 1000 threads exit/sec was a real, i have no reason to be
>>>concerned of 14000 exits/sec rate. ;)
>>>   
>>>      
>>>
>>1000 exits/sec/CPU can happen.  How many CPUs did that machine have?
>> 
>>    
>>
>
>The test machine was a 2 CPU IA64.
>  
>
Increasing the receive buffer size for the netlink socket may help.

--Shailabh



^ permalink raw reply	[flat|nested] 134+ messages in thread

* Re: [Patch][RFC] Disabling per-tgid stats on task exit in taskstats
  2006-06-29 12:30                                                   ` Valdis.Kletnieks
@ 2006-06-29 16:44                                                     ` Paul Jackson
  2006-06-29 18:01                                                       ` Andrew Morton
  2006-06-29 18:05                                                       ` Nick Piggin
  0 siblings, 2 replies; 134+ messages in thread
From: Paul Jackson @ 2006-06-29 16:44 UTC (permalink / raw)
  To: Valdis.Kletnieks; +Cc: jlan, akpm, nagar, balbir, csturtiv, linux-kernel

> You're probably correct on that model. However, it all depends on the actual
> workload. Are people who actually have large-CPU (>256) systems actually
> running fork()-heavy things like webservers on them, or are they running things
> like database servers and computations, which tend to have persistent
> processes?

It may well be mostly as you say - the large-CPU systems not running
the fork() heavy jobs.

Sooner or later, someone will want to run a fork()-heavy job on a
large-CPU system.  On a 1024 CPU system, it would apparently take
just 14 exits/sec/CPU to hit this bottleneck, if Jay's number of
14000 applied.

Chris Sturtivant's reply is reasonable -- we'll hit it sooner or later,
and deal with it then.

-- 
                  I won't rest till it's the best ...
                  Programmer, Linux Scalability
                  Paul Jackson <pj@sgi.com> 1.925.600.0401

^ permalink raw reply	[flat|nested] 134+ messages in thread

* Re: [Patch][RFC] Disabling per-tgid stats on task exit in taskstats
  2006-06-29 16:44                                                     ` Paul Jackson
@ 2006-06-29 18:01                                                       ` Andrew Morton
  2006-06-29 18:07                                                         ` Paul Jackson
                                                                           ` (4 more replies)
  2006-06-29 18:05                                                       ` Nick Piggin
  1 sibling, 5 replies; 134+ messages in thread
From: Andrew Morton @ 2006-06-29 18:01 UTC (permalink / raw)
  To: Paul Jackson
  Cc: Valdis.Kletnieks, jlan, nagar, balbir, csturtiv, linux-kernel

On Thu, 29 Jun 2006 09:44:08 -0700
Paul Jackson <pj@sgi.com> wrote:

> > You're probably correct on that model. However, it all depends on the actual
> > workload. Are people who actually have large-CPU (>256) systems actually
> > running fork()-heavy things like webservers on them, or are they running things
> > like database servers and computations, which tend to have persistent
> > processes?
> 
> It may well be mostly as you say - the large-CPU systems not running
> the fork() heavy jobs.
> 
> Sooner or later, someone will want to run a fork()-heavy job on a
> large-CPU system.  On a 1024 CPU system, it would apparently take
> just 14 exits/sec/CPU to hit this bottleneck, if Jay's number of
> 14000 applied.
> 
> Chris Sturdivant's reply is reasonable -- we'll hit it sooner or later,
> and deal with it then.
> 

I agree, and I'm viewing this as blocking the taskstats merge.  Because if
this _is_ a problem then it's a big one because fixing it will be
intrusive, and might well involve userspace-visible changes.

The only ways I can see of fixing the problem generally are to either

a) throw more CPU(s) at stats collection: allow userspace to register for
   "stats generated by CPU N", then run a stats collection daemon on each
   CPU or

b) make the kernel recognise when it's getting overloaded and switch to
   some degraded mode where it stops trying to send all the data to
   userspace - just send a summary, or a "we goofed" message or something.

^ permalink raw reply	[flat|nested] 134+ messages in thread

* Re: [Patch][RFC] Disabling per-tgid stats on task exit in taskstats
  2006-06-29 16:44                                                     ` Paul Jackson
  2006-06-29 18:01                                                       ` Andrew Morton
@ 2006-06-29 18:05                                                       ` Nick Piggin
  1 sibling, 0 replies; 134+ messages in thread
From: Nick Piggin @ 2006-06-29 18:05 UTC (permalink / raw)
  To: Paul Jackson
  Cc: Valdis.Kletnieks, jlan, akpm, nagar, balbir, csturtiv, linux-kernel

Paul Jackson wrote:
>>You're probably correct on that model. However, it all depends on the actual
>>workload. Are people who actually have large-CPU (>256) systems actually
>>running fork()-heavy things like webservers on them, or are they running things
>>like database servers and computations, which tend to have persistent
>>processes?
> 
> 
> It may well be mostly as you say - the large-CPU systems not running
> the fork() heavy jobs.
> 
> Sooner or later, someone will want to run a fork()-heavy job on a
> large-CPU system.  On a 1024 CPU system, it would apparently take
> just 14 exits/sec/CPU to hit this bottleneck, if Jay's number of
> 14000 applied.

Half the CPUs in that system are probably going to be several
router hops away, won't they? I'll take a guess and say they're
an order of magnitude too optimistic for such a system ;)

-- 
SUSE Labs, Novell Inc.
Send instant messages to your online friends http://au.messenger.yahoo.com 

^ permalink raw reply	[flat|nested] 134+ messages in thread

* Re: [Patch][RFC] Disabling per-tgid stats on task exit in taskstats
  2006-06-29 18:01                                                       ` Andrew Morton
@ 2006-06-29 18:07                                                         ` Paul Jackson
  2006-06-29 18:26                                                         ` Paul Jackson
                                                                           ` (3 subsequent siblings)
  4 siblings, 0 replies; 134+ messages in thread
From: Paul Jackson @ 2006-06-29 18:07 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Valdis.Kletnieks, jlan, nagar, balbir, csturtiv, linux-kernel

Andrew wrote:
> I agree, and I'm viewing this as blocking the taskstats merge.  Because if
> this _is_ a problem then it's a big one because fixing it will be
> intrusive, and might well involve userspace-visible changes.

Yup.

-- 
                  I won't rest till it's the best ...
                  Programmer, Linux Scalability
                  Paul Jackson <pj@sgi.com> 1.925.600.0401

^ permalink raw reply	[flat|nested] 134+ messages in thread

* Re: [Patch][RFC] Disabling per-tgid stats on task exit in taskstats
  2006-06-29 18:01                                                       ` Andrew Morton
  2006-06-29 18:07                                                         ` Paul Jackson
@ 2006-06-29 18:26                                                         ` Paul Jackson
  2006-06-29 19:15                                                           ` Shailabh Nagar
  2006-06-29 19:22                                                           ` Shailabh Nagar
  2006-06-29 19:10                                                         ` Shailabh Nagar
                                                                           ` (2 subsequent siblings)
  4 siblings, 2 replies; 134+ messages in thread
From: Paul Jackson @ 2006-06-29 18:26 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Valdis.Kletnieks, jlan, nagar, balbir, csturtiv, linux-kernel

Andrew wrote:
> a) throw more CPU(s) at stats collection: allow userspace to register for
>    "stats generated by CPU N", then run a stats collection daemon on each
>    CPU or

I wonder if we could make the collector per-cpuset.

-- 
                  I won't rest till it's the best ...
                  Programmer, Linux Scalability
                  Paul Jackson <pj@sgi.com> 1.925.600.0401

^ permalink raw reply	[flat|nested] 134+ messages in thread

* Re: [Patch][RFC] Disabling per-tgid stats on task exit in taskstats
  2006-06-29 18:01                                                       ` Andrew Morton
  2006-06-29 18:07                                                         ` Paul Jackson
  2006-06-29 18:26                                                         ` Paul Jackson
@ 2006-06-29 19:10                                                         ` Shailabh Nagar
  2006-06-29 19:23                                                           ` Paul Jackson
  2006-06-29 19:33                                                           ` Andrew Morton
  2006-06-29 19:33                                                         ` Jay Lan
  2006-06-30 18:53                                                         ` Shailabh Nagar
  4 siblings, 2 replies; 134+ messages in thread
From: Shailabh Nagar @ 2006-06-29 19:10 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Paul Jackson, Valdis.Kletnieks, jlan, balbir, csturtiv,
	linux-kernel, Jamal, netdev

Andrew Morton wrote:

>On Thu, 29 Jun 2006 09:44:08 -0700
>Paul Jackson <pj@sgi.com> wrote:
>
>  
>
>>>You're probably correct on that model. However, it all depends on the actual
>>>workload. Are people who actually have large-CPU (>256) systems actually
>>>running fork()-heavy things like webservers on them, or are they running things
>>>like database servers and computations, which tend to have persistent
>>>processes?
>>>      
>>>
>>It may well be mostly as you say - the large-CPU systems not running
>>the fork() heavy jobs.
>>
>>Sooner or later, someone will want to run a fork()-heavy job on a
>>large-CPU system.  On a 1024 CPU system, it would apparently take
>>just 14 exits/sec/CPU to hit this bottleneck, if Jay's number of
>>14000 applied.
>>
>>Chris Sturdivant's reply is reasonable -- we'll hit it sooner or later,
>>and deal with it then.
>>
>>    
>>
>
>I agree, and I'm viewing this as blocking the taskstats merge.  Because if
>this _is_ a problem then it's a big one because fixing it will be
>intrusive, and might well involve userspace-visible changes.
>  
>
First off, just a reminder that this is inherently a netlink flow 
control issue...which was being exacerbated
earlier by taskstats' decision to send per-tgid data (no longer the case).

But I'd like to know what's our target here ? How many messages per 
second do we want to be able to be sent
and received without risking any loss of data ? Netlink will lose 
messages at a high enough rate so the design point
will need to be known a bit.

For statistics type usage of the genetlink/netlink, I would have thought 
that userspace, provided it is reliably informed
about the loss of data through ENOBUFS, could take measures to just 
account for the missing data and carry on ?




>The only ways I can see of fixing the problem generally are to either
>
>a) throw more CPU(s) at stats collection: allow userspace to register for
>   "stats generated by CPU N", then run a stats collection daemon on each
>   CPU or
>  
>
>b) make the kernel recognise when it's getting overloaded and switch to
>   some degraded mode where it stops trying to send all the data to
>   userspace - just send a summary, or a "we goofed" message or something.
>  
>
One of the unused features of genetlink that's meant for high volume 
data output from the kernel is
the "dump" callback of a genetlink connection. Essentially kernel space 
keeps getting provided sk_buffs
to fill which the netlink layer then supplies to user space (over time I 
guess ?)

But whatever we do, there's going to be some limit so it's useful to 
decide what the design point should be ?

Adding Jamal for his thoughts on netlink's flow control in the context 
of genetlink.

--Shailabh

^ permalink raw reply	[flat|nested] 134+ messages in thread

* Re: [Patch][RFC] Disabling per-tgid stats on task exit in taskstats
  2006-06-29 18:26                                                         ` Paul Jackson
@ 2006-06-29 19:15                                                           ` Shailabh Nagar
  2006-06-29 19:41                                                             ` Paul Jackson
  2006-06-29 19:22                                                           ` Shailabh Nagar
  1 sibling, 1 reply; 134+ messages in thread
From: Shailabh Nagar @ 2006-06-29 19:15 UTC (permalink / raw)
  To: Paul Jackson
  Cc: Andrew Morton, Valdis.Kletnieks, jlan, balbir, csturtiv, linux-kernel

Paul Jackson wrote:

>Andrew wrote:
>  
>
>>a) throw more CPU(s) at stats collection: allow userspace to register for
>>   "stats generated by CPU N", then run a stats collection daemon on each
>>   CPU or
>>    
>>
>
>I wonder if we could make the collector per-cpuset.
>  
>
I suppose this is because cpusets offer some middle ground between 
collecting data per-cpu
vs. collecting it for all cpus ?

What happens when someone is using cpusets on such a machine and changes 
its membership
in response to other needs ? All taskstats users would need to monitor 
for such changes and
adjust their processing....seems like unnecessary tying up of two 
unrelated concepts.

--Shailabh




^ permalink raw reply	[flat|nested] 134+ messages in thread

* Re: [Patch][RFC] Disabling per-tgid stats on task exit in taskstats
  2006-06-29 18:26                                                         ` Paul Jackson
  2006-06-29 19:15                                                           ` Shailabh Nagar
@ 2006-06-29 19:22                                                           ` Shailabh Nagar
  1 sibling, 0 replies; 134+ messages in thread
From: Shailabh Nagar @ 2006-06-29 19:22 UTC (permalink / raw)
  To: Paul Jackson
  Cc: Andrew Morton, Valdis.Kletnieks, jlan, balbir, csturtiv, linux-kernel

Paul Jackson wrote:

>Andrew wrote:
>  
>
>>a) throw more CPU(s) at stats collection: allow userspace to register for
>>   "stats generated by CPU N", then run a stats collection daemon on each
>>   CPU or
>>    
>>
>
>I wonder if we could make the collector per-cpuset.
>
>  
>
I suppose this is because cpusets offer some middle ground between 
collecting data per-cpu
vs. collecting it for all cpus (as being done now) ?

What happens when someone is using cpusets on such a machine and changes 
its membership
in response to other needs ? All taskstats users would need to monitor 
for such changes and
adjust their processing....seems like unnecessary tying up of two 
unrelated concepts.

--Shailabh



^ permalink raw reply	[flat|nested] 134+ messages in thread

* Re: [Patch][RFC] Disabling per-tgid stats on task exit in taskstats
  2006-06-29 19:10                                                         ` Shailabh Nagar
@ 2006-06-29 19:23                                                           ` Paul Jackson
  2006-06-29 19:33                                                           ` Andrew Morton
  1 sibling, 0 replies; 134+ messages in thread
From: Paul Jackson @ 2006-06-29 19:23 UTC (permalink / raw)
  To: Shailabh Nagar
  Cc: akpm, Valdis.Kletnieks, jlan, balbir, csturtiv, linux-kernel,
	hadi, netdev

Shailabh wrote:
> First off, just a reminder that this is inherently a netlink flow
> control issue...which was being exacerbated earlier by taskstats
> decision to send per-tgid data (no longer the case).
> 
> But I'd like to know whats our target here ? How many messages
> per second do we want to be able to be sent and received without
> risking any loss of data ? Netlink will lose messages at a high
> enough rate so the design point will need to be known a bit.

Perhaps it's not so much an issue of the design rate, as an issue of
how we deal with hitting the limit.  Sooner or later, perhaps due to
operator error, almost any implementable rate will be exceeded.

Ideally, we would have both of the remedies that Andrew mentioned,
rephrasing:
 1) a way for a customer who needs a higher rate to scale
    the useful resources he can apply to the collection, and
 2) a clear indicator when the supported rate was exceeded
    anyway.

> For statistics type usage of the genetlink/netlink, I would have
> thought that userspace, provided it is reliably informed about the loss
> of data through ENOBUFS, could take measures to just account for the
> missing data and carry on ?

If that's so, then the ENOBUFS error may well meet my remedy (2) above,
leaving just the question of how a customer could scale to higher
rates, if they found it was worth doing so.

-- 
                  I won't rest till it's the best ...
                  Programmer, Linux Scalability
                  Paul Jackson <pj@sgi.com> 1.925.600.0401

^ permalink raw reply	[flat|nested] 134+ messages in thread

* Re: [Patch][RFC] Disabling per-tgid stats on task exit in taskstats
  2006-06-29 18:01                                                       ` Andrew Morton
                                                                           ` (2 preceding siblings ...)
  2006-06-29 19:10                                                         ` Shailabh Nagar
@ 2006-06-29 19:33                                                         ` Jay Lan
  2006-06-30 18:53                                                         ` Shailabh Nagar
  4 siblings, 0 replies; 134+ messages in thread
From: Jay Lan @ 2006-06-29 19:33 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Paul Jackson, Valdis.Kletnieks, jlan, nagar, balbir, csturtiv,
	linux-kernel

Andrew Morton wrote:
> On Thu, 29 Jun 2006 09:44:08 -0700
> Paul Jackson <pj@sgi.com> wrote:
> 
> 
>>>You're probably correct on that model. However, it all depends on the actual
>>>workload. Are people who actually have large-CPU (>256) systems actually
>>>running fork()-heavy things like webservers on them, or are they running things
>>>like database servers and computations, which tend to have persistent
>>>processes?
>>
>>It may well be mostly as you say - the large-CPU systems not running
>>the fork() heavy jobs.
>>
>>Sooner or later, someone will want to run a fork()-heavy job on a
>>large-CPU system.  On a 1024 CPU system, it would apparently take
>>just 14 exits/sec/CPU to hit this bottleneck, if Jay's number of
>>14000 applied.
>>
>>Chris Sturdivant's reply is reasonable -- we'll hit it sooner or later,
>>and deal with it then.
>>
> 
> 
> I agree, and I'm viewing this as blocking the taskstats merge.  Because if
> this _is_ a problem then it's a big one because fixing it will be
> intrusive, and might well involve userspace-visible changes.
> 
> The only ways I can see of fixing the problem generally are to either
> 
> a) throw more CPU(s) at stats collection: allow userspace to register for
>    "stats generated by CPU N", then run a stats collection daemon on each
>    CPU or

Clearly this approach (or the per-cpuset as Paul suggested) can solve
large-CPU system issues. As technology advances, this _WILL_ become a
problem sooner or later.

However, taskstats header carries a version number. Would a change like
this be too intrusive to add to a later version?

Regards,
  - jay


> 
> b) make the kernel recognise when it's getting overloaded and switch to
>    some degraded mode where it stops trying to send all the data to
>    userspace - just send a summary, or a "we goofed" message or something.


^ permalink raw reply	[flat|nested] 134+ messages in thread

* Re: [Patch][RFC] Disabling per-tgid stats on task exit in taskstats
  2006-06-29 19:10                                                         ` Shailabh Nagar
  2006-06-29 19:23                                                           ` Paul Jackson
@ 2006-06-29 19:33                                                           ` Andrew Morton
  2006-06-29 19:43                                                             ` Shailabh Nagar
  2006-06-29 20:01                                                             ` Shailabh Nagar
  1 sibling, 2 replies; 134+ messages in thread
From: Andrew Morton @ 2006-06-29 19:33 UTC (permalink / raw)
  To: Shailabh Nagar
  Cc: pj, Valdis.Kletnieks, jlan, balbir, csturtiv, linux-kernel, hadi, netdev

On Thu, 29 Jun 2006 15:10:31 -0400
Shailabh Nagar <nagar@watson.ibm.com> wrote:

> >I agree, and I'm viewing this as blocking the taskstats merge.  Because if
> >this _is_ a problem then it's a big one because fixing it will be
> >intrusive, and might well involve userspace-visible changes.
> >  
> >
> First off, just a reminder that this is inherently a netlink flow 
> control issue...which was being exacerbated
> earlier by taskstats decision to send per-tgid data (no longer the case).
> 
> But I'd like to know whats our target here ? How many messages per 
> second do we want to be able to be sent
> and received without risking any loss of data ? Netlink will lose 
> messages at a high enough rate so the design point
> will need to be known a bit.
> 
> For statistics type usage of the genetlink/netlink, I would have thought 
> that userspace, provided it is reliably informed
> about the loss of data through ENOBUFS, could take measures to just 
> account for the missing data and carry on ?

Could be so.  But we need to understand how significant the impact of this
will be in practice.

We could find, once this is deployed in real production environments on
large machines that the data loss is sufficiently common and sufficiently
serious that the feature needs a lot of rework.

Now there's always a risk of that sort of thing happening with all
features, but it's usually not this evident so early in the development
process.  We need to get a better understanding of the risk before
proceeding too far.

And there's always a 100% reliable fix for this: throttling.  Make the
sender of the messages block until the consumer can catch up.  In some
situations, that is what people will want to be able to do.  I suspect a
good implementation would be to run a collection daemon on each CPU and
make the delivery be cpu-local.  That's sounding more like relayfs than
netlink.


^ permalink raw reply	[flat|nested] 134+ messages in thread

* Re: [Patch][RFC] Disabling per-tgid stats on task exit in taskstats
  2006-06-29 19:15                                                           ` Shailabh Nagar
@ 2006-06-29 19:41                                                             ` Paul Jackson
  2006-06-29 21:42                                                               ` Shailabh Nagar
  0 siblings, 1 reply; 134+ messages in thread
From: Paul Jackson @ 2006-06-29 19:41 UTC (permalink / raw)
  To: Shailabh Nagar
  Cc: akpm, Valdis.Kletnieks, jlan, balbir, csturtiv, linux-kernel

Shailabh wrote:
> I suppose this is because cpuset's offer some middle ground between 
> collecting data per-cpu vs. collecting it for all cpus ?

Yes - well said.  And I have this strange tendency to see all the
world's problems as opportunities for cpuset solutions <grin>.

> What happens when someone is using cpusets on such a machine and
> changes its membership in response to other needs.  All taskstats
> users would need to monitor for such changes and adjust their
> processing....seems like unnecessary tying up of two unrelated
> concepts.

I would not expect taskstat users to monitor for such changes.
I'd expect them to monitor the stats from whatever is in the
cpuset they named.  If a task moves out of that cpuset to another,
then tough -- that task will no longer be monitored by that
particular monitoring request.

Cpusets do provide a convenient middle ground, as you say, which
is really useful for reducing scaling issues such as this one to
a manageable size.

Per-cpu is too fine grained, and per-system too coarse.

An unnecessary tying - yes.  But perhaps a useful one.

-- 
                  I won't rest till it's the best ...
                  Programmer, Linux Scalability
                  Paul Jackson <pj@sgi.com> 1.925.600.0401

^ permalink raw reply	[flat|nested] 134+ messages in thread

* Re: [Patch][RFC] Disabling per-tgid stats on task exit in taskstats
  2006-06-29 19:33                                                           ` Andrew Morton
@ 2006-06-29 19:43                                                             ` Shailabh Nagar
  2006-06-29 20:00                                                               ` Andrew Morton
  2006-06-29 20:01                                                             ` Shailabh Nagar
  1 sibling, 1 reply; 134+ messages in thread
From: Shailabh Nagar @ 2006-06-29 19:43 UTC (permalink / raw)
  To: Andrew Morton
  Cc: pj, Valdis.Kletnieks, jlan, balbir, csturtiv, linux-kernel, hadi, netdev

Andrew Morton wrote:

>On Thu, 29 Jun 2006 15:10:31 -0400
>Shailabh Nagar <nagar@watson.ibm.com> wrote:
>
>  
>
>>>I agree, and I'm viewing this as blocking the taskstats merge.  Because if
>>>this _is_ a problem then it's a big one because fixing it will be
>>>intrusive, and might well involve userspace-visible changes.
>>> 
>>>
>>>      
>>>
>>First off, just a reminder that this is inherently a netlink flow 
>>control issue...which was being exacerbated
>>earlier by taskstats decision to send per-tgid data (no longer the case).
>>
>>But I'd like to know whats our target here ? How many messages per 
>>second do we want to be able to be sent
>>and received without risking any loss of data ? Netlink will lose 
>>messages at a high enough rate so the design point
>>will need to be known a bit.
>>
>>For statistics type usage of the genetlink/netlink, I would have thought 
>>that userspace, provided it is reliably informed
>>about the loss of data through ENOBUFS, could take measures to just 
>>account for the missing data and carry on ?
>>    
>>
>
>Could be so.  But we need to understand how significant the impact of this
>will be in practice.
>
>We could find, once this is deployed is real production environments on
>large machines that the data loss is sufficiently common and sufficiently
>serious that the feature needs a lot of rework.
>
>Now there's always a risk of that sort of thing happening with all
>features, but it's usually not this evident so early in the development
>process.  We need to get a better understanding of the risk before
>proceeding too far.
>  
>

>And there's always a 100% reliable fix for this: throttling.  Make the
>sender of the messages block until the consumer can catch up. 
>
Is blocking exits an option ?

> In some
>situations, that is what people will want to be able to do.  I suspect a
>good implementation would be to run a collection daemon on each CPU and
>make the delivery be cpu-local.  That's sounding more like relayfs than
>netlink.
>  
>
Yup...the per-cpu, high speed requirements are up relayfs' alley, unless 
Jamal or netlink folks
are planning something (or can shed light on) how large flows can be 
managed over netlink. I suspect
this discussion has happened before :-)



^ permalink raw reply	[flat|nested] 134+ messages in thread

* Re: [Patch][RFC] Disabling per-tgid stats on task exit in taskstats
  2006-06-29 19:43                                                             ` Shailabh Nagar
@ 2006-06-29 20:00                                                               ` Andrew Morton
  2006-06-29 22:13                                                                 ` Shailabh Nagar
  0 siblings, 1 reply; 134+ messages in thread
From: Andrew Morton @ 2006-06-29 20:00 UTC (permalink / raw)
  To: Shailabh Nagar
  Cc: pj, Valdis.Kletnieks, jlan, balbir, csturtiv, linux-kernel, hadi, netdev

On Thu, 29 Jun 2006 15:43:41 -0400
Shailabh Nagar <nagar@watson.ibm.com> wrote:

> >Could be so.  But we need to understand how significant the impact of this
> >will be in practice.
> >
> >We could find, once this is deployed is real production environments on
> >large machines that the data loss is sufficiently common and sufficiently
> >serious that the feature needs a lot of rework.
> >
> >Now there's always a risk of that sort of thing happening with all
> >features, but it's usually not this evident so early in the development
> >process.  We need to get a better understanding of the risk before
> >proceeding too far.
> >  
> >
> 
> >And there's always a 100% reliable fix for this: throttling.  Make the
> >sender of the messages block until the consumer can catch up. 
> >
> Is blocking exits an option ?

I think it has to be an option.  I'm sure that some people under some
circumstances will just want to collect all the data, thank you very much.

And I doubt if it'll be a performance problem for them - the amount of CPU
time per exit will be small - if you're exiting at great frequency then the
stats collection overhead rises proportionately.  That is to be expected.

There will be buffering in the channel, so we'd expect to gather thousands
of records per context switch.

> > In some
> >situations, that is what people will want to be able to do.  I suspect a
> >good implementation would be to run a collection daemon on each CPU and
> >make the delivery be cpu-local.  That's sounding more like relayfs than
> >netlink.
> >  
> >
> Yup...the per-cpu, high speed requirements are up relayfs' alley, unless 
> Jamal or netlink folks
> are planning something (or can shed light on) how large flows can be 
> managed over netlink. I suspect
> this discussion has happened before :-)

yeah.

^ permalink raw reply	[flat|nested] 134+ messages in thread

* Re: [Patch][RFC] Disabling per-tgid stats on task exit in taskstats
  2006-06-29 19:33                                                           ` Andrew Morton
  2006-06-29 19:43                                                             ` Shailabh Nagar
@ 2006-06-29 20:01                                                             ` Shailabh Nagar
  2006-06-29 21:22                                                               ` Paul Jackson
  2006-06-29 22:54                                                               ` jamal
  1 sibling, 2 replies; 134+ messages in thread
From: Shailabh Nagar @ 2006-06-29 20:01 UTC (permalink / raw)
  To: Andrew Morton
  Cc: pj, Valdis.Kletnieks, jlan, balbir, csturtiv, linux-kernel, hadi, netdev

Andrew Morton wrote:

>On Thu, 29 Jun 2006 15:10:31 -0400
>Shailabh Nagar <nagar@watson.ibm.com> wrote:
>
>  
>
>>>I agree, and I'm viewing this as blocking the taskstats merge.  Because if
>>>this _is_ a problem then it's a big one because fixing it will be
>>>intrusive, and might well involve userspace-visible changes.
>>> 
>>>
>>>      
>>>
>>First off, just a reminder that this is inherently a netlink flow 
>>control issue...which was being exacerbated
>>earlier by taskstats decision to send per-tgid data (no longer the case).
>>
>>But I'd like to know whats our target here ? How many messages per 
>>second do we want to be able to be sent
>>and received without risking any loss of data ? Netlink will lose 
>>messages at a high enough rate so the design point
>>will need to be known a bit.
>>
>>For statistics type usage of the genetlink/netlink, I would have thought 
>>that userspace, provided it is reliably informed
>>about the loss of data through ENOBUFS, could take measures to just 
>>account for the missing data and carry on ?
>>    
>>
>
>Could be so.  But we need to understand how significant the impact of this
>will be in practice.
>
>We could find, once this is deployed is real production environments on
>large machines that the data loss is sufficiently common and sufficiently
>serious that the feature needs a lot of rework.
>
>Now there's always a risk of that sort of thing happening with all
>features, but it's usually not this evident so early in the development
>process.  We need to get a better understanding of the risk before
>proceeding too far.
>  
>
Ok.

I suppose we should first determine what number of tasks can be 
forked/exited at a sustained rate
on these m/c's and that would be one upper bound.

Paul, Chris, Jay,
What total exit rate would be a good upper bound ? How much memory do 
these 1024 CPU machines
have (in high end configurations, not just based on 64-bit 
addressability) and how many tasks can actually be
forked/exited in such a machine ?

>And there's always a 100% reliable fix for this: throttling.  Make the
>sender of the messages block until the consumer can catch up.  In some
>situations, that is what people will want to be able to do.  
>
Is this really an option for taskstats ? Allowing exits to get throttled 
? I suppose it's one way
but seems like overkill for something like stats.

>I suspect a
>good implementation would be to run a collection daemon on each CPU and
>make the delivery be cpu-local.  That's sounding more like relayfs than
>netlink.
>  
>
Yup...per-cpu, high speed delivery is looking like relayfs alright.

One option that we've not explored in detail is the "dump" functionality 
of genetlink which allows
kernel space to keep getting called with skb's to fill until it's done. 
How much buffering that affords us
in the face of a slow user is not known. But if we're discussing large 
exit rates happening in a burst, not
a sustained way, that may be one way out.

Jamal,
any thoughts on the flow control capabilities of netlink that apply here 
? Usage of the connection is to
supply statistics data to userspace.

--Shailabh


^ permalink raw reply	[flat|nested] 134+ messages in thread

* Re: [Patch][RFC] Disabling per-tgid stats on task exit in taskstats
  2006-06-29 20:01                                                             ` Shailabh Nagar
@ 2006-06-29 21:22                                                               ` Paul Jackson
  2006-06-29 22:54                                                               ` jamal
  1 sibling, 0 replies; 134+ messages in thread
From: Paul Jackson @ 2006-06-29 21:22 UTC (permalink / raw)
  To: Shailabh Nagar
  Cc: akpm, Valdis.Kletnieks, jlan, balbir, csturtiv, linux-kernel,
	hadi, netdev

Shailabh wrote:
>  How much memory do these 1024 CPU machines have 

From:

    http://www.hpcwire.com/hpc/653963.html (May 12, 2006)

    SGI has already shipped more than a dozen SGI systems with
    over a terabyte of memory and about a hundred systems of half
    a terabyte or larger. But the new Altix will have much larger
    memory capacities. The systems SGI has in mind will scale to tens
    of terabytes and beyond. In fact, a few SGI customers are already
    testing with systems in the 10-terabyte range. "The largest we
    have shipped is a 13-terabyte memory system for the Japan Atomic
    Energy Agency," said [SGI CTO Dr. Eng Lim] Goh.

-- 
                  I won't rest till it's the best ...
                  Programmer, Linux Scalability
                  Paul Jackson <pj@sgi.com> 1.925.600.0401

^ permalink raw reply	[flat|nested] 134+ messages in thread

* Re: [Patch][RFC] Disabling per-tgid stats on task exit in taskstats
  2006-06-29 19:41                                                             ` Paul Jackson
@ 2006-06-29 21:42                                                               ` Shailabh Nagar
  2006-06-29 21:54                                                                 ` Jay Lan
  2006-06-29 22:23                                                                 ` Paul Jackson
  0 siblings, 2 replies; 134+ messages in thread
From: Shailabh Nagar @ 2006-06-29 21:42 UTC (permalink / raw)
  To: Paul Jackson; +Cc: akpm, Valdis.Kletnieks, jlan, balbir, csturtiv, linux-kernel

Paul Jackson wrote:

>Shailabh wrote:
>  
>
>>I suppose this is because cpuset's offer some middle ground between 
>>collecting data per-cpu vs. collecting it for all cpus ?
>>    
>>
>
>Yes - well said.  And I have this strange tendency to see all the
>worlds problems as opportunities for cpuset solutions <grin>.
>
>  
>
>>What happens when someone is using cpusets on such a machine and
>>changes its membership in response to other needs.  All taskstats
>>users would need to monitor for such changes and adjust their
>>processing....seems like unnecessary tying up of two unrelated
>>concepts.
>>    
>>
>
>I would not expect taskstat users to monitor for such changes.
>I'd expect them to monitor the stats from whatever is in the
>cpuset they named.  If a task moves out of that cpuset to another,
>then tough -- that task will no longer be monitored by that
>particular monitoring request.
>
>Cpusets do provide a convenient middle ground, as you say, which
>is really useful for reducing scaling issues such as this one to
>a managable size.
>
>Per-cpu is too fine grained, and per-system too coarse.
>
>An unnecessary tying - yes.  But perhaps a useful one.
>  
>
The idea of collecting stats for a group of cpus rather than all (or 
one) seems attractive.
But cpusets don't :-)

How about if we did something simple like
having a separate listen group (within genetlink) for a reasonably large 
number of cpus
and have all the messages from those cpus multicast to the listeners of 
that group alone ?

e.g. currently we have only one TASKSTATS_LISTEN_GROUP
we could reserve the following
    TASKSTATS_LISTEN_GROUP_0
    TASKSTATS_LISTEN_GROUP_1....

where GROUP_0 handles cpus numbered 0-63 (or 31)....etc.

Advantages would be

1. Most users would still need to listen to the one group as they do
in the current design and others could listen to more, scaling up their 
userspace listening daemons
as appropriate (e.g. one daemon per listening group).

2. Userspace could be saved the bother of having too many streams of 
per-cpu data and reassembling them
in the order they were generated.

The moment we talk of splitting up the data stream generated by the 
kernel I suppose we have to do some
kind of timestamping so reassembly in the same order can be done. I 
can't see this mattering for the likes of
delay accounting and CSA but for future taskstats users, who knows.


--Shailabh



^ permalink raw reply	[flat|nested] 134+ messages in thread

* Re: [Patch][RFC] Disabling per-tgid stats on task exit in taskstats
  2006-06-29 21:42                                                               ` Shailabh Nagar
@ 2006-06-29 21:54                                                                 ` Jay Lan
  2006-06-29 22:09                                                                   ` Shailabh Nagar
  2006-06-29 22:23                                                                 ` Paul Jackson
  1 sibling, 1 reply; 134+ messages in thread
From: Jay Lan @ 2006-06-29 21:54 UTC (permalink / raw)
  To: Shailabh Nagar
  Cc: Paul Jackson, akpm, Valdis.Kletnieks, jlan, balbir, csturtiv,
	linux-kernel

Shailabh Nagar wrote:
> Paul Jackson wrote:
> 
>> Shailabh wrote:
>>  
>>
>>> I suppose this is because cpuset's offer some middle ground between 
>>> collecting data per-cpu vs. collecting it for all cpus ?
>>>   
>>
>>
>> Yes - well said.  And I have this strange tendency to see all the
>> worlds problems as opportunities for cpuset solutions <grin>.
>>
>>  
>>
>>> What happens when someone is using cpusets on such a machine and
>>> changes its membership in response to other needs.  All taskstats
>>> users would need to monitor for such changes and adjust their
>>> processing....seems like unnecessary tying up of two unrelated
>>> concepts.
>>>   
>>
>>
>> I would not expect taskstat users to monitor for such changes.
>> I'd expect them to monitor the stats from whatever is in the
>> cpuset they named.  If a task moves out of that cpuset to another,
>> then tough -- that task will no longer be monitored by that
>> particular monitoring request.
>>
>> Cpusets do provide a convenient middle ground, as you say, which
>> is really useful for reducing scaling issues such as this one to
>> a managable size.
>>
>> Per-cpu is too fine grained, and per-system too coarse.
>>
>> An unnecessary tying - yes.  But perhaps a useful one.
>>  
>>
> The idea of collecting stats for a group of cpus rather than all (or 
> one) seems attractive.
> But cpusets doesnt :-)
> 
> How about if we did something simple like
> having a separate listen group (within genetlink) for a reasonably large 
> number of cpus
> and have all the messages from those cpus multicast to the listeners of 
> that group alone ?
> 
> e.g. currently we have only one TASKSTATS_LISTEN_GROUP
> we could reserve the following
>    TASKSTATS_LISTEN_GROUP_0
>    TASKSTATS_LISTEN_GROUP_1....
> 
> where GROUP_0 handles cpus numbered 0-63 (or 31)....etc.
> 
> Advantages would be
> 
> 1. Most users would still need to listen to the one group as they do
> in the current design and others could listen to more, scaling up their 
> userspace listening daemons
> as appropriate (e.g. one daemon per listening group).
> 
> 2. Userspace could be saved the bother of having too many streams of 
> per-cpu data and reassemble them
> in the order they were generated.
> 
> The moment we talk of splitting up the data stream generated by the 
> kernel I suppose we have to do some
> kind of timestamping so reassembly in the same order can be done. I 
> can't see this mattering for the likes of
> delay accounting and CSA but for future taskstats users, who knows.

Timestamp of the taskstats messages or timestamp of the exiting task?
I include an exit_time field for the task as part of "Common
Accounting Fields" in my csa_taskstats patch i sent to you. So, we
have both start_time and exit_time.

Thanks,
  - jay

> 
> 
> --Shailabh
> 
> 


^ permalink raw reply	[flat|nested] 134+ messages in thread

* Re: [Patch][RFC] Disabling per-tgid stats on task exit in taskstats
  2006-06-29 21:54                                                                 ` Jay Lan
@ 2006-06-29 22:09                                                                   ` Shailabh Nagar
  0 siblings, 0 replies; 134+ messages in thread
From: Shailabh Nagar @ 2006-06-29 22:09 UTC (permalink / raw)
  To: Jay Lan
  Cc: Paul Jackson, akpm, Valdis.Kletnieks, jlan, balbir, csturtiv,
	linux-kernel

Jay Lan wrote:
> Shailabh Nagar wrote:
> 
>> Paul Jackson wrote:
>>
>>> Shailabh wrote:
>>>  
>>>
>>>> I suppose this is because cpuset's offer some middle ground between
>>>> collecting data per-cpu vs. collecting it for all cpus ?
>>>>   
>>>
>>>
>>>
>>> Yes - well said.  And I have this strange tendency to see all the
>>> worlds problems as opportunities for cpuset solutions <grin>.
>>>
>>>  
>>>
>>>> What happens when someone is using cpusets on such a machine and
>>>> changes its membership in response to other needs.  All taskstats
>>>> users would need to monitor for such changes and adjust their
>>>> processing....seems like unnecessary tying up of two unrelated
>>>> concepts.
>>>>   
>>>
>>>
>>>
>>> I would not expect taskstat users to monitor for such changes.
>>> I'd expect them to monitor the stats from whatever is in the
>>> cpuset they named.  If a task moves out of that cpuset to another,
>>> then tough -- that task will no longer be monitored by that
>>> particular monitoring request.
>>>
>>> Cpusets do provide a convenient middle ground, as you say, which
>>> is really useful for reducing scaling issues such as this one to
>>> a managable size.
>>>
>>> Per-cpu is too fine grained, and per-system too coarse.
>>>
>>> An unnecessary tying - yes.  But perhaps a useful one.
>>>  
>>>
>> The idea of collecting stats for a group of cpus rather than all (or
>> one) seems attractive.
>> But cpusets doesnt :-)
>>
>> How about if we did something simple like
>> having a separate listen group (within genetlink) for a reasonably
>> large number of cpus
>> and have all the messages from those cpus multicast to the listeners
>> of that group alone ?
>>
>> e.g. currently we have only one TASKSTATS_LISTEN_GROUP
>> we could reserve the following
>>    TASKSTATS_LISTEN_GROUP_0
>>    TASKSTATS_LISTEN_GROUP_1....
>>
>> where GROUP_0 handles cpus numbered 0-63 (or 31)....etc.
>>
>> Advantages would be
>>
>> 1. Most users would still need to listen to the one group as they do
>> in the current design and others could listen to more, scaling up
>> their userspace listening daemons
>> as appropriate (e.g. one daemon per listening group).
>>
>> 2. Userspace could be saved the bother of having too many streams of
>> per-cpu data and reassemble them
>> in the order they were generated.
>>
>> The moment we talk of splitting up the data stream generated by the
>> kernel I suppose we have to do some
>> kind of timestamping so reassembly in the same order can be done. I
>> can't see this mattering for the likes of
>> delay accounting and CSA but for future taskstats users, who knows.
> 
> 
> Timestamp of the taskstats messages or timestamp of the exiting task?

I meant a timestamp of the taskstats message...though the latter
(timestamp of exiting task) would also be ok since that would be called
at the same location in the exit path for each exit message sent.

> I include an exit_time field for the task as part of "Common
> Accounting Fields" in my csa_taskstats patch i sent to you. So, we
> have both start_time and exit_time.

Yes, that sort of thing should do. We would just need to generalize to the
taskstats layer.

Thanks,
Shailabh
> 
> Thanks,
>  - jay
> 
>>
>>
>> --Shailabh
>>
>>
> 


^ permalink raw reply	[flat|nested] 134+ messages in thread

* Re: [Patch][RFC] Disabling per-tgid stats on task exit in taskstats
  2006-06-29 20:00                                                               ` Andrew Morton
@ 2006-06-29 22:13                                                                 ` Shailabh Nagar
  2006-06-29 23:00                                                                   ` jamal
  0 siblings, 1 reply; 134+ messages in thread
From: Shailabh Nagar @ 2006-06-29 22:13 UTC (permalink / raw)
  To: Andrew Morton
  Cc: pj, Valdis.Kletnieks, jlan, balbir, csturtiv, linux-kernel, hadi, netdev

Andrew Morton wrote:

>>Yup...the per-cpu, high speed requirements are up relayfs' alley, unless 
>>Jamal or netlink folks
>>are planning something (or can shed light on) how large flows can be 
>>managed over netlink. I suspect
>>this discussion has happened before :-)
> 
> 
> yeah.

And now I remember why I didn't go down that path earlier. Relayfs is one-way
kernel->user and lacks the ability to send query commands from user space
that we need. Either we would need to send commands up through a separate interface
(even a syscall) or try and ensure that the existing genetlink interface can
scale better with message volume (including throttling).

--Shailabh




^ permalink raw reply	[flat|nested] 134+ messages in thread

* Re: [Patch][RFC] Disabling per-tgid stats on task exit in taskstats
  2006-06-29 21:42                                                               ` Shailabh Nagar
  2006-06-29 21:54                                                                 ` Jay Lan
@ 2006-06-29 22:23                                                                 ` Paul Jackson
  2006-06-30  0:15                                                                   ` Shailabh Nagar
       [not found]                                                                   ` <44A46C6C.1090405@watson.ibm.com>
  1 sibling, 2 replies; 134+ messages in thread
From: Paul Jackson @ 2006-06-29 22:23 UTC (permalink / raw)
  To: Shailabh Nagar
  Cc: akpm, Valdis.Kletnieks, jlan, balbir, csturtiv, linux-kernel

Shailabh wrote:
> The idea of collecting stats for a group of cpus rather than all
> (or one) seems attractive.  But cpusets doesnt :-)

Ok ... ;).

If you think yet another cpu grouping mechanism is needed, I'm not
the unbiased neutral party to say it's not needed.

However a static grouping does not seem to fit the actual usage
patterns that we see, at least on our (unusually large) Altix
systems.

At least in the usage we see, people run various sized, independent
jobs on a system, using cpusets to define the cpu and memory containers
holding those jobs.  Much of what they do is naturally divided along
those job boundaries, so they want the ability to dynamically size
other resource management and tracking facilities along the same
boundaries.

One job might want to trace a data stream with no data loss, even if
it means slowing the job down.  Another job might want to collect what
it can with limited collecting resources, and let the bits fall where
they will.  A third job might want to increase the data collection
resources sufficiently to collect a lot of data while not slowing the
job down.  One job might have very high fork/exit rates, and another
very low.

If the collectors are grouped along natural job boundaries, there might
not be any need to combine multiple streams, hence no need for the
timestamps you mention.  Cpusets are perhaps the best surrogate for
these boundaries.  Cpusets are hierarchical, so it would be convenient
to have a single collector for a large group of jobs.

It may well be that you find cpusets unattractive for this use for
good reason.  Or perhaps you find them unattractive here just out of
unfamiliarity or misunderstanding.  Before introducing yet another
grouping mechanism, we should have an explanation of why the current
mechanism(s) are unsuitable.  Hopefully an explanation slightly more
elaborate than "doesn't seem attractive" ;).

(Hmmm ... I hope I don't end up regretting asking the question "why
do cpusets suck for this ...?")

-- 
                  I won't rest till it's the best ...
                  Programmer, Linux Scalability
                  Paul Jackson <pj@sgi.com> 1.925.600.0401

^ permalink raw reply	[flat|nested] 134+ messages in thread

* Re: [Patch][RFC] Disabling per-tgid stats on task exit in taskstats
  2006-06-29 20:01                                                             ` Shailabh Nagar
  2006-06-29 21:22                                                               ` Paul Jackson
@ 2006-06-29 22:54                                                               ` jamal
  2006-06-30  0:38                                                                 ` Shailabh Nagar
  1 sibling, 1 reply; 134+ messages in thread
From: jamal @ 2006-06-29 22:54 UTC (permalink / raw)
  To: Shailabh Nagar
  Cc: Andrew Morton, pj, Valdis.Kletnieks, jlan, balbir, csturtiv,
	linux-kernel, netdev

On Thu, 2006-29-06 at 16:01 -0400, Shailabh Nagar wrote:

> 
> Jamal,
> any thoughts on the flow control capabilities of netlink that apply here 
> ? Usage of the connection is to supply statistics data to userspace.
> 

if you want reliable delivery, then you cant just depend on async events
from the kernel -> user - which i am assuming is the way stats get
delivered as processes exit? Sorry, i dont remember the details. You
need some synchronous scheme to ask the kernel to do a "get" or "dump".

Lets be clear about one thing:
The problem really has nothing to do with gen/netlink or any other
scheme you use;->
It has everything to do with reliability implications and the fact
that you need to assume memory is a finite resource - at one point
or another you will run out of memory ;-> And of course then messages
will be lost.  So for gen/netlink, just make sure you have large socket
buffer and you would most likely be fine. 
I havent seen how the numbers were reached: But if you say you receive
14K exits/sec each of which is a 50B message, I would think a 1M socket
buffer would be plenty.

You can find out about lack of memory in netlink when you get a ENOBUFS.
As an example, you should then do a kernel query. Clearly if you do a
query of that sort, you may not want to find obsolete info. Therefore,
as a suggestion, you may want to keep sequence numbers of sorts as
markers. Perhaps keep a 32-bit field which monotonically increases per
process exit or use the pid as the sequence number etc..

As for throttling - Shailabh, I think we talked about this:
- You could maintain info using some thresholds and timer. Then
when a timer expires or threshold is exceeded send to user space.

BTW, where are the doc fixes ? ;->

cheers,
jamal



^ permalink raw reply	[flat|nested] 134+ messages in thread

* Re: [Patch][RFC] Disabling per-tgid stats on task exit in taskstats
  2006-06-29 22:13                                                                 ` Shailabh Nagar
@ 2006-06-29 23:00                                                                   ` jamal
  0 siblings, 0 replies; 134+ messages in thread
From: jamal @ 2006-06-29 23:00 UTC (permalink / raw)
  To: Shailabh Nagar
  Cc: Andrew Morton, pj, Valdis.Kletnieks, jlan, balbir, csturtiv,
	linux-kernel, netdev

On Thu, 2006-29-06 at 18:13 -0400, Shailabh Nagar wrote:

> 
> And now I remember why I didn't go down that path earlier. Relayfs is one-way
> kernel->user and lacks the ability to send query commands from user space
> that we need. Either we would need to send commands up through a separate interface
> (even a syscall) or try and ensure that the existing genetlink interface can
> scale better with message volume (including throttling).

Refer to my other email - whatever it takes to store "bulk" data in the
kernel is subject to the constraint of the fact memory is finite.
You can send messages from the kernel in sizes constrained by the memory
socket size. You can tune the socket size.

cheers,
jamal


^ permalink raw reply	[flat|nested] 134+ messages in thread

* Re: [Patch][RFC] Disabling per-tgid stats on task exit in taskstats
  2006-06-29 22:23                                                                 ` Paul Jackson
@ 2006-06-30  0:15                                                                   ` Shailabh Nagar
  2006-06-30  0:40                                                                     ` Paul Jackson
       [not found]                                                                   ` <44A46C6C.1090405@watson.ibm.com>
  1 sibling, 1 reply; 134+ messages in thread
From: Shailabh Nagar @ 2006-06-30  0:15 UTC (permalink / raw)
  To: Paul Jackson; +Cc: akpm, Valdis.Kletnieks, jlan, balbir, csturtiv, linux-kernel

Paul Jackson wrote:

>
>If the collectors are grouped along natural job boundaries, there might
>not be any need to combine multiple streams, hence no need for the
>timestamps you mention. 
>
Nope...as long as there are users who are using cpusets ONLY as a means 
of reducing sockets
to listen to, timestamps will be needed. Userspace can of course, choose 
to combine or not.




^ permalink raw reply	[flat|nested] 134+ messages in thread

* Re: [Patch][RFC] Disabling per-tgid stats on task exit in taskstats
       [not found]                                                                   ` <44A46C6C.1090405@watson.ibm.com>
@ 2006-06-30  0:38                                                                     ` Paul Jackson
  2006-06-30  2:21                                                                       ` Paul Jackson
  0 siblings, 1 reply; 134+ messages in thread
From: Paul Jackson @ 2006-06-30  0:38 UTC (permalink / raw)
  To: Shailabh Nagar
  Cc: akpm, Valdis.Kletnieks, jlan, balbir, csturtiv, linux-kernel

Shailabh wrote:
> Uh, oh....looks like I've triggered another monologue from PJ :-)

Oh dear, my reputation precedes me.


> The overhead of creating cpusets just for this
> reason seems excessive when the need is only to
> reduce the number of sockets to monitor

What sort of overhead do you have in mind here?

I'm suspecting you mean the mental overhead to the programmer coding
for this, who might complain at having to learn a whole new subsystem
(cpusets) just to say how to group CPUs for collecting these stats.


> Throttling or flow control etc. would be a systemwide policy.

My natural inclination is to disagree with this.  Increasingly
popular large systems running heterogenous loads benefit from
finer granularity policies.

Though I will be honest in acknowledging that I have not studied
these taskstats in detail, so my inclinations may be off the mark.

-- 
                  I won't rest till it's the best ...
                  Programmer, Linux Scalability
                  Paul Jackson <pj@sgi.com> 1.925.600.0401

^ permalink raw reply	[flat|nested] 134+ messages in thread

* Re: [Patch][RFC] Disabling per-tgid stats on task exit in taskstats
  2006-06-29 22:54                                                               ` jamal
@ 2006-06-30  0:38                                                                 ` Shailabh Nagar
  2006-06-30  1:05                                                                   ` Andrew Morton
  0 siblings, 1 reply; 134+ messages in thread
From: Shailabh Nagar @ 2006-06-30  0:38 UTC (permalink / raw)
  To: hadi
  Cc: Andrew Morton, pj, Valdis.Kletnieks, jlan, balbir, csturtiv,
	linux-kernel, netdev

jamal wrote:

>On Thu, 2006-29-06 at 16:01 -0400, Shailabh Nagar wrote:
>
>  
>
>>Jamal,
>>any thoughts on the flow control capabilities of netlink that apply here 
>>? Usage of the connection is to supply statistics data to userspace.
>>
>>    
>>
>
>if you want reliable delivery, then you cant just depend on async events
>from the kernel -> user - which i am assuming is the way stats get
>delivered as processes exit? 
>
Yes.

>Sorry, i dont remember the details. You
>need some synchronous scheme to ask the kernel to do a "get" or "dump".
>  
>
Oh, yes. Dump is synchronous. So it won't be useful unless we buffer 
task exit records within
taskstats.

>Lets be clear about one thing:
>The problem really has nothing to do with gen/netlink or any other
>scheme you use;->
>It has everything to do with reliability implications and the fact
>that you need to assume memory is a finite resource - at one point
>or another you will run out of memory ;-> And of course then messages
>will be lost.  So for gen/netlink, just make sure you have large socket
>buffer and you would most likely be fine. 
>I havent seen how the numbers were reached: But if you say you receive
>14K exits/sec each of which is a 50B message, I would think a 1M socket
>buffer would be plenty.
>  
>
The rates (or upper bounds) that are being discussed here, as of now, 
are 1000 exits/sec/CPU for
1024 CPU systems. That would be roughly 1M exits/sec/system * 
248Bytes/message  = 248 MB/sec.

>You can find out about lack of memory in netlink when you get a ENOBUFS.
>As an example, you should then do a kernel query. Clearly if you do a
>query of that sort, you may not want to find obsolete info. Therefore,
>as a suggestion, you may want to keep sequence numbers of sorts as
>markers. Perhaps keep a 32-bit field which monotonically increases per
>process exit or use the pid as the sequence number etc..
>
>As for throttling - Shailabh, I think we talked about this:
>- You could maintain info using some thresholds and timer. Then
>when a timer expires or threshold is exceeded send to user space.
>  
>
Hmm. So we could buffer the per-task exit data within taskstats (the mem 
consumption would grow
but thats probably not a problem) and then send it out later.

Jay - would not getting exit data soon after exit be a problem for CSA ? 
I'm guessing not, if the
timeout is kept small enough. Internally, taskstats could always pace 
its sends so that "too much"
isn't sent out at one shot.

--Shailabh



^ permalink raw reply	[flat|nested] 134+ messages in thread

* Re: [Patch][RFC] Disabling per-tgid stats on task exit in taskstats
  2006-06-30  0:15                                                                   ` Shailabh Nagar
@ 2006-06-30  0:40                                                                     ` Paul Jackson
  2006-06-30  1:00                                                                       ` Shailabh Nagar
  0 siblings, 1 reply; 134+ messages in thread
From: Paul Jackson @ 2006-06-30  0:40 UTC (permalink / raw)
  To: Shailabh Nagar
  Cc: akpm, Valdis.Kletnieks, jlan, balbir, csturtiv, linux-kernel

Shailabh wrote:
> Nope...as long as there are users who are using cpusets ONLY as a means 
> of reducing sockets to listen to, timestamps will be needed.

Could you take one more stab at explaining this.

It made no sense to me this time around.  Sorry.

-- 
                  I won't rest till it's the best ...
                  Programmer, Linux Scalability
                  Paul Jackson <pj@sgi.com> 1.925.600.0401

^ permalink raw reply	[flat|nested] 134+ messages in thread

* Re: [Patch][RFC] Disabling per-tgid stats on task exit in taskstats
  2006-06-30  0:40                                                                     ` Paul Jackson
@ 2006-06-30  1:00                                                                       ` Shailabh Nagar
  2006-06-30  1:05                                                                         ` Paul Jackson
  0 siblings, 1 reply; 134+ messages in thread
From: Shailabh Nagar @ 2006-06-30  1:00 UTC (permalink / raw)
  To: Paul Jackson; +Cc: akpm, Valdis.Kletnieks, jlan, balbir, csturtiv, linux-kernel

Paul Jackson wrote:

>Shailabh wrote:
>  
>
>>Nope...as long as there are users who are using cpusets ONLY as a means 
>>of reducing sockets to listen to, timestamps will be needed.
>>    
>>
>
>Could you take one more stab at explaining this.
>
>It made no sense to me this time around.  Sorry.
>
>  
>

In the current taskstats interface, there is only a single stream of 
taskstats structures coming out
from the kernel. There is some ordering there. Lets say this ordering 
info is of some relevance to a
consumer of taskstats (very big and possibly faulty assumption there !)

Now we move to a design where the kernel is sending the same data out in 
multiple streams.
If the consumer wants to reconstruct the ordering she would have got 
under the current scheme (even
approximately),  she would need to know how to merge sort these streams 
and for that she would
need timestamp data on each of the taskstats structs.

Assumption is a bit of a stretch admittedly. But since timestamping 
costs so little, might as well put one
in (will also help CSA do one less thing)

--Shailabh



^ permalink raw reply	[flat|nested] 134+ messages in thread

* Re: [Patch][RFC] Disabling per-tgid stats on task exit in taskstats
  2006-06-30  0:38                                                                 ` Shailabh Nagar
@ 2006-06-30  1:05                                                                   ` Andrew Morton
  2006-06-30  1:11                                                                     ` Shailabh Nagar
  2006-06-30  2:25                                                                     ` Paul Jackson
  0 siblings, 2 replies; 134+ messages in thread
From: Andrew Morton @ 2006-06-30  1:05 UTC (permalink / raw)
  To: Shailabh Nagar
  Cc: hadi, pj, Valdis.Kletnieks, jlan, balbir, csturtiv, linux-kernel, netdev

Shailabh Nagar <nagar@watson.ibm.com> wrote:
>
> The rates (or upper bounds) that are being discussed here, as of now, 
> are 1000 exits/sec/CPU for
> 1024 CPU systems. That would be roughly 1M exits/system * 
> 248Bytes/message  = 248 MB/sec.

I think it's worth differentiating between burst rates and sustained rates
here.

One could easily imagine 10,000 threads all exiting at once, and the user
being interested in reliably collecting the results.

But if the machine is _sustaining_ such a high rate then that means that
these exiting tasks all have a teeny runtime and the user isn't going to be
interested in the per-thread statistics.

So if we can detect the silly sustained-high-exit-rate scenario then it
seems to me quite legitimate to do some aggressive data reduction on that. 
Like, a single message which says "20,000 sub-millisecond-runtime tasks
exited in the past second" or something.


^ permalink raw reply	[flat|nested] 134+ messages in thread

* Re: [Patch][RFC] Disabling per-tgid stats on task exit in taskstats
  2006-06-30  1:00                                                                       ` Shailabh Nagar
@ 2006-06-30  1:05                                                                         ` Paul Jackson
  0 siblings, 0 replies; 134+ messages in thread
From: Paul Jackson @ 2006-06-30  1:05 UTC (permalink / raw)
  To: Shailabh Nagar
  Cc: akpm, Valdis.Kletnieks, jlan, balbir, csturtiv, linux-kernel

Shailabh wrote:
> Now we move to a design where the kernel is sending the same data out in 
> multiple streams.

Ah - so it's simply the multiple streams versus single stream that
motivates time stamps.

Nothing much to do with whether someone is ONLY using cpusets to
define the streams .. or even using cpusets at all to define them.

Ok.  Thanks.

-- 
                  I won't rest till it's the best ...
                  Programmer, Linux Scalability
                  Paul Jackson <pj@sgi.com> 1.925.600.0401

^ permalink raw reply	[flat|nested] 134+ messages in thread

* Re: [Patch][RFC] Disabling per-tgid stats on task exit in taskstats
  2006-06-30  1:05                                                                   ` Andrew Morton
@ 2006-06-30  1:11                                                                     ` Shailabh Nagar
  2006-06-30  1:30                                                                       ` jamal
  2006-06-30  2:25                                                                     ` Paul Jackson
  1 sibling, 1 reply; 134+ messages in thread
From: Shailabh Nagar @ 2006-06-30  1:11 UTC (permalink / raw)
  To: Andrew Morton
  Cc: hadi, pj, Valdis.Kletnieks, jlan, balbir, csturtiv, linux-kernel, netdev

Andrew Morton wrote:

>Shailabh Nagar <nagar@watson.ibm.com> wrote:
>  
>
>>The rates (or upper bounds) that are being discussed here, as of now, 
>>are 1000 exits/sec/CPU for
>>1024 CPU systems. That would be roughly 1M exits/system * 
>>248Bytes/message  = 248 MB/sec.
>>    
>>
>
>I think it's worth differentiating between burst rates and sustained rates
>here.
>
>One could easily imagine 10,000 threads all exiting at once, and the user
>being interested in reliably collecting the results.
>
>But if the machine is _sustaining_ such a high rate then that means that
>these exiting tasks all have a teeny runtime and the user isn't going to be
>interested in the per-thread statistics.
>
>So if we can detect the silly sustained-high-exit-rate scenario then it
>seems to me quite legitimate to do some aggressive data reduction on that. 
>Like, a single message which says "20,000 sub-millisecond-runtime tasks
>exited in the past second" or something.
>  
>
The "buffering within taskstats" might be a way out then.
As long as the user is willing to pay the price in terms of memory, we 
can collect the exiting task's
taskstats data but not send it immediately (taskstats_cache would grow) 
unless a high water mark had
been crossed. Otherwise a timer event would do the sends of accumulated 
taskstats (not all at once but
iteratively if necessary).

At task exit, despite doing a few rounds of sending of pending data, if 
netlink were still reporting errors
then it would be a sign of unsustainable rate and the pending queue 
could be dropped and a message
like you suggest could be sent.

Thoughts ?


--Shailabh


^ permalink raw reply	[flat|nested] 134+ messages in thread

* Re: [Patch][RFC] Disabling per-tgid stats on task exit in taskstats
  2006-06-30  1:11                                                                     ` Shailabh Nagar
@ 2006-06-30  1:30                                                                       ` jamal
  2006-06-30  3:01                                                                         ` Shailabh Nagar
  0 siblings, 1 reply; 134+ messages in thread
From: jamal @ 2006-06-30  1:30 UTC (permalink / raw)
  To: Shailabh Nagar
  Cc: netdev, linux-kernel, csturtiv, balbir, jlan, Valdis.Kletnieks,
	pj, Andrew Morton

On Thu, 2006-29-06 at 21:11 -0400, Shailabh Nagar wrote:
> Andrew Morton wrote:
> 
> >Shailabh Nagar <nagar@watson.ibm.com> wrote:
[..]
> >So if we can detect the silly sustained-high-exit-rate scenario then it
> >seems to me quite legitimate to do some aggressive data reduction on that. 
> >Like, a single message which says "20,000 sub-millisecond-runtime tasks
> >exited in the past second" or something.
> >  
> >
> The "buffering within taskstats" might be a way out then.

Thats what it looks like.

> As long as the user is willing to pay the price in terms of memory,

You may wanna draw a line to the upper limit - maybe even allocate slab
space.

>  we can collect the exiting task's taskstats data but not send it 
> immediately (taskstats_cache would grow) 
> unless a high water mark had been crossed. Otherwise a timer event would do the 
> sends of accumulated  taskstats (not all at once but
> iteratively if necessary).
> 

Sounds reasonable. Thats what xfrm events do. Try to have those
parameters settable because different machines or users may have
different view as to what is proper - maybe even as simple as sysctl.

> At task exit, despite doing a few rounds of sending of pending data, if 
> netlink were still reporting errors
> then it would be a sign of unsustainable rate and the pending queue 
> could be dropped and a message like you suggest could be sent.
> 

When you send inside the kernel - you will get an error if there's
problems sending to the socket queue. So you may wanna use that info
to release the kernel allocated entries or keep them for a little
longer.

Hopefully that helps.

cheers,
jamal
 


^ permalink raw reply	[flat|nested] 134+ messages in thread

* Re: [Patch][RFC] Disabling per-tgid stats on task exit in taskstats
  2006-06-30  0:38                                                                     ` Paul Jackson
@ 2006-06-30  2:21                                                                       ` Paul Jackson
  2006-06-30  2:46                                                                         ` Shailabh Nagar
  0 siblings, 1 reply; 134+ messages in thread
From: Paul Jackson @ 2006-06-30  2:21 UTC (permalink / raw)
  To: Paul Jackson
  Cc: nagar, akpm, Valdis.Kletnieks, jlan, balbir, csturtiv, linux-kernel

Shailabh wrote:
> The overhead of creating cpusets just for this
> reason seems excessive when the need is only to
> reduce the number of sockets to monitor

As I reread this thread, some of my ancient interactions with process
accounting come to mind again.

K.I.S.S. - keep it simple, I'm telling myself.

I'm also thinking that since this is a system wide stat tool, it
wants to minimize interactions with other mechanisms.

Hog tying cpusets and process accounting together seems just
plain weird, and risks imposing conflicting demands on the cpuset
configuration of a system.

Please be so kind as to forget I suggested that ;).


How about a simple way to disable collection on specified CPUs.

Collecting this sort of data makes sense for certain managed system
situations, where one chooses to spend some portion of the system
tracking the rest of it.

Collecting it may put an intolerable performance impact on pedal to
the metal maximum performance beasts running on dedicated cpus/nodes.


I propose a per-cpu boolean flag to disable collection.

If this flag is set on the cpu on which a task happens to be when
exiting, then we just drop that data on the floor, silently, with no
accumulation, as quickly as we can, avoiding any system-wide locks.

Then I could run a managed job mix, collecting accounting data, on
some nodes, while running dedicated performance beasts on other nodes,
without the accounting interfering with the performance beasts.

Independently, the cpuset friendly customers could make use of cpusets
to help manage which jobs were on which cpus, so that they collected
their accounting data as desired.  But no need for the accounting
system to be aware of that, past the state of its per-cpu flag.

Such a flag reduces the need for further (over) designing this to
handle the extreme case.  If someone has such an extreme case, they
can turn off collecting on some cpus, to get a handle on the situation.

This could be done as a variant of your idea for multiple
TASKSTATS_LISTEN_GROUP's.  Essentially, for now, we would have two
GROUP's - one that drops the data on the floor, and one that collects
it.  Each cpu is in either one or the other group.  Later on, when the
need arises, we add support for more GROUP's that can actually collect
data.

-- 
                  I won't rest till it's the best ...
                  Programmer, Linux Scalability
                  Paul Jackson <pj@sgi.com> 1.925.600.0401

^ permalink raw reply	[flat|nested] 134+ messages in thread

* Re: [Patch][RFC] Disabling per-tgid stats on task exit in taskstats
  2006-06-30  1:05                                                                   ` Andrew Morton
  2006-06-30  1:11                                                                     ` Shailabh Nagar
@ 2006-06-30  2:25                                                                     ` Paul Jackson
  2006-06-30  2:35                                                                       ` Andrew Morton
  1 sibling, 1 reply; 134+ messages in thread
From: Paul Jackson @ 2006-06-30  2:25 UTC (permalink / raw)
  To: Andrew Morton
  Cc: nagar, hadi, Valdis.Kletnieks, jlan, balbir, csturtiv,
	linux-kernel, netdev

Andrew wrote:
> Like, a single message which says "20,000 sub-millisecond-runtime tasks
> exited in the past second" or something.

System wide accumulation of such data in the exit() code path still
risks being a bottleneck, just a bit later on.

I'm more inclined now to look for ways to disable collection on some
CPUs, and/or to allow for multiple streams in the future, as need be,
along the lines of Shailabh's multiple TASKSTATS_LISTEN_GROUPs.

-- 
                  I won't rest till it's the best ...
                  Programmer, Linux Scalability
                  Paul Jackson <pj@sgi.com> 1.925.600.0401

^ permalink raw reply	[flat|nested] 134+ messages in thread

* Re: [Patch][RFC] Disabling per-tgid stats on task exit in taskstats
  2006-06-30  2:25                                                                     ` Paul Jackson
@ 2006-06-30  2:35                                                                       ` Andrew Morton
  2006-06-30  2:43                                                                         ` Paul Jackson
  0 siblings, 1 reply; 134+ messages in thread
From: Andrew Morton @ 2006-06-30  2:35 UTC (permalink / raw)
  To: Paul Jackson
  Cc: nagar, hadi, Valdis.Kletnieks, jlan, balbir, csturtiv,
	linux-kernel, netdev

On Thu, 29 Jun 2006 19:25:26 -0700
Paul Jackson <pj@sgi.com> wrote:

> Andrew wrote:
> > Like, a single message which says "20,000 sub-millisecond-runtime tasks
> > exited in the past second" or something.
> 
> System wide accumulation of such data in the exit() code path still
> risks being a bottleneck, just a bit later on.

Nah.  Stick it in the same cacheline as tasklist_lock (I'm amazed that
we've continued to get away with a global lock for that).


^ permalink raw reply	[flat|nested] 134+ messages in thread

* Re: [Patch][RFC] Disabling per-tgid stats on task exit in taskstats
  2006-06-30  2:35                                                                       ` Andrew Morton
@ 2006-06-30  2:43                                                                         ` Paul Jackson
  0 siblings, 0 replies; 134+ messages in thread
From: Paul Jackson @ 2006-06-30  2:43 UTC (permalink / raw)
  To: Andrew Morton
  Cc: nagar, hadi, Valdis.Kletnieks, jlan, balbir, csturtiv,
	linux-kernel, netdev

Andrew wrote:
> Nah.  Stick it in the same cacheline as tasklist_lock (I'm amazed that
> we've continued to get away with a global lock for that).

Yes - a bit amazing.  But no sense compounding the problem now.

We shouldn't be adding global locks/modifiable data in the
fork/exit code path if we can help it, without at least
providing some simple way to ameliorate the problem when
folks do start hitting it.

-- 
                  I won't rest till it's the best ...
                  Programmer, Linux Scalability
                  Paul Jackson <pj@sgi.com> 1.925.600.0401

^ permalink raw reply	[flat|nested] 134+ messages in thread

* Re: [Patch][RFC] Disabling per-tgid stats on task exit in taskstats
  2006-06-30  2:21                                                                       ` Paul Jackson
@ 2006-06-30  2:46                                                                         ` Shailabh Nagar
  2006-06-30  2:54                                                                           ` Paul Jackson
  2006-06-30  3:02                                                                           ` Paul Jackson
  0 siblings, 2 replies; 134+ messages in thread
From: Shailabh Nagar @ 2006-06-30  2:46 UTC (permalink / raw)
  To: Paul Jackson; +Cc: akpm, Valdis.Kletnieks, jlan, balbir, csturtiv, linux-kernel

Paul Jackson wrote:

>Shailabh wrote:
>  
>
>>The overhead of creating cpusets just for this
>>reason seems excessive when the need is only to
>>reduce the number of sockets to monitor
>>    
>>
>
>As I reread this thread, some of my ancient interactions with process
>accounting come to mind again.
>
>K.I.S.S. - keep it simple, I'm telling myself.
>
>I'm also thinking that since this is a system wide stat tool, it
>wants to minimize interactions with other mechanisms.
>
>Hog tying cpusets and process accounting together seems just
>plain weird, and risks imposing conflicting demands on the cpuset
>configuration of a system.
>
>Please be so kind as to forget I suggested that ;).
>  
>
What suggestion are you talking about  :-)

>
>How about a simple way to disable collection on specified CPUs.
>  
>

>Collecting this sort of data makes sense for certain managed system
>situations, where one chooses to spend some portion of the system
>tracking the rest of it.
>
>Collecting it may put an intolerable performance impact on pedal to
>the metal maximum performance beasts running on dedicated cpus/nodes.
>
>
>I propose a per-cpu boolean flag to disable collection.
>
>If this flag is set on the cpu on which a task happens to be when
>exiting, then we just drop that data on the floor, silently, with no
>accumulation, as quickly as we can, avoiding any system-wide locks.
>
>Then I could run a managed job mix, collecting accounting data, on
>some nodes, while running dedicated performance beasts on other nodes,
>without the accounting interfering with the performance beasts.
>  
>
Doing enablement/disablement on a per-CPU basis seems to fit the cpuset 
framework where
jobs are closely tied to CPUs.

Otherwise, from a generic taskstats perspective, having the CPU of exit 
determine the output
of exit related data seems a bit arbitrary.

>Independently, the cpuset friendly customers could make use of cpusets
>to help manage which jobs were on which cpus, so that they collected
>their accounting data as desired.  But no need for the accounting
>system to be aware of that, past the state of its per-cpu flag.
>
>Such a flag reduces the need for further (over) designing this to
>handle the extreme case.  
>

>If someone has such an extreme case, they
>can turn off collecting on some cpus, to get a handle on the situation.
>  
>
Hmm ? Again a very cpuset'ish solution where turning off collection on a 
set of cpus will mean only
a known set of tasks (aggregated under a job) get affected. In general, 
this seems like a terrible
way of doing flow control.....just pick some tasks and shut their data 
output out (admittedly that's
what we're doing today when data gets dropped on overflow but I guess 
the aim here is to do
better)

>This could be done as a variant of your idea for multiple
>TASKSTATS_LISTEN_GROUP's.  Essentially, for now, we would have two
>GROUP's - one that drops the data on the floor, and one that collects
>it.  Each cpu is in either one or the other group.  Later on, when the
>need arises, we add support for more GROUP's that can actually collect
>data.
>  
>
Sorry...don't like this proposal much but others may differ.

--Shailabh


^ permalink raw reply	[flat|nested] 134+ messages in thread

* Re: [Patch][RFC] Disabling per-tgid stats on task exit in taskstats
  2006-06-30  2:46                                                                         ` Shailabh Nagar
@ 2006-06-30  2:54                                                                           ` Paul Jackson
  2006-06-30  3:02                                                                           ` Paul Jackson
  1 sibling, 0 replies; 134+ messages in thread
From: Paul Jackson @ 2006-06-30  2:54 UTC (permalink / raw)
  To: Shailabh Nagar
  Cc: akpm, Valdis.Kletnieks, jlan, balbir, csturtiv, linux-kernel

Shailabh wrote:
> Otherwise, from a generic taskstats perspective, having the CPU of exit 
> determine the output of exit related data seems a bit arbitrary.

On systems that do manage CPU placement, it would be worth quite
a bit to be able to disable taskstat collection on certain CPUs.

On systems that don't manage CPU placement, just use the multiple
GROUP's (just one group, for this now) you proposed.

This seems like a trivial extension of your multiple GROUP's idea
that would be harmless for some, and valuable for others.

-- 
                  I won't rest till it's the best ...
                  Programmer, Linux Scalability
                  Paul Jackson <pj@sgi.com> 1.925.600.0401

^ permalink raw reply	[flat|nested] 134+ messages in thread

* Re: [Patch][RFC] Disabling per-tgid stats on task exit in taskstats
  2006-06-30  1:30                                                                       ` jamal
@ 2006-06-30  3:01                                                                         ` Shailabh Nagar
  2006-06-30 12:45                                                                           ` jamal
  0 siblings, 1 reply; 134+ messages in thread
From: Shailabh Nagar @ 2006-06-30  3:01 UTC (permalink / raw)
  To: hadi
  Cc: netdev, linux-kernel, csturtiv, balbir, jlan, Valdis.Kletnieks,
	pj, Andrew Morton

jamal wrote:

>On Thu, 2006-29-06 at 21:11 -0400, Shailabh Nagar wrote:
>  
>
>>Andrew Morton wrote:
>>
>>    
>>
>>>Shailabh Nagar <nagar@watson.ibm.com> wrote:
>>>      
>>>
>[..]
>  
>
>>>So if we can detect the silly sustained-high-exit-rate scenario then it
>>>seems to me quite legitimate to do some aggressive data reduction on that. 
>>>Like, a single message which says "20,000 sub-millisecond-runtime tasks
>>>exited in the past second" or something.
>>> 
>>>
>>>      
>>>
>>The "buffering within taskstats" might be a way out then.
>>    
>>
>
>Thats what it looks like.
>
>  
>
>>As long as the user is willing to pay the price in terms of memory,
>>    
>>
>
>You may wanna draw a line to the upper limit - maybe even allocate slab
>space.
>  
>
Didn't quite understand...could you please elaborate ?
Today we have a slab cache from which the taskstats structure gets 
allocated at the beginning
of the exit() path.
The upper limit to which you refer is the amount of slab memory the user 
is willing to be used
to store the bursty traffic ?


>> we can collect the exiting task's taskstats data but not send it 
>>immediately (taskstats_cache would grow) 
>>unless a high water mark had been crossed. Otherwise a timer event would do the 
>>sends of accumulated taskstats (not all at once but
>>iteratively if necessary).
>>
>>    
>>
>
>Sounds reasonable. Thats what xfrm events do. 
>

>Try to have those
>parameters settable because different machines or users may have
>different view as to what is proper - maybe even as simple as sysctl.
>  
>
Sounds good.

>  
>
>>At task exit, despite doing a few rounds of sending of pending data, if 
>>netlink were still reporting errors
>>then it would be a sign of unsustainable rate and the pending queue 
>>could be dropped and a message like you suggest could be sent.
>>
>>    
>>
>
>When you send inside the kernel - you will get an error if there's
>problems sending to the socket queue. So you may wanna use that info
>to release the kernel allocated entries or keep them for a little
>longer.
>
>Hopefully that helps.
>  
>
Yes it does. Thanks for the tips.

Will code up something and send out so this can become more concrete.

--Shailabh

>cheers,
>jamal
> 
>
>  
>


^ permalink raw reply	[flat|nested] 134+ messages in thread

* Re: [Patch][RFC] Disabling per-tgid stats on task exit in taskstats
  2006-06-30  2:46                                                                         ` Shailabh Nagar
  2006-06-30  2:54                                                                           ` Paul Jackson
@ 2006-06-30  3:02                                                                           ` Paul Jackson
  1 sibling, 0 replies; 134+ messages in thread
From: Paul Jackson @ 2006-06-30  3:02 UTC (permalink / raw)
  To: Shailabh Nagar
  Cc: akpm, Valdis.Kletnieks, jlan, balbir, csturtiv, linux-kernel

Shailabh wrote:
> just pick some tasks and shut their data output out 

I missed this on first read.  Shutting down per-task (if it was a
property inherited on fork, and if it effectively eliminated the impact
on such tasks) would be equivalent to per CPU, for the configurations I
care about.

And you're right - per-task makes more sense than per-cpu.

-- 
                  I won't rest till it's the best ...
                  Programmer, Linux Scalability
                  Paul Jackson <pj@sgi.com> 1.925.600.0401

^ permalink raw reply	[flat|nested] 134+ messages in thread

* Re: [Patch][RFC] Disabling per-tgid stats on task exit in taskstats
  2006-06-30  3:01                                                                         ` Shailabh Nagar
@ 2006-06-30 12:45                                                                           ` jamal
  0 siblings, 0 replies; 134+ messages in thread
From: jamal @ 2006-06-30 12:45 UTC (permalink / raw)
  To: Shailabh Nagar
  Cc: netdev, linux-kernel, csturtiv, balbir, jlan, Valdis.Kletnieks,
	pj, Andrew Morton

On Thu, 2006-29-06 at 23:01 -0400, Shailabh Nagar wrote:
> jamal wrote:

> >  
> >
> >>As long as the user is willing to pay the price in terms of memory,
> >>    
> >>
> >
> >You may wanna draw a line to the upper limit - maybe even allocate slab
> >space.
> >  
> >
> Didn't quite understand...could you please elaborate ?
> Today we have a slab cache from which the taskstats structure gets 
> allocated at the beginning
> of the exit() path.
> The upper limit to which you refer is the amount of slab memory the user 
> is willing to be used
> to store the bursty traffic ?
> 

I think you have it fine already if you have a slab - as long as you
know you will run out of space and have some strategy to deal with
such boundary conditions. I was only reacting to your statement
"As long as the user is willing to pay the price in terms of memory"
I think you meant that a user could adjust the slab size on bootup etc,
but it is finite in size.

cheers,
jamal


^ permalink raw reply	[flat|nested] 134+ messages in thread

* Re: [Patch][RFC] Disabling per-tgid stats on task exit in taskstats
  2006-06-29 18:01                                                       ` Andrew Morton
                                                                           ` (3 preceding siblings ...)
  2006-06-29 19:33                                                         ` Jay Lan
@ 2006-06-30 18:53                                                         ` Shailabh Nagar
  2006-06-30 19:10                                                           ` Shailabh Nagar
  2006-06-30 22:56                                                           ` Andrew Morton
  4 siblings, 2 replies; 134+ messages in thread
From: Shailabh Nagar @ 2006-06-30 18:53 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Paul Jackson, Valdis.Kletnieks, jlan, balbir, csturtiv,
	linux-kernel, Jamal, netdev

Andrew Morton wrote:
> On Thu, 29 Jun 2006 09:44:08 -0700
> Paul Jackson <pj@sgi.com> wrote:
>
>
>>>You're probably correct on that model. However, it all depends on the actual
>>>workload. Are people who actually have large-CPU (>256) systems actually
>>>running fork()-heavy things like webservers on them, or are they running things
>>>like database servers and computations, which tend to have persistent
>>>processes?
>>
>>It may well be mostly as you say - the large-CPU systems not running
>>the fork() heavy jobs.
>>
>>Sooner or later, someone will want to run a fork()-heavy job on a
>>large-CPU system.  On a 1024 CPU system, it would apparently take
>>just 14 exits/sec/CPU to hit this bottleneck, if Jay's number of
>>14000 applied.
>>
>>Chris Sturtivant's reply is reasonable -- we'll hit it sooner or later,
>>and deal with it then.
>>
>
>
> I agree, and I'm viewing this as blocking the taskstats merge.  Because if
> this _is_ a problem then it's a big one because fixing it will be
> intrusive, and might well involve userspace-visible changes.
>
> The only ways I can see of fixing the problem generally are to either
>
> a) throw more CPU(s) at stats collection: allow userspace to register for
>    "stats generated by CPU N", then run a stats collection daemon on each
>    CPU or
>
> b) make the kernel recognise when it's getting overloaded and switch to
>    some degraded mode where it stops trying to send all the data to
>    userspace - just send a summary, or a "we goofed" message or something.



Andrew,

Based on previous discussions, the above solutions can be expanded/modified to:

a) allow userspace to listen to a group of cpus instead of all. Multiple
collection daemons can distribute the load as you pointed out. Doing collection
by cpu groups rather than individual cpus reduces the aggregation burden on
userspace (and scales better with NR_CPUS)

b) do flow control on the kernel send side. This can involve buffering and sending
later (to handle bursty case) or dropping (to handle sustained load) as pointed out
by you, Jamal in other threads.

c) increase receiver's socket buffer. This can and should always be done but no
involvement needed.


With regards to taskstats changes to handle the problem and its impact on userspace
visible changes,

a) will change userspace
b) will be transparent.
c) is immaterial going forward (except perhaps as a change in Documentation)


I'm sending a patch that demonstrates how a) can be done quite simply
and a patch for b) is in progress.

If the approach suggested in patch a) is acceptable (and I'll provide the testing, stability
results once comments on it are largely over), could taskstats acceptance in 2.6.18 go ahead
and patch b) be added later? (A solution outline has already been provided and a prelim patch
should be out by eod.)

--Shailabh




^ permalink raw reply	[flat|nested] 134+ messages in thread

* Re: [Patch][RFC] Disabling per-tgid stats on task exit in taskstats
  2006-06-30 18:53                                                         ` Shailabh Nagar
@ 2006-06-30 19:10                                                           ` Shailabh Nagar
  2006-06-30 19:19                                                             ` Shailabh Nagar
                                                                               ` (2 more replies)
  2006-06-30 22:56                                                           ` Andrew Morton
  1 sibling, 3 replies; 134+ messages in thread
From: Shailabh Nagar @ 2006-06-30 19:10 UTC (permalink / raw)
  To: Shailabh Nagar
  Cc: Andrew Morton, Paul Jackson, Valdis.Kletnieks, jlan, balbir,
	csturtiv, linux-kernel, Jamal, netdev

Shailabh Nagar wrote:

> Andrew,
>
> Based on previous discussions, the above solutions can be expanded/modified to:
>
> a) allow userspace to listen to a group of cpus instead of all. Multiple
> collection daemons can distribute the load as you pointed out. Doing collection
> by cpu groups rather than individual cpus reduces the aggregation burden on
> userspace (and scales better with NR_CPUS)


> I'm sending a patch that demonstrates how a) can be done quite simply
> and a patch for b) is in progress.
>

Here's the patch.
Testing etc. need to be done (an earlier version that did per-cpu queues
has worked) but the main point is to show how small a change is needed
in the interface (on both the kernel and user side)
and current codebase to achieve the a) solution.

Also to get feedback on this kind of usage of the nl_pid field, the
approach etc.

Thanks,
Shailabh






=======================================================================


On systems with a large number of cpus, with even a modest rate of tasks
exiting per cpu, the volume of taskstats data sent on thread exit can overflow
a userspace listener's buffers.

One approach to avoiding overflow is to allow listeners to get data for a
limited number of cpus. By scaling the number of listening programs, each
listening to a different set of cpus, userspace can avoid more overflow
situations.

This patch implements this idea by creating a simple grouping of cpus and
allowing userspace to listen to any cpu group it chooses.

Alternative designs considered and rejected were:

- creating a separate netlink group for each group of cpus. Since only 32
netlink groups can be specified by a user, this option will not scale with
number of cpus.

- aligning the grouping of cpus with cpusets. The unnecessary tying together
of the two functionalities was not merited.

Thanks to Balbir Singh for discovering the potential use of the pid field of
sockaddr_nl as a communication subchannel in the same socket, Paul Jackson and
Vivek Kashyap for suggesting cpus be grouped together for data send purposes.

Signed-Off-By: Shailabh Nagar <nagar@watson.ibm.com>
Signed-Off-By: Balbir Singh <balbir@in.ibm.com>

 Documentation/accounting/getdelays.c |   30 +++++++++++++++++++-----------
 include/linux/taskstats.h            |   22 ++++++++++++++++++++++
 kernel/taskstats.c                   |    5 +++--
 3 files changed, 44 insertions(+), 13 deletions(-)

Index: linux-2.6.17-mm3equiv/include/linux/taskstats.h
===================================================================
--- linux-2.6.17-mm3equiv.orig/include/linux/taskstats.h	2006-06-30 11:57:14.000000000 -0400
+++ linux-2.6.17-mm3equiv/include/linux/taskstats.h	2006-06-30 14:24:49.000000000 -0400
@@ -89,6 +89,28 @@ struct taskstats {

 #define TASKSTATS_LISTEN_GROUP	0x1

+
+/*
+ * Per-task exit data sent from the kernel to user space
+ * is tagged by an id based on grouping of cpus.
+ *
+ * If userspace specifies a non-zero P as the nl_pid field of
+ * the sockaddr_nl structure while binding to a netlink socket,
+ * it will receive exit data from threads that exited on cpus in the range
+ *
+ *    [(P-1)*Y, P*Y-1]
+ *
+ *  where Y = TASKSTATS_CPUS_PER_SET
+ *  i.e. if TASKSTATS_CPUS_PER_SET is 16,
+ *  to listen to data from cpus 0..15, specify P=1
+ *  for cpus 16..31, specify P=2 etc.
+ *
+ *  To listen to data from all cpus, userspace should use P=0
+ */
+
+#define TASKSTATS_CPUS_PER_SET	16
+
+
 /*
  * Commands sent from userspace
  * Not versioned. New commands should only be inserted at the enum's end
Index: linux-2.6.17-mm3equiv/kernel/taskstats.c
===================================================================
--- linux-2.6.17-mm3equiv.orig/kernel/taskstats.c	2006-06-30 11:57:14.000000000 -0400
+++ linux-2.6.17-mm3equiv/kernel/taskstats.c	2006-06-30 13:58:36.000000000 -0400
@@ -266,7 +266,7 @@ void taskstats_exit_send(struct task_str
 	struct sk_buff *rep_skb;
 	void *reply;
 	size_t size;
-	int is_thread_group;
+	int is_thread_group, setid;
 	struct nlattr *na;

 	if (!family_registered || !tidstats)
@@ -320,7 +320,8 @@ void taskstats_exit_send(struct task_str
 	nla_nest_end(rep_skb, na);

 send:
-	send_reply(rep_skb, 0, TASKSTATS_MSG_MULTICAST);
+	setid = (smp_processor_id()%TASKSTATS_CPUS_PER_SET)+1;
+	send_reply(rep_skb, setid, TASKSTATS_MSG_MULTICAST);
 	return;

 nla_put_failure:
Index: linux-2.6.17-mm3equiv/Documentation/accounting/getdelays.c
===================================================================
--- linux-2.6.17-mm3equiv.orig/Documentation/accounting/getdelays.c	2006-06-28 16:08:56.000000000 -0400
+++ linux-2.6.17-mm3equiv/Documentation/accounting/getdelays.c	2006-06-30 14:09:28.000000000 -0400
@@ -40,7 +40,7 @@ int done = 0;
 /*
  * Create a raw netlink socket and bind
  */
-static int create_nl_socket(int protocol, int groups)
+static int create_nl_socket(int protocol, int cpugroup)
 {
     socklen_t addr_len;
     int fd;
@@ -52,7 +52,8 @@ static int create_nl_socket(int protocol

     memset(&local, 0, sizeof(local));
     local.nl_family = AF_NETLINK;
-    local.nl_groups = groups;
+    local.nl_groups = TASKSTATS_LISTEN_GROUP;
+    local.nl_pid = cpugroup;

     if (bind(fd, (struct sockaddr *) &local, sizeof(local)) < 0)
 	goto error;
@@ -203,7 +204,7 @@ int main(int argc, char *argv[])
     pid_t rtid = 0;
     int cmd_type = TASKSTATS_TYPE_TGID;
     int c, status;
-    int forking = 0;
+    int forking = 0, cpugroup = 0;
     struct sigaction act = {
 	.sa_handler = SIG_IGN,
 	.sa_mask = SA_NOMASK,
@@ -222,7 +223,7 @@ int main(int argc, char *argv[])

     while (1) {

-	c = getopt(argc, argv, "t:p:c:");
+	c = getopt(argc, argv, "t:p:c:g:l");
 	if (c < 0)
 	    break;

@@ -252,8 +253,14 @@ int main(int argc, char *argv[])
 	    }
 	    forking = 1;
 	    break;
+	case 'g':
+		cpugroup = atoi(optarg);
+		break;
+	case 'l':
+		loop = 1;
+		break;
 	default:
-	    printf("usage %s [-t tgid][-p pid][-c cmd]\n", argv[0]);
+	    printf("usage %s [-t tgid][-p pid][-c cmd][-g cpugroup][-l]\n", argv[0]);
 	    exit(-1);
 	    break;
 	}
@@ -266,7 +273,7 @@ int main(int argc, char *argv[])
     /* Send Netlink request message & get reply */

     if ((nl_sd =
-	 create_nl_socket(NETLINK_GENERIC, TASKSTATS_LISTEN_GROUP)) < 0)
+	 create_nl_socket(NETLINK_GENERIC, cpugroup)) < 0)
 	err(1, "error creating Netlink socket\n");


@@ -287,10 +294,10 @@ int main(int argc, char *argv[])


     if (!forking && sendto_fd(nl_sd, (char *) &req, req.n.nlmsg_len) < 0)
+    if ((!forking && !loop) &&
+	sendto_fd(nl_sd, (char *) &req, req.n.nlmsg_len) < 0)
 	err(1, "error sending message via Netlink\n");

-    act.sa_handler = SIG_IGN;
-    sigemptyset(&act.sa_mask);
     if (sigaction(SIGINT, &act, NULL) < 0)
 	err(1, "sigaction failed for SIGINT\n");

@@ -349,10 +356,11 @@ int main(int argc, char *argv[])
 			rtid = *(int *) NLA_DATA(na);
 			break;
 		    case TASKSTATS_TYPE_STATS:
-			if (rtid == tid) {
+			if (rtid == tid || loop) {
 			    print_taskstats((struct taskstats *)
 					    NLA_DATA(na));
-			    done = 1;
+			    if (!loop)
+				    done = 1;
 			}
 			break;
 		    }
@@ -369,7 +377,7 @@ int main(int argc, char *argv[])
 	if (done)
 	    break;
     }
-    while (1);
+    while (loop);

     close(nl_sd);
     return 0;


^ permalink raw reply	[flat|nested] 134+ messages in thread

* Re: [Patch][RFC] Disabling per-tgid stats on task exit in taskstats
  2006-06-30 19:10                                                           ` Shailabh Nagar
@ 2006-06-30 19:19                                                             ` Shailabh Nagar
  2006-06-30 20:19                                                             ` jamal
  2006-06-30 22:50                                                             ` Andrew Morton
  2 siblings, 0 replies; 134+ messages in thread
From: Shailabh Nagar @ 2006-06-30 19:19 UTC (permalink / raw)
  To: Shailabh Nagar
  Cc: Andrew Morton, Paul Jackson, Valdis.Kletnieks, jlan, balbir,
	csturtiv, linux-kernel, Jamal, netdev

Shailabh Nagar wrote:
> Shailabh Nagar wrote:
> 
> 
<snip>

> Index: linux-2.6.17-mm3equiv/kernel/taskstats.c
> ===================================================================
> --- linux-2.6.17-mm3equiv.orig/kernel/taskstats.c	2006-06-30 11:57:14.000000000 -0400
> +++ linux-2.6.17-mm3equiv/kernel/taskstats.c	2006-06-30 13:58:36.000000000 -0400
> @@ -266,7 +266,7 @@ void taskstats_exit_send(struct task_str
>  	struct sk_buff *rep_skb;
>  	void *reply;
>  	size_t size;
> -	int is_thread_group;
> +	int is_thread_group, setid;
>  	struct nlattr *na;
> 
>  	if (!family_registered || !tidstats)
> @@ -320,7 +320,8 @@ void taskstats_exit_send(struct task_str
>  	nla_nest_end(rep_skb, na);
> 
>  send:
> -	send_reply(rep_skb, 0, TASKSTATS_MSG_MULTICAST);
> +	setid = (smp_processor_id()%TASKSTATS_CPUS_PER_SET)+1;
> +	send_reply(rep_skb, setid, TASKSTATS_MSG_MULTICAST);

This should be
	send_reply(rep_skb, setid, TASKSTATS_MSG_UNICAST);

>  	return;
> 
>  nla_put_failure:
> Index: linux-2.6.17-mm3equiv/Documentation/accounting/getdelays.c
> ===================================================================
> --- linux-2.6.17-mm3equiv.orig/Documentation/accounting/getdelays.c	2006-06-28 16:08:56.000000000 -0400
> +++ linux-2.6.17-mm3equiv/Documentation/accounting/getdelays.c	2006-06-30 14:09:28.000000000 -0400
> @@ -40,7 +40,7 @@ int done = 0;
>  /*
>   * Create a raw netlink socket and bind
>   */
> -static int create_nl_socket(int protocol, int groups)
> +static int create_nl_socket(int protocol, int cpugroup)
>  {
>      socklen_t addr_len;
>      int fd;
> @@ -52,7 +52,8 @@ static int create_nl_socket(int protocol
> 
>      memset(&local, 0, sizeof(local));
>      local.nl_family = AF_NETLINK;
> -    local.nl_groups = groups;
> +    local.nl_groups = TASKSTATS_LISTEN_GROUP;
> +    local.nl_pid = cpugroup;
> 
>      if (bind(fd, (struct sockaddr *) &local, sizeof(local)) < 0)
>  	goto error;
> @@ -203,7 +204,7 @@ int main(int argc, char *argv[])
>      pid_t rtid = 0;
>      int cmd_type = TASKSTATS_TYPE_TGID;
>      int c, status;
> -    int forking = 0;
> +    int forking = 0, cpugroup = 0;
>      struct sigaction act = {
>  	.sa_handler = SIG_IGN,
>  	.sa_mask = SA_NOMASK,
> @@ -222,7 +223,7 @@ int main(int argc, char *argv[])
> 
>      while (1) {
> 
> -	c = getopt(argc, argv, "t:p:c:");
> +	c = getopt(argc, argv, "t:p:c:g:l");
>  	if (c < 0)
>  	    break;
> 
> @@ -252,8 +253,14 @@ int main(int argc, char *argv[])
>  	    }
>  	    forking = 1;
>  	    break;
> +	case 'g':
> +		cpugroup = atoi(optarg);
> +		break;
> +	case 'l':
> +		loop = 1;
> +		break;
>  	default:
> -	    printf("usage %s [-t tgid][-p pid][-c cmd]\n", argv[0]);
> +	    printf("usage %s [-t tgid][-p pid][-c cmd][-g cpugroup][-l]\n", argv[0]);
>  	    exit(-1);
>  	    break;
>  	}
> @@ -266,7 +273,7 @@ int main(int argc, char *argv[])
>      /* Send Netlink request message & get reply */
> 
>      if ((nl_sd =
> -	 create_nl_socket(NETLINK_GENERIC, TASKSTATS_LISTEN_GROUP)) < 0)
> +	 create_nl_socket(NETLINK_GENERIC, cpugroup)) < 0)
>  	err(1, "error creating Netlink socket\n");
> 
> 
> @@ -287,10 +294,10 @@ int main(int argc, char *argv[])
> 
> 
>      if (!forking && sendto_fd(nl_sd, (char *) &req, req.n.nlmsg_len) < 0)
> +    if ((!forking && !loop) &&
> +	sendto_fd(nl_sd, (char *) &req, req.n.nlmsg_len) < 0)
>  	err(1, "error sending message via Netlink\n");
> 
> -    act.sa_handler = SIG_IGN;
> -    sigemptyset(&act.sa_mask);
>      if (sigaction(SIGINT, &act, NULL) < 0)
>  	err(1, "sigaction failed for SIGINT\n");
> 
> @@ -349,10 +356,11 @@ int main(int argc, char *argv[])
>  			rtid = *(int *) NLA_DATA(na);
>  			break;
>  		    case TASKSTATS_TYPE_STATS:
> -			if (rtid == tid) {
> +			if (rtid == tid || loop) {
>  			    print_taskstats((struct taskstats *)
>  					    NLA_DATA(na));
> -			    done = 1;
> +			    if (!loop)
> +				    done = 1;
>  			}
>  			break;
>  		    }
> @@ -369,7 +377,7 @@ int main(int argc, char *argv[])
>  	if (done)
>  	    break;
>      }
> -    while (1);
> +    while (loop);
> 
>      close(nl_sd);
>      return 0;
> 


^ permalink raw reply	[flat|nested] 134+ messages in thread

* Re: [Patch][RFC] Disabling per-tgid stats on task exit in taskstats
  2006-06-30 19:10                                                           ` Shailabh Nagar
  2006-06-30 19:19                                                             ` Shailabh Nagar
@ 2006-06-30 20:19                                                             ` jamal
  2006-06-30 22:50                                                             ` Andrew Morton
  2 siblings, 0 replies; 134+ messages in thread
From: jamal @ 2006-06-30 20:19 UTC (permalink / raw)
  To: Shailabh Nagar
  Cc: netdev, linux-kernel, csturtiv, balbir, jlan, Valdis.Kletnieks,
	Paul Jackson, Andrew Morton

On Fri, 2006-30-06 at 15:10 -0400, Shailabh Nagar wrote:

> 
> Also to get feedback on this kind of usage of the nl_pid field, the
> approach etc.
> 

It does not look unreasonable. I think you may have issues when you have
multiple such sockets opened within a single process. But 
do some testing and see how it goes.

cheers,
jamal


^ permalink raw reply	[flat|nested] 134+ messages in thread

* Re: [Patch][RFC] Disabling per-tgid stats on task exit in taskstats
  2006-06-30 19:10                                                           ` Shailabh Nagar
  2006-06-30 19:19                                                             ` Shailabh Nagar
  2006-06-30 20:19                                                             ` jamal
@ 2006-06-30 22:50                                                             ` Andrew Morton
  2006-07-01  2:20                                                               ` Shailabh Nagar
  2 siblings, 1 reply; 134+ messages in thread
From: Andrew Morton @ 2006-06-30 22:50 UTC (permalink / raw)
  To: Shailabh Nagar
  Cc: nagar, pj, Valdis.Kletnieks, jlan, balbir, csturtiv,
	linux-kernel, hadi, netdev

Shailabh Nagar <nagar@watson.ibm.com> wrote:
>
> +/*
> + * Per-task exit data sent from the kernel to user space
> + * is tagged by an id based on grouping of cpus.
> + *
> + * If userspace specifies a non-zero P as the nl_pid field of
> + * the sockaddr_nl structure while binding to a netlink socket,
> + * it will receive exit data from threads that exited on cpus in the range
> + *
> + *    [(P-1)*Y, P*Y-1]
> + *
> + *  where Y = TASKSTATS_CPUS_PER_SET
> + *  i.e. if TASKSTATS_CPUS_PER_SET is 16,
> + *  to listen to data from cpus 0..15, specify P=1
> + *  for cpus 16..31, specify P=2 etc.
> + *
> + *  To listen to data from all cpus, userspace should use P=0
> + */
> +
> +#define TASKSTATS_CPUS_PER_SET	16

The constant is unpleasant.

If we're going to abuse nl_pid then how about we design things so that
nl_pid is treated as two 16-bit words - one word is the start CPU and the
other word is the end cpu?

Or, if a 65536-CPU limit is too scary, make the bottom 8 bits of nl_pid be
the number of CPUS (ie: TASKSTATS_CPUS_PER_SET) and the top 24 bits is the
starting CPU.  

<avoids mentioning nl_pad>

It'd be better to use a cpumask, of course..

^ permalink raw reply	[flat|nested] 134+ messages in thread

* Re: [Patch][RFC] Disabling per-tgid stats on task exit in taskstats
  2006-06-30 18:53                                                         ` Shailabh Nagar
  2006-06-30 19:10                                                           ` Shailabh Nagar
@ 2006-06-30 22:56                                                           ` Andrew Morton
  1 sibling, 0 replies; 134+ messages in thread
From: Andrew Morton @ 2006-06-30 22:56 UTC (permalink / raw)
  To: Shailabh Nagar
  Cc: pj, Valdis.Kletnieks, jlan, balbir, csturtiv, linux-kernel, hadi, netdev

Shailabh Nagar <nagar@watson.ibm.com> wrote:
>
> Based on previous discussions, the above solutions can be expanded/modified to:
> 
> a) allow userspace to listen to a group of cpus instead of all. Multiple
> collection daemons can distribute the load as you pointed out. Doing collection
> by cpu groups rather than individual cpus reduces the aggregation burden on
> userspace (and scales better with NR_CPUS)
> 
> b) do flow control on the kernel send side. This can involve buffering and sending
> later (to handle bursty case) or dropping (to handle sustained load) as pointed out
> by you, Jamal in other threads.
> 
> c) increase receiver's socket buffer. This can and should always be done but no
> involvement needed.
> 
> 
> With regards to taskstats changes to handle the problem and its impact on userspace
> visible changes,
> 
> a) will change userspace
> b) will be transparent.
> c) is immaterial going forward (except perhaps as a change in Documentation)
> 
> 
> I'm sending a patch that demonstrates how a) can be done quite simply
> and a patch for b) is in progress.
> 
> If the approach suggested in patch a) is acceptable (and I'll provide the testing, stability
> results once comments on it are largely over), could taskstats acceptance in 2.6.18 go ahead
> and patch b) be added later (solution outline has already been provided and a prelim patch should
> be out by eod)

Throwing more CPUs at the problem makes heaps of sense.

It's not necessarily a userspace-incompatible change.  As long as userspace
sets nl_pid to 0x00000000, future kernel revisions can treat that as "all
CPUs".  Or userspace can be forward-compatible by setting nl_pid to
0xffff0000, or whatever.

^ permalink raw reply	[flat|nested] 134+ messages in thread

* Re: [Patch][RFC] Disabling per-tgid stats on task exit in taskstats
  2006-06-30 22:50                                                             ` Andrew Morton
@ 2006-07-01  2:20                                                               ` Shailabh Nagar
  2006-07-01  2:43                                                                 ` Andrew Morton
  0 siblings, 1 reply; 134+ messages in thread
From: Shailabh Nagar @ 2006-07-01  2:20 UTC (permalink / raw)
  To: Andrew Morton
  Cc: pj, Valdis.Kletnieks, jlan, balbir, csturtiv, linux-kernel, hadi, netdev

Andrew Morton wrote:

>Shailabh Nagar <nagar@watson.ibm.com> wrote:
>  
>
>>+/*
>>+ * Per-task exit data sent from the kernel to user space
>>+ * is tagged by an id based on grouping of cpus.
>>+ *
>>+ * If userspace specifies a non-zero P as the nl_pid field of
>>+ * the sockaddr_nl structure while binding to a netlink socket,
>>+ * it will receive exit data from threads that exited on cpus in the range
>>+ *
>>+ *    [(P-1)*Y, P*Y-1]
>>+ *
>>+ *  where Y = TASKSTATS_CPUS_PER_SET
>>+ *  i.e. if TASKSTATS_CPUS_PER_SET is 16,
>>+ *  to listen to data from cpus 0..15, specify P=1
>>+ *  for cpus 16..31, specify P=2 etc.
>>+ *
>>+ *  To listen to data from all cpus, userspace should use P=0
>>+ */
>>+
>>+#define TASKSTATS_CPUS_PER_SET	16
>>    
>>
>
>The constant is unpleasant.
>  
>
I was planning to make it configurable. But that would still not be as 
flexible as below...

>If we're going to abuse nl_pid then how about we design things so that
>nl_pid is treated as two 16-bit words - one word is the start CPU and the
>other word is the end cpu?
>
>Or, if a 65536-CPU limit is too scary, make the bottom 8 bits of nl_pid be
>the number of CPUS (ie: TASKSTATS_CPUS_PER_SET) and the top 24 bits is the
>starting CPU.  
>
><avoids mentioning nl_pad>
>
>It'd be better to use a cpumask, of course..
>  
>
All these options mean each listener gets to pick a "custom" range of 
cpus to listen on, 
rather than choose one of pre-defined ranges (even if the pre-defined 
ranges can change
by a configurable TASKSTATS_CPUS_PER_SET). Which means the kernel side 
has to
figure out which of the listeners cpu range includes the currently 
exiting task's cpu. To do
this, we'll need a callback from the binding of the netlink socket (so 
taskstats can maintain
the cpu -> nl_pid mappings at any exit).
The current genetlink interface doesn't have that kind of flexibility 
(though it can be added
I'm sure).

Seems a bit involved if the primary aim is to restrict the number of 
cpus that one listener
 wants to listen, rather than be able to pick which ones.

A configurable range won't suffice ?

--Shailabh

^ permalink raw reply	[flat|nested] 134+ messages in thread

* Re: [Patch][RFC] Disabling per-tgid stats on task exit in taskstats
  2006-07-01  2:20                                                               ` Shailabh Nagar
@ 2006-07-01  2:43                                                                 ` Andrew Morton
  2006-07-01  3:37                                                                   ` Shailabh Nagar
  0 siblings, 1 reply; 134+ messages in thread
From: Andrew Morton @ 2006-07-01  2:43 UTC (permalink / raw)
  To: Shailabh Nagar
  Cc: pj, Valdis.Kletnieks, jlan, balbir, csturtiv, linux-kernel, hadi, netdev

On Fri, 30 Jun 2006 22:20:23 -0400
Shailabh Nagar <nagar@watson.ibm.com> wrote:

> >If we're going to abuse nl_pid then how about we design things so that
> >nl_pid is treated as two 16-bit words - one word is the start CPU and the
> >other word is the end cpu?
> >
> >Or, if a 65536-CPU limit is too scary, make the bottom 8 bits of nl_pid be
> >the number of CPUS (ie: TASKSTATS_CPUS_PER_SET) and the top 24 bits is the
> >starting CPU.  
> >
> ><avoids mentioning nl_pad>
> >
> >It'd be better to use a cpumask, of course..
> >  
> >
> All these options mean each listener gets to pick a "custom" range of 
> cpus to listen on, 
> rather than choose one of pre-defined ranges (even if the pre-defined 
> ranges can change
> by a configurable TASKSTATS_CPUS_PER_SET). Which means the kernel side 
> has to
> figure out which of the listeners cpu range includes the currently 
> exiting task's cpu. To do
> this, we'll need a callback from the binding of the netlink socket (so 
> taskstats can maintain
> the cpu -> nl_pid mappings at any exit).
> The current genetlink interface doesn't have that kind of flexibility 
> (though it can be added
> I'm sure).
> 
> Seems a bit involved if the primary aim is to restrict the number of 
> cpus that one listener
>  wants to listen, rather than be able to pick which ones.
> 
> A configurable range won't suffice ?
> 

Set aside the implementation details and ask "what is a good design"?

A kernel-wide constant, whether determined at build-time or by a /proc poke
isn't a nice design.

Can we permit userspace to send in a netlink message describing a cpumask? 
That's back-compatible.


^ permalink raw reply	[flat|nested] 134+ messages in thread

* Re: [Patch][RFC] Disabling per-tgid stats on task exit in taskstats
  2006-07-01  2:43                                                                 ` Andrew Morton
@ 2006-07-01  3:37                                                                   ` Shailabh Nagar
  2006-07-01  3:51                                                                     ` Andrew Morton
  2006-07-03  4:53                                                                     ` Paul Jackson
  0 siblings, 2 replies; 134+ messages in thread
From: Shailabh Nagar @ 2006-07-01  3:37 UTC (permalink / raw)
  To: Andrew Morton
  Cc: pj, Valdis.Kletnieks, jlan, balbir, csturtiv, linux-kernel, hadi, netdev

Andrew Morton wrote:

>On Fri, 30 Jun 2006 22:20:23 -0400
>Shailabh Nagar <nagar@watson.ibm.com> wrote:
>
>  
>
>>>If we're going to abuse nl_pid then how about we design things so that
>>>nl_pid is treated as two 16-bit words - one word is the start CPU and the
>>>other word is the end cpu?
>>>
>>>Or, if a 65536-CPU limit is too scary, make the bottom 8 bits of nl_pid be
>>>the number of CPUS (ie: TASKSTATS_CPUS_PER_SET) and the top 24 bits is the
>>>starting CPU.  
>>>
>>><avoids mentioning nl_pad>
>>>
>>>It'd be better to use a cpumask, of course..
>>> 
>>>
>>>      
>>>
>>All these options mean each listener gets to pick a "custom" range of 
>>cpus to listen on, 
>>rather than choose one of pre-defined ranges (even if the pre-defined 
>>ranges can change
>>by a configurable TASKSTATS_CPUS_PER_SET). Which means the kernel side 
>>has to
>>figure out which of the listeners cpu range includes the currently 
>>exiting task's cpu. To do
>>this, we'll need a callback from the binding of the netlink socket (so 
>>taskstats can maintain
>>the cpu -> nl_pid mappings at any exit).
>>The current genetlink interface doesn't have that kind of flexibility 
>>(though it can be added
>>I'm sure).
>>
>>Seems a bit involved if the primary aim is to restrict the number of 
>>cpus that one listener
>> wants to listen, rather than be able to pick which ones.
>>
>>A configurable range won't suffice ?
>>
>>    
>>
>
>Set aside the implementation details and ask "what is a good design"?
>
>A kernel-wide constant, whether determined at build-time or by a /proc poke
>isn't a nice design.
>
>Can we permit userspace to send in a netlink message describing a cpumask? 
>That's back-compatible.
>  
>
Yes, that should be doable. And passing in a cpumask is much better 
since we no longer
have to maintain mappings.

So the strawman is:
Listener bind()s to genetlink using its real pid.
Sends a separate "registration" message with cpumask to listen to. 
Kernel stores (real) pid and cpumask.
During task exit, kernel goes through each registered listener (small 
list) and decides which
one needs to get this exit data and calls a genetlink_unicast to each 
one that does need it.

If number of listeners is small, the lookups should be swift enough. If 
it grows large, we
can consider a fancier lookup (but there I go again, delving into 
implementation too early :-)


Sounds good to me !

--Shailabh








^ permalink raw reply	[flat|nested] 134+ messages in thread

* Re: [Patch][RFC] Disabling per-tgid stats on task exit in taskstats
  2006-07-01  3:37                                                                   ` Shailabh Nagar
@ 2006-07-01  3:51                                                                     ` Andrew Morton
  2006-07-03 21:11                                                                       ` Shailabh Nagar
  2006-07-03  4:53                                                                     ` Paul Jackson
  1 sibling, 1 reply; 134+ messages in thread
From: Andrew Morton @ 2006-07-01  3:51 UTC (permalink / raw)
  To: Shailabh Nagar
  Cc: pj, Valdis.Kletnieks, jlan, balbir, csturtiv, linux-kernel, hadi, netdev

On Fri, 30 Jun 2006 23:37:10 -0400
Shailabh Nagar <nagar@watson.ibm.com> wrote:

> >Set aside the implementation details and ask "what is a good design"?
> >
> >A kernel-wide constant, whether determined at build-time or by a /proc poke
> >isn't a nice design.
> >
> >Can we permit userspace to send in a netlink message describing a cpumask? 
> >That's back-compatible.
> >  
> >
> Yes, that should be doable. And passing in a cpumask is much better 
> since we no longer
> have to maintain mappings.
> 
> So the strawman is:
> Listener bind()s to genetlink using its real pid.
> Sends a separate "registration" message with cpumask to listen to. 
> Kernel stores (real) pid and cpumask.
> During task exit, kernel goes through each registered listener (small 
> list) and decides which
> one needs to get this exit data and calls a genetlink_unicast to each 
> one that does need it.
> 
> If number of listeners is small, the lookups should be swift enough. If 
> it grows large, we
> can consider a fancier lookup (but there I go again, delving into 
> implementation too early :-)

We'll need a map.

1024 CPUs, 1024 listeners, 1000 exits/sec/CPU and we're up to a million
operations per second per CPU.  Meltdown.

But it's a pretty simple map.  A per-cpu array of pointers to the head of a
linked list.  One lock for each CPU's list.

^ permalink raw reply	[flat|nested] 134+ messages in thread

* Re: [Patch][RFC] Disabling per-tgid stats on task exit in taskstats
  2006-07-01  3:37                                                                   ` Shailabh Nagar
  2006-07-01  3:51                                                                     ` Andrew Morton
@ 2006-07-03  4:53                                                                     ` Paul Jackson
  2006-07-03 15:02                                                                       ` Shailabh Nagar
  1 sibling, 1 reply; 134+ messages in thread
From: Paul Jackson @ 2006-07-03  4:53 UTC (permalink / raw)
  To: Shailabh Nagar
  Cc: akpm, Valdis.Kletnieks, jlan, balbir, csturtiv, linux-kernel,
	hadi, netdev

Shailabh wrote:
> Sends a separate "registration" message with cpumask to listen to. 
> Kernel stores (real) pid and cpumask.

Question:
=========

Ah - good.

So this means that I could configure a system with a fork/exit
intensive, performance critical job on some dedicated CPUs, and be able
to collect taskstat data from tasks exiting on the -other- CPUS, while
avoiding collecting data from this special job, thus avoiding any
taskstat collection performance impact on said job.

If I'm understanding this correctly, excellent.

Caveat:
=======

Passing cpumasks across the kernel-user boundary can be tricky.

Historically, Unix has a long tradition of boloxing up the passing
of variable length data types across the kernel-user boundary.

We've got perhaps a half dozen ways of getting these masks out of the
kernel, and three ways of getting them (or the similar nodemasks) back
into the kernel.  The three ways being used in the sched_setaffinity
system call, the mbind and set_mempolicy system calls, and the cpuset
file system.

All three of these ways have their controversial details:
 * The kernel cpumask mask size needed for sched_setaffinity calls is
   not trivially available to userland.
 * The nodemask bit size is off by one in the mbind and set_mempolicy
   calls.
 * The CPU and Node masks are ascii, not binary, in the cpuset calls.

One option that might make sense for these task stat registrations
would be to:
 1) make the kernel/sched.c get_user_cpu_mask() routine generic,
    moving it to non-static lib/*.c code, and
 2) provide a sensible way for user space to query the size of
    the kernel cpumask (and perhaps nodemask while you're at it.)

Currently, the best way I know for user space to query the kernel's
cpumask and nodemask size is to examine the length of the ascii
string values labeled "Cpus_allowed:" and "Mems_allowed:" in the file
/proc/self/status.  These ascii strings always require exactly nine
ascii chars to express each 32 bits of kernel mask code, if you include
in the count the trailing ',' comma or '\n' newline after each eight
ascii character word.

Probing /proc/self/status fields for these mask sizes is rather
unobvious and indirect, and requires caching the result if you care at
all about performance.  Userland code in support of your taskstat
facility might be better served by a more obvious way to size cpumasks.

... unless of course you're inclined to pass cpumasks formatted as
    ascii strings, in which case speak up, as I'd be delighted to
    throw in my 2 cents on how to do that ;).

-- 
                  I won't rest till it's the best ...
                  Programmer, Linux Scalability
                  Paul Jackson <pj@sgi.com> 1.925.600.0401

^ permalink raw reply	[flat|nested] 134+ messages in thread

* Re: [Patch][RFC] Disabling per-tgid stats on task exit in taskstats
  2006-07-03  4:53                                                                     ` Paul Jackson
@ 2006-07-03 15:02                                                                       ` Shailabh Nagar
  2006-07-03 15:55                                                                         ` Paul Jackson
                                                                                           ` (2 more replies)
  0 siblings, 3 replies; 134+ messages in thread
From: Shailabh Nagar @ 2006-07-03 15:02 UTC (permalink / raw)
  To: Paul Jackson
  Cc: akpm, Valdis.Kletnieks, jlan, balbir, csturtiv, linux-kernel,
	hadi, netdev

Paul Jackson wrote:

>Shailabh wrote:
>  
>
>>Sends a separate "registration" message with cpumask to listen to. 
>>Kernel stores (real) pid and cpumask.
>>    
>>
>
>Question:
>=========
>
>Ah - good.
>
>So this means that I could configure a system with a fork/exit
>intensive, performance critical job on some dedicated CPUs, and be able
>to collect taskstat data from tasks exiting on the -other- CPUS, while
>avoiding collecting data from this special job, thus avoiding any
>taskstat collection performance impact on said job.
>
>If I'm understanding this correctly, excellent.
>  
>
Yes. If no one registers to listen on a particular CPU, data from tasks 
exiting on that cpu is
not sent out at all.

>Caveat:
>=======
>
>Passing cpumasks across the kernel-user boundary can be tricky.
>
>Historically, Unix has a long tradition of boloxing up the passing
>of variable length data types across the kernel-user boundary.
>
>We've got perhaps a half dozen ways of getting these masks out of the
>kernel, and three ways of getting them (or the similar nodemasks) back
>into the kernel.  The three ways being used in the sched_setaffinity
>system call, the mbind and set_mempolicy system calls, and the cpuset
>file system.
>
>All three of these ways have their controversial details:
> * The kernel cpumask mask size needed for sched_setaffinity calls is
>   not trivially available to userland.
> * The nodemask bit size is off by one in the mbind and set_mempolicy
>   calls.
> * The CPU and Node masks are ascii, not binary, in the cpuset calls.
>
>One option that might make sense for these task stat registrations
>would be to:
> 1) make the kernel/sched.c get_user_cpu_mask() routine generic,
>    moving it to non-static lib/*.c code, and
> 2) provide a sensible way for user space to query the size of
>    the kernel cpumask (and perhaps nodemask while you're at it.)
>
>Currently, the best way I know for user space to query the kernels
>cpumask and nodemask size is to examine the length of the ascii
>string values labeled "Cpus_allowed:" and "Mems_allowed:" in the file
>/proc/self/status.  These ascii strings always require exactly nine
>ascii chars to express each 32 bits of kernel mask code, if you include
>in the count the trailing ',' comma or '\n' newline after each eight
>ascii character word.
>
>Probing /proc/self/status fields for these mask sizes is rather
>unobvious and indirect, and requires caching the result if you care at
>all about performance.  Userland code in support of your taskstat
>facility might be better served by a more obvious way to size cpumasks.
>
>... unless of course you're inclined to pass cpumasks formatted as
>    ascii strings, in which case speak up, as I'd be delighted to
>    throw in my 2 cents on how to do that ;).
>  
>
Thanks for the size info. I did hit it while coding this up.

So I chose to use the "cpulist" ascii format that has been helpfully 
provided in include/linux/cpumask.h (by whom I wonder :-)

The user specifies the cpumask as an ascii string containing comma-separated
cpu ranges.
Kernel parses the same and stores it as a cpumask_t after which we can 
iterate over the
mask using standard helpers.

Since registration/deregistration is not a common operation, the 
overhead of parsing
ascii strings should be acceptable and avoids the hassles of trying to 
determine kernel cpumask size. I don't know if there are buffer overflow 
issues in passing a string (though I'm using the
standard netlink way of passing it up using NLA_STRING).

Will post the patch shortly.

--Shailabh

^ permalink raw reply	[flat|nested] 134+ messages in thread

* Re: [Patch][RFC] Disabling per-tgid stats on task exit in taskstats
  2006-07-03 15:02                                                                       ` Shailabh Nagar
@ 2006-07-03 15:55                                                                         ` Paul Jackson
  2006-07-03 16:31                                                                         ` Paul Jackson
  2006-07-05 17:20                                                                         ` Jay Lan
  2 siblings, 0 replies; 134+ messages in thread
From: Paul Jackson @ 2006-07-03 15:55 UTC (permalink / raw)
  To: Shailabh Nagar
  Cc: akpm, Valdis.Kletnieks, jlan, balbir, csturtiv, linux-kernel,
	hadi, netdev

Shailabh wrote:
> Yes. If no one registers to listen on a particular CPU, data from tasks 
> exiting on that cpu is not sent out at all.

Excellent.


> So I chose to use the "cpulist" ascii format that has been helpfully 
> provided in include/linux/cpumask.h (by whom I wonder :-)

Excellent.

-- 
                  I won't rest till it's the best ...
                  Programmer, Linux Scalability
                  Paul Jackson <pj@sgi.com> 1.925.600.0401

^ permalink raw reply	[flat|nested] 134+ messages in thread

* Re: [Patch][RFC] Disabling per-tgid stats on task exit in taskstats
  2006-07-03 15:02                                                                       ` Shailabh Nagar
  2006-07-03 15:55                                                                         ` Paul Jackson
@ 2006-07-03 16:31                                                                         ` Paul Jackson
  2006-07-04  0:09                                                                           ` Shailabh Nagar
  2006-07-05 17:20                                                                         ` Jay Lan
  2 siblings, 1 reply; 134+ messages in thread
From: Paul Jackson @ 2006-07-03 16:31 UTC (permalink / raw)
  To: Shailabh Nagar
  Cc: akpm, Valdis.Kletnieks, jlan, balbir, csturtiv, linux-kernel,
	hadi, netdev

Shailabh wrote:
> I don't know if there are buffer overflow 
> issues in passing a string

I don't know if this comment applies to "the standard netlink way of
passing it up using NLA_STRING", but the way I deal with buffer length
issues in the cpuset code is to insist that the user code express the
list in no more than 100 + 6 * NR_CPUS bytes:

>From kernel/cpuset.c:

        /* Crude upper limit on largest legitimate cpulist user might write. */
        if (nbytes > 100 + 6 * NR_CPUS)
                return -E2BIG;

This lets the user specify the buffer size passed in, but prevents
them from trying a denial of service attack on the kernel by trying
to pass in a huge buffer.

If the user can't figure out how to write the desired cpulist in
that size, then tough toenails.

-- 
                  I won't rest till it's the best ...
                  Programmer, Linux Scalability
                  Paul Jackson <pj@sgi.com> 1.925.600.0401

^ permalink raw reply	[flat|nested] 134+ messages in thread

* Re: [Patch][RFC] Disabling per-tgid stats on task exit in taskstats
  2006-07-01  3:51                                                                     ` Andrew Morton
@ 2006-07-03 21:11                                                                       ` Shailabh Nagar
  2006-07-03 21:41                                                                         ` Andrew Morton
  2006-07-04  0:54                                                                         ` Shailabh Nagar
  0 siblings, 2 replies; 134+ messages in thread
From: Shailabh Nagar @ 2006-07-03 21:11 UTC (permalink / raw)
  To: Andrew Morton
  Cc: pj, Valdis.Kletnieks, jlan, balbir, csturtiv, linux-kernel, hadi, netdev

Andrew Morton wrote:

>On Fri, 30 Jun 2006 23:37:10 -0400
>Shailabh Nagar <nagar@watson.ibm.com> wrote:
>
>  
>
>>>Set aside the implementation details and ask "what is a good design"?
>>>
>>>A kernel-wide constant, whether determined at build-time or by a /proc poke
>>>isn't a nice design.
>>>
>>>Can we permit userspace to send in a netlink message describing a cpumask? 
>>>That's back-compatible.
>>> 
>>>
>>>      
>>>
>>Yes, that should be doable. And passing in a cpumask is much better 
>>since we no longer
>>have to maintain mappings.
>>
>>So the strawman is:
>>Listener bind()s to genetlink using its real pid.
>>Sends a separate "registration" message with cpumask to listen to. 
>>Kernel stores (real) pid and cpumask.
>>During task exit, kernel goes through each registered listener (small 
>>list) and decides which
>>one needs to get this exit data and calls a genetlink_unicast to each 
>>one that does need it.
>>
>>If number of listeners is small, the lookups should be swift enough. If 
>>it grows large, we
>>can consider a fancier lookup (but there I go again, delving into 
>>implementation too early :-)
>>    
>>
>
>We'll need a map.
>
>1024 CPUs, 1024 listeners, 1000 exits/sec/CPU and we're up to a million
>operations per second per CPU.  Meltdown.
>
>But it's a pretty simple map.  A per-cpu array of pointers to the head of a
>linked list.  One lock for each CPU's list.
>  
>
Here's a patch that implements the above ideas.

A listener registers interest by specifying a cpumask in the
cpulist format (comma separated ranges of cpus). The listener's pid
is entered into per-cpu lists for those cpus and exit events from those
cpus go to the listeners using netlink unicasts.

Please comment.

Andrew, this is not being proposed for inclusion yet since there is 
at least one more issue that needs to be resolved:

What happens when a listener exits without doing deregistration
(or if the listener attempts to register another cpumask while a current
registration is still active).

More on that in a separate thread.

--Shailabh



On systems with a large number of cpus, with even a modest rate of
tasks exiting per cpu, the volume of taskstats data sent on thread exit
can overflow a userspace listener's buffers.

One approach to avoiding overflow is to allow listeners to get data for
a limited and specific set of cpus. By scaling the number of listeners
and/or the cpus they monitor, userspace can handle the statistical data
overload more gracefully.

In this patch, each listener registers to listen to a specific set of
cpus by specifying a cpumask.  The interest is recorded per-cpu. When
a task exits on a cpu, its taskstats data is unicast to each listener
interested in that cpu.

Thanks to Andrew Morton for pointing out the various scalability and
general concerns of previous attempts and for suggesting this design.

Signed-Off-By: Shailabh Nagar <nagar@watson.ibm.com>

 include/linux/taskstats.h      |    4 -
 include/linux/taskstats_kern.h |   12 ---
 kernel/taskstats.c             |  136 +++++++++++++++++++++++++++++++++++++++--
 3 files changed, 135 insertions(+), 17 deletions(-)

Index: linux-2.6.17-mm3equiv/include/linux/taskstats.h
===================================================================
--- linux-2.6.17-mm3equiv.orig/include/linux/taskstats.h	2006-06-30 19:03:40.000000000 -0400
+++ linux-2.6.17-mm3equiv/include/linux/taskstats.h	2006-07-01 23:53:01.000000000 -0400
@@ -87,8 +87,6 @@ struct taskstats {
 };


-#define TASKSTATS_LISTEN_GROUP	0x1
-
 /*
  * Commands sent from userspace
  * Not versioned. New commands should only be inserted at the enum's end
@@ -120,6 +118,8 @@ enum {
 	TASKSTATS_CMD_ATTR_UNSPEC = 0,
 	TASKSTATS_CMD_ATTR_PID,
 	TASKSTATS_CMD_ATTR_TGID,
+	TASKSTATS_CMD_ATTR_REGISTER_CPUMASK,
+	TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK,
 	__TASKSTATS_CMD_ATTR_MAX,
 };

Index: linux-2.6.17-mm3equiv/include/linux/taskstats_kern.h
===================================================================
--- linux-2.6.17-mm3equiv.orig/include/linux/taskstats_kern.h	2006-06-30 11:57:14.000000000 -0400
+++ linux-2.6.17-mm3equiv/include/linux/taskstats_kern.h	2006-07-01 23:53:01.000000000 -0400
@@ -19,20 +19,14 @@ enum {
 #ifdef CONFIG_TASKSTATS
 extern kmem_cache_t *taskstats_cache;
 extern struct mutex taskstats_exit_mutex;
-
-static inline int taskstats_has_listeners(void)
-{
-	if (!genl_sock)
-		return 0;
-	return netlink_has_listeners(genl_sock, TASKSTATS_LISTEN_GROUP);
-}
-
+DECLARE_PER_CPU(struct list_head, listener_list);

 static inline void taskstats_exit_alloc(struct taskstats **ptidstats)
 {
 	*ptidstats = NULL;
-	if (taskstats_has_listeners())
+	if (!list_empty(&get_cpu_var(listener_list)))
 		*ptidstats = kmem_cache_zalloc(taskstats_cache, SLAB_KERNEL);
+	put_cpu_var(listener_list);
 }

 static inline void taskstats_exit_free(struct taskstats *tidstats)
Index: linux-2.6.17-mm3equiv/kernel/taskstats.c
===================================================================
--- linux-2.6.17-mm3equiv.orig/kernel/taskstats.c	2006-06-30 23:38:39.000000000 -0400
+++ linux-2.6.17-mm3equiv/kernel/taskstats.c	2006-07-02 00:16:18.000000000 -0400
@@ -19,6 +19,8 @@
 #include <linux/kernel.h>
 #include <linux/taskstats_kern.h>
 #include <linux/delayacct.h>
+#include <linux/cpumask.h>
+#include <linux/percpu.h>
 #include <net/genetlink.h>
 #include <asm/atomic.h>

@@ -26,6 +28,9 @@ static DEFINE_PER_CPU(__u32, taskstats_s
 static int family_registered = 0;
 kmem_cache_t *taskstats_cache;

+DEFINE_PER_CPU(struct list_head, listener_list);
+static DEFINE_PER_CPU(struct rw_semaphore, listener_list_sem);
+
 static struct genl_family family = {
 	.id		= GENL_ID_GENERATE,
 	.name		= TASKSTATS_GENL_NAME,
@@ -37,9 +42,19 @@ static struct nla_policy taskstats_cmd_g
 __read_mostly = {
 	[TASKSTATS_CMD_ATTR_PID]  = { .type = NLA_U32 },
 	[TASKSTATS_CMD_ATTR_TGID] = { .type = NLA_U32 },
-};
+	[TASKSTATS_CMD_ATTR_REGISTER_CPUMASK] = { .type = NLA_STRING },
+	[TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK] = { .type = NLA_STRING },};

+struct listener {
+	struct list_head list;
+	pid_t pid;
+};

+enum actions {
+	REGISTER,
+	DEREGISTER
+};
+
 static int prepare_reply(struct genl_info *info, u8 cmd, struct sk_buff **skbp,
 			void **replyp, size_t size)
 {
@@ -77,6 +92,8 @@ static int prepare_reply(struct genl_inf
 static int send_reply(struct sk_buff *skb, pid_t pid, int event)
 {
 	struct genlmsghdr *genlhdr = nlmsg_data((struct nlmsghdr *)skb->data);
+	struct rw_semaphore *sem;
+	struct list_head *p, *head;
 	void *reply;
 	int rc;

@@ -88,9 +105,30 @@ static int send_reply(struct sk_buff *sk
 		return rc;
 	}

-	if (event == TASKSTATS_MSG_MULTICAST)
-		return genlmsg_multicast(skb, pid, TASKSTATS_LISTEN_GROUP);
-	return genlmsg_unicast(skb, pid);
+	if (event == TASKSTATS_MSG_UNICAST)
+		return genlmsg_unicast(skb, pid);
+
+	/*
+	 * Taskstats multicast is unicasts to listeners who have registered
+	 * interest in this cpu
+	 */
+	sem = &get_cpu_var(listener_list_sem);
+	head = &get_cpu_var(listener_list);
+
+	down_read(sem);
+	list_for_each(p, head) {
+		int ret;
+		struct listener *s = list_entry(p, struct listener, list);
+		ret = genlmsg_unicast(skb, s->pid);
+		if (ret)
+			rc = ret;
+	}
+	up_read(sem);
+
+	put_cpu_var(listener_list);
+	put_cpu_var(listener_list_sem);
+
+	return rc;
 }

 static int fill_pid(pid_t pid, struct task_struct *pidtsk,
@@ -201,8 +239,73 @@ ret:
 	return;
 }

+static int add_del_listener(pid_t pid, cpumask_t *maskp, int isadd)
+{
+	struct listener *s;
+	unsigned int cpu, mycpu;
+	cpumask_t mask;
+	struct rw_semaphore *sem;
+	struct list_head *head, *p;

-static int taskstats_send_stats(struct sk_buff *skb, struct genl_info *info)
+	memcpy(&mask, maskp, sizeof(cpumask_t));
+	if (cpus_empty(mask))
+		return -EINVAL;
+
+	mycpu = get_cpu();
+	put_cpu();
+	if (isadd == REGISTER) {
+		for_each_cpu_mask(cpu, mask) {
+			if (!cpu_possible(cpu))
+				continue;
+			if (cpu == mycpu)
+				preempt_disable();
+
+			sem = &per_cpu(listener_list_sem, cpu);
+			head = &per_cpu(listener_list, cpu);
+
+			s = kmalloc(sizeof(struct listener), GFP_KERNEL);
+			if (!s)
+				return -ENOMEM;
+			s->pid = pid;
+			INIT_LIST_HEAD(&s->list);
+
+			down_write(sem);
+			list_add(&s->list, head);
+			up_write(sem);
+
+			if (cpu == mycpu)
+				preempt_enable();
+		}
+	} else {
+		for_each_cpu_mask(cpu, mask) {
+			struct list_head *tmp;
+
+			if (!cpu_possible(cpu))
+				continue;
+			if (cpu == mycpu)
+				preempt_disable();
+
+			sem = &per_cpu(listener_list_sem, cpu);
+			head = &per_cpu(listener_list, cpu);
+
+			down_write(sem);
+			list_for_each_safe(p, tmp, head) {
+				s = list_entry(p, struct listener, list);
+				if (s->pid == pid) {
+					list_del(&s->list);
+					break;
+				}
+			}
+			up_write(sem);
+
+			if (cpu == mycpu)
+				preempt_enable();
+		}
+	}
+	return 0;
+}
+
+static int taskstats_user_cmd(struct sk_buff *skb, struct genl_info *info)
 {
 	int rc = 0;
 	struct sk_buff *rep_skb;
@@ -210,6 +313,21 @@ static int taskstats_send_stats(struct s
 	void *reply;
 	size_t size;
 	struct nlattr *na;
+	cpumask_t mask;
+
+	if (info->attrs[TASKSTATS_CMD_ATTR_REGISTER_CPUMASK]) {
+		na = info->attrs[TASKSTATS_CMD_ATTR_REGISTER_CPUMASK];
+		cpulist_parse((char *)nla_data(na), mask);
+		rc = add_del_listener(info->snd_pid, &mask, REGISTER);
+		return rc;
+	}
+
+	if (info->attrs[TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK]) {
+		na = info->attrs[TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK];
+		cpulist_parse((char *)nla_data(na), mask);
+		rc = add_del_listener(info->snd_pid, &mask, DEREGISTER);
+		return rc;
+	}

 	/*
 	 * Size includes space for nested attributes
@@ -334,7 +452,7 @@ ret:

 static struct genl_ops taskstats_ops = {
 	.cmd		= TASKSTATS_CMD_GET,
-	.doit		= taskstats_send_stats,
+	.doit		= taskstats_user_cmd,
 	.policy		= taskstats_cmd_get_policy,
 };

@@ -349,6 +467,7 @@ void __init taskstats_init_early(void)
 static int __init taskstats_init(void)
 {
 	int rc;
+	unsigned int i;

 	rc = genl_register_family(&family);
 	if (rc)
@@ -358,6 +477,11 @@ static int __init taskstats_init(void)
 	rc = genl_register_ops(&family, &taskstats_ops);
 	if (rc < 0)
 		goto err;
+
+	for_each_possible_cpu(i) {
+		INIT_LIST_HEAD(&(per_cpu(listener_list, i)));
+		init_rwsem(&(per_cpu(listener_list_sem, i)));
+	}

 	return 0;
 err:




^ permalink raw reply	[flat|nested] 134+ messages in thread

* Re: [Patch][RFC] Disabling per-tgid stats on task exit in taskstats
  2006-07-03 21:11                                                                       ` Shailabh Nagar
@ 2006-07-03 21:41                                                                         ` Andrew Morton
  2006-07-04  0:13                                                                           ` Shailabh Nagar
  2006-07-04 20:19                                                                           ` Paul Jackson
  2006-07-04  0:54                                                                         ` Shailabh Nagar
  1 sibling, 2 replies; 134+ messages in thread
From: Andrew Morton @ 2006-07-03 21:41 UTC (permalink / raw)
  To: Shailabh Nagar
  Cc: pj, Valdis.Kletnieks, jlan, balbir, csturtiv, linux-kernel, hadi, netdev

On Mon, 03 Jul 2006 17:11:59 -0400
Shailabh Nagar <nagar@watson.ibm.com> wrote:

> >>So the strawman is:
> >>Listener bind()s to genetlink using its real pid.
> >>Sends a separate "registration" message with cpumask to listen to. 
> >>Kernel stores (real) pid and cpumask.
> >>During task exit, kernel goes through each registered listener (small 
> >>list) and decides which
> >>one needs to get this exit data and calls a genetlink_unicast to each 
> >>one that does need it.
> >>
> >>If number of listeners is small, the lookups should be swift enough. If 
> >>it grows large, we
> >>can consider a fancier lookup (but there I go again, delving into 
> >>implementation too early :-)
> >>    
> >>
> >
> >We'll need a map.
> >
> >1024 CPUs, 1024 listeners, 1000 exits/sec/CPU and we're up to a million
> >operations per second per CPU.  Meltdown.
> >
> >But it's a pretty simple map.  A per-cpu array of pointers to the head of a
> >linked list.  One lock for each CPU's list.
> >  
> >
> Here's a patch that implements the above ideas.
> 
> A listener register's interest by specifying a cpumask in the
> cpulist format (comma separated ranges of cpus). The listener's pid
> is entered into per-cpu lists for those cpus and exit events from those
> cpus go to the listeners using netlink unicasts.
> 
> ...
> 
> On systems with a large number of cpus, with even a modest rate of
> tasks exiting per cpu, the volume of taskstats data sent on thread exit
> can overflow a userspace listener's buffers.
> 
> One approach to avoiding overflow is to allow listeners to get data for
> a limited and specific set of cpus. By scaling the number of listeners
> and/or the cpus they monitor, userspace can handle the statistical data
> overload more gracefully.
> 
> In this patch, each listener registers to listen to a specific set of
> cpus by specifying a cpumask.  The interest is recorded per-cpu. When
> a task exits on a cpu, its taskstats data is unicast to each listener
> interested in that cpu.

I think the approach is sane.  The implementation needs work, as you say.

> +++ linux-2.6.17-mm3equiv/include/linux/taskstats_kern.h	2006-07-01 23:53:01.000000000 -0400
> @@ -19,20 +19,14 @@ enum {
>  #ifdef CONFIG_TASKSTATS
>  extern kmem_cache_t *taskstats_cache;
>  extern struct mutex taskstats_exit_mutex;
> -
> -static inline int taskstats_has_listeners(void)
> -{
> -	if (!genl_sock)
> -		return 0;
> -	return netlink_has_listeners(genl_sock, TASKSTATS_LISTEN_GROUP);
> -}
> -
> +DECLARE_PER_CPU(struct list_head, listener_list);
> 
>  static inline void taskstats_exit_alloc(struct taskstats **ptidstats)
>  {
>  	*ptidstats = NULL;
> -	if (taskstats_has_listeners())
> +	if (!list_empty(&get_cpu_var(listener_list)))
>  		*ptidstats = kmem_cache_zalloc(taskstats_cache, SLAB_KERNEL);
> +	put_cpu_var(listener_list);
>  }

It's time to uninline this function..

>  static inline void taskstats_exit_free(struct taskstats *tidstats)
> Index: linux-2.6.17-mm3equiv/kernel/taskstats.c
> ===================================================================
> --- linux-2.6.17-mm3equiv.orig/kernel/taskstats.c	2006-06-30 23:38:39.000000000 -0400
> +++ linux-2.6.17-mm3equiv/kernel/taskstats.c	2006-07-02 00:16:18.000000000 -0400
> @@ -19,6 +19,8 @@
>  #include <linux/kernel.h>
>  #include <linux/taskstats_kern.h>
>  #include <linux/delayacct.h>
> +#include <linux/cpumask.h>
> +#include <linux/percpu.h>
>  #include <net/genetlink.h>
>  #include <asm/atomic.h>
> 
> @@ -26,6 +28,9 @@ static DEFINE_PER_CPU(__u32, taskstats_s
>  static int family_registered = 0;
>  kmem_cache_t *taskstats_cache;
> 
> +DEFINE_PER_CPU(struct list_head, listener_list);
> +static DEFINE_PER_CPU(struct rw_semaphore, listener_list_sem);

Which will permit listener_list to become static - it wasn't a good name
for a global anyway.

I suggest you implement a new

struct whatever {
	struct rw_semaphore sem;
	struct list_head list;
};

static DEFINE_PER_CPU(struct whatever, listener_array);


>  static int prepare_reply(struct genl_info *info, u8 cmd, struct sk_buff **skbp,
>  			void **replyp, size_t size)
>  {
> @@ -77,6 +92,8 @@ static int prepare_reply(struct genl_inf
>  static int send_reply(struct sk_buff *skb, pid_t pid, int event)
>  {
>  	struct genlmsghdr *genlhdr = nlmsg_data((struct nlmsghdr *)skb->data);
> +	struct rw_semaphore *sem;
> +	struct list_head *p, *head;
>  	void *reply;
>  	int rc;
> 
> @@ -88,9 +105,30 @@ static int send_reply(struct sk_buff *sk
>  		return rc;
>  	}
> 
> -	if (event == TASKSTATS_MSG_MULTICAST)
> -		return genlmsg_multicast(skb, pid, TASKSTATS_LISTEN_GROUP);
> -	return genlmsg_unicast(skb, pid);
> +	if (event == TASKSTATS_MSG_UNICAST)
> +		return genlmsg_unicast(skb, pid);
> +
> +	/*
> +	 * Taskstats multicast is unicasts to listeners who have registered
> +	 * interest in this cpu
> +	 */
> +	sem = &get_cpu_var(listener_list_sem);
> +	head = &get_cpu_var(listener_list);

This has a double preempt_disable(), but the above will fix that.

> +	down_read(sem);
> +	list_for_each(p, head) {
> +		int ret;
> +		struct listener *s = list_entry(p, struct listener, list);
> +		ret = genlmsg_unicast(skb, s->pid);
> +		if (ret)
> +			rc = ret;
> +	}
> +	up_read(sem);
> +
> +	put_cpu_var(listener_list);
> +	put_cpu_var(listener_list_sem);
> +
> +	return rc;
>  }
> 
>  static int fill_pid(pid_t pid, struct task_struct *pidtsk,
> @@ -201,8 +239,73 @@ ret:
>  	return;
>  }
> 
> +static int add_del_listener(pid_t pid, cpumask_t *maskp, int isadd)
> +{
> +	struct listener *s;
> +	unsigned int cpu, mycpu;
> +	cpumask_t mask;
> +	struct rw_semaphore *sem;
> +	struct list_head *head, *p;
> 
> -static int taskstats_send_stats(struct sk_buff *skb, struct genl_info *info)
> +	memcpy(&mask, maskp, sizeof(cpumask_t));
> +	if (cpus_empty(mask))
> +		return -EINVAL;
> +
> +	mycpu = get_cpu();
> +	put_cpu();

This is effectively raw_smp_processor_id().  And after the put_cpu(),
`mycpu' is meaningless.

> +	if (isadd == REGISTER) {
> +		for_each_cpu_mask(cpu, mask) {
> +			if (!cpu_possible(cpu))
> +				continue;
> +			if (cpu == mycpu)
> +				preempt_disable();
> +
> +			sem = &per_cpu(listener_list_sem, cpu);
> +			head = &per_cpu(listener_list, cpu);
> +
> +			s = kmalloc(sizeof(struct listener), GFP_KERNEL);

Cannot do GFP_KERNEL inside preempt_disable().

There's no easy solution to this problem.  GFP_ATOMIC is not a good fix at
all.  One approach would be to run lock_cpu_hotplug(), then allocate (with
GFP_KERNEL) all the memory which will be needed within the locked region,
then take the lock, then use that preallocated memory.

You should use kmalloc_node() here, to ensure that the memory on each CPU's
list resides with that CPU's local memory (not _this_ CPU's local memory).

> +			if (!s)
> +				return -ENOMEM;
> +			s->pid = pid;
> +			INIT_LIST_HEAD(&s->list);
> +
> +			down_write(sem);
> +			list_add(&s->list, head);
> +			up_write(sem);
> +
> +			if (cpu == mycpu)
> +				preempt_enable();

Actually, I don't understand the tricks which are going on with the local CPU here. 
What's it all for?


> +		}
> +	} else {
> +		for_each_cpu_mask(cpu, mask) {
> +			struct list_head *tmp;
> +
> +			if (!cpu_possible(cpu))
> +				continue;

I guess you could just do cpus_and(mask, cpus_possible_map) on entry.


> +			if (cpu == mycpu)
> +				preempt_disable();
> +
> +			sem = &per_cpu(listener_list_sem, cpu);
> +			head = &per_cpu(listener_list, cpu);
> +
> +			down_write(sem);
> +			list_for_each_safe(p, tmp, head) {
> +				s = list_entry(p, struct listener, list);
> +				if (s->pid == pid) {
> +					list_del(&s->list);

kfree(s);

> +					break;
> +				}
> +			}
> +			up_write(sem);
> +
> +			if (cpu == mycpu)
> +				preempt_enable();
> +		}
> +	}
> +	return 0;
> +}
> +
> +static int taskstats_user_cmd(struct sk_buff *skb, struct genl_info *info)
>  {
>  	int rc = 0;
>  	struct sk_buff *rep_skb;
> @@ -210,6 +313,21 @@ static int taskstats_send_stats(struct s
>  	void *reply;
>  	size_t size;
>  	struct nlattr *na;
> +	cpumask_t mask;
> +
> +	if (info->attrs[TASKSTATS_CMD_ATTR_REGISTER_CPUMASK]) {
> +		na = info->attrs[TASKSTATS_CMD_ATTR_REGISTER_CPUMASK];
> +		cpulist_parse((char *)nla_data(na), mask);

OK, so we're passing in an ASCII string.  Fair enough, I think.  Paul would
know better.


^ permalink raw reply	[flat|nested] 134+ messages in thread

* Re: [Patch][RFC] Disabling per-tgid stats on task exit in taskstats
  2006-07-03 16:31                                                                         ` Paul Jackson
@ 2006-07-04  0:09                                                                           ` Shailabh Nagar
  2006-07-04 19:59                                                                             ` Paul Jackson
  0 siblings, 1 reply; 134+ messages in thread
From: Shailabh Nagar @ 2006-07-04  0:09 UTC (permalink / raw)
  To: Paul Jackson
  Cc: akpm, Valdis.Kletnieks, jlan, balbir, csturtiv, linux-kernel,
	hadi, netdev

Paul Jackson wrote:

>Shailabh wrote:
>  
>
>>I don't know if there are buffer overflow 
>>issues in passing a string
>>    
>>
>
>I don't know if this comment applies to "the standard netlink way of
>passing it up using NLA_STRING", but the way I deal with buffer length
>issues in the cpuset code is to insist that the user code express the
>list in no fewer than 100 + 6 * NR_CPUS bytes:
>
>From kernel/cpuset.c:
>
>        /* Crude upper limit on largest legitimate cpulist user might write. */
>        if (nbytes > 100 + 6 * NR_CPUS)
>                return -E2BIG;
>
>This lets the user specify the buffer size passed in, but prevents
>them from trying a denial of service attack on the kernel by trying
>to pass in a huge buffer.
>
>If the user can't figure out how to write the desired cpulist in
>that size, then tough toenails.
>  
>
Paul,

Perhaps I should use the other ASCII format for specifying cpumasks,
since it's more amenable
to specifying an upper bound for the length of the ASCII string and is
more compact?

That format (the one used in lib/bitmap.c:bitmap_parse) is comma 
separated chunks of hex digits
with each chunk specifying 32 bits of the desired cpumask.

So
((NR_CPUS + 32) / 32) * 8 + 1
(8 hex characters for each 32 cpus, and 1 extra character for null 
terminator)
would be an upper bound that would accommodate all the cpus for sure.

Thoughts ?

--Shailabh

--Shailabh

^ permalink raw reply	[flat|nested] 134+ messages in thread

* Re: [Patch][RFC] Disabling per-tgid stats on task exit in taskstats
  2006-07-03 21:41                                                                         ` Andrew Morton
@ 2006-07-04  0:13                                                                           ` Shailabh Nagar
  2006-07-04  0:38                                                                             ` Andrew Morton
  2006-07-04 20:19                                                                           ` Paul Jackson
  1 sibling, 1 reply; 134+ messages in thread
From: Shailabh Nagar @ 2006-07-04  0:13 UTC (permalink / raw)
  To: Andrew Morton
  Cc: pj, Valdis.Kletnieks, jlan, balbir, csturtiv, linux-kernel, hadi, netdev

Andrew Morton wrote:

>On Mon, 03 Jul 2006 17:11:59 -0400
>Shailabh Nagar <nagar@watson.ibm.com> wrote:
>  
>
>>
>> static inline void taskstats_exit_alloc(struct taskstats **ptidstats)
>> {
>> 	*ptidstats = NULL;
>>-	if (taskstats_has_listeners())
>>+	if (!list_empty(&get_cpu_var(listener_list)))
>> 		*ptidstats = kmem_cache_zalloc(taskstats_cache, SLAB_KERNEL);
>>+	put_cpu_var(listener_list);
>> }
>>    
>>
>
>It's time to uninline this function..
>
>  
>
>> static inline void taskstats_exit_free(struct taskstats *tidstats)
>>Index: linux-2.6.17-mm3equiv/kernel/taskstats.c
>>===================================================================
>>--- linux-2.6.17-mm3equiv.orig/kernel/taskstats.c	2006-06-30 23:38:39.000000000 -0400
>>+++ linux-2.6.17-mm3equiv/kernel/taskstats.c	2006-07-02 00:16:18.000000000 -0400
>>@@ -19,6 +19,8 @@
>> #include <linux/kernel.h>
>> #include <linux/taskstats_kern.h>
>> #include <linux/delayacct.h>
>>+#include <linux/cpumask.h>
>>+#include <linux/percpu.h>
>> #include <net/genetlink.h>
>> #include <asm/atomic.h>
>>
>>@@ -26,6 +28,9 @@ static DEFINE_PER_CPU(__u32, taskstats_s
>> static int family_registered = 0;
>> kmem_cache_t *taskstats_cache;
>>
>>+DEFINE_PER_CPU(struct list_head, listener_list);
>>+static DEFINE_PER_CPU(struct rw_semaphore, listener_list_sem);
>>    
>>
>
>Which will permit listener_list to become static - it wasn't a good name
>for a global anyway.
>
>I suggest you implement a new
>
>struct whatever {
>	struct rw_semaphore sem;
>	struct list_head list;
>};
>  
>
Ok. The listener_list was a global to allow taskstats_exit_alloc to 
access but this is better.

>static DEFINE_PER_CPU(struct whatever, listener_aray);
>
>
>  
>
>> static int prepare_reply(struct genl_info *info, u8 cmd, struct sk_buff **skbp,
>> 			void **replyp, size_t size)
>> {
>>@@ -77,6 +92,8 @@ static int prepare_reply(struct genl_inf
>> static int send_reply(struct sk_buff *skb, pid_t pid, int event)
>> {
>> 	struct genlmsghdr *genlhdr = nlmsg_data((struct nlmsghdr *)skb->data);
>>+	struct rw_semaphore *sem;
>>+	struct list_head *p, *head;
>> 	void *reply;
>> 	int rc;
>>
>>@@ -88,9 +105,30 @@ static int send_reply(struct sk_buff *sk
>> 		return rc;
>> 	}
>>
>>-	if (event == TASKSTATS_MSG_MULTICAST)
>>-		return genlmsg_multicast(skb, pid, TASKSTATS_LISTEN_GROUP);
>>-	return genlmsg_unicast(skb, pid);
>>+	if (event == TASKSTATS_MSG_UNICAST)
>>+		return genlmsg_unicast(skb, pid);
>>+
>>+	/*
>>+	 * Taskstats multicast is unicasts to listeners who have registered
>>+	 * interest in this cpu
>>+	 */
>>+	sem = &get_cpu_var(listener_list_sem);
>>+	head = &get_cpu_var(listener_list);
>>    
>>
>
>This has a double preempt_disable(), but the above will fix that.
>
>  
>
>>+	down_read(sem);
>>+	list_for_each(p, head) {
>>+		int ret;
>>+		struct listener *s = list_entry(p, struct listener, list);
>>+		ret = genlmsg_unicast(skb, s->pid);
>>+		if (ret)
>>+			rc = ret;
>>+	}
>>+	up_read(sem);
>>+
>>+	put_cpu_var(listener_list);
>>+	put_cpu_var(listener_list_sem);
>>+
>>+	return rc;
>> }
>>
>> static int fill_pid(pid_t pid, struct task_struct *pidtsk,
>>@@ -201,8 +239,73 @@ ret:
>> 	return;
>> }
>>
>>+static int add_del_listener(pid_t pid, cpumask_t *maskp, int isadd)
>>+{
>>+	struct listener *s;
>>+	unsigned int cpu, mycpu;
>>+	cpumask_t mask;
>>+	struct rw_semaphore *sem;
>>+	struct list_head *head, *p;
>>
>>-static int taskstats_send_stats(struct sk_buff *skb, struct genl_info *info)
>>+	memcpy(&mask, maskp, sizeof(cpumask_t));
>>+	if (cpus_empty(mask))
>>+		return -EINVAL;
>>+
>>+	mycpu = get_cpu();
>>+	put_cpu();
>>    
>>
>
>This is effectively raw_smp_processor_id().  And after the put_cpu(),
>`mycpu' is meaningless.
>  
>
Hmm.

>  
>
>>+	if (isadd == REGISTER) {
>>+		for_each_cpu_mask(cpu, mask) {
>>+			if (!cpu_possible(cpu))
>>+				continue;
>>+			if (cpu == mycpu)
>>+				preempt_disable();
>>+
>>+			sem = &per_cpu(listener_list_sem, cpu);
>>+			head = &per_cpu(listener_list, cpu);
>>+
>>+			s = kmalloc(sizeof(struct listener), GFP_KERNEL);
>>    
>>
>
>Cannot do GFP_KERNEL inside preempt_disable().
>
>There's no easy solution to this problem.  GFP_ATOMIC is not a good fix at
>all.  One approach would be to run lock_cpu_hotplug(), then allocate (with
>GFP_KERNEL) all the memory which will be needed within the locked region,
>then take the lock, then use that preallocated memory.
>  
>
>You should use kmalloc_node() here, to ensure that the memory on each CPU's
>list resides with that CPU's local memory (not _this_ CPU's local memory).
>  
>
Ok.

>  
>
>>+			if (!s)
>>+				return -ENOMEM;
>>+			s->pid = pid;
>>+			INIT_LIST_HEAD(&s->list);
>>+
>>+			down_write(sem);
>>+			list_add(&s->list, head);
>>+			up_write(sem);
>>+
>>+			if (cpu == mycpu)
>>+				preempt_enable();
>>    
>>
>
>Actually, I don't understand the tricks which are going on with the local CPU here. 
>What's it all for?
>  
>
I was wanting to do a  get_cpu_var  for listener_list & sem
for the current cpu and per_cpu otherwise (since that's what I thought
was the recommendation
for accessing the local cpu's variable). Perhaps the preempt_disable is 
uncalled for ?


>
>  
>
>>+		}
>>+	} else {
>>+		for_each_cpu_mask(cpu, mask) {
>>+			struct list_head *tmp;
>>+
>>+			if (!cpu_possible(cpu))
>>+				continue;
>>    
>>
>
>I guess you could just do cpus_and(mask, cpus_possible_map) on entry.
>  
>
Yup !

>
>  
>
>>+			if (cpu == mycpu)
>>+				preempt_disable();
>>+
>>+			sem = &per_cpu(listener_list_sem, cpu);
>>+			head = &per_cpu(listener_list, cpu);
>>+
>>+			down_write(sem);
>>+			list_for_each_safe(p, tmp, head) {
>>+				s = list_entry(p, struct listener, list);
>>+				if (s->pid == pid) {
>>+					list_del(&s->list);
>>    
>>
>
>kfree(s);
>  
>

Oops.

>  
>
>>+					break;
>>+				}
>>+			}
>>+			up_write(sem);
>>+
>>+			if (cpu == mycpu)
>>+				preempt_enable();
>>+		}
>>+	}
>>+	return 0;
>>+}
>>+
>>+static int taskstats_user_cmd(struct sk_buff *skb, struct genl_info *info)
>> {
>> 	int rc = 0;
>> 	struct sk_buff *rep_skb;
>>@@ -210,6 +313,21 @@ static int taskstats_send_stats(struct s
>> 	void *reply;
>> 	size_t size;
>> 	struct nlattr *na;
>>+	cpumask_t mask;
>>+
>>+	if (info->attrs[TASKSTATS_CMD_ATTR_REGISTER_CPUMASK]) {
>>+		na = info->attrs[TASKSTATS_CMD_ATTR_REGISTER_CPUMASK];
>>+		cpulist_parse((char *)nla_data(na), mask);
>>    
>>
>
>OK, so we're passing in an ASCII string.  Fair enough, I think.  Paul would
>know better.
>  
>



^ permalink raw reply	[flat|nested] 134+ messages in thread

* Re: [Patch][RFC] Disabling per-tgid stats on task exit in taskstats
  2006-07-04  0:13                                                                           ` Shailabh Nagar
@ 2006-07-04  0:38                                                                             ` Andrew Morton
  0 siblings, 0 replies; 134+ messages in thread
From: Andrew Morton @ 2006-07-04  0:38 UTC (permalink / raw)
  To: Shailabh Nagar
  Cc: pj, Valdis.Kletnieks, jlan, balbir, csturtiv, linux-kernel, hadi, netdev

On Mon, 03 Jul 2006 20:13:36 -0400
Shailabh Nagar <nagar@watson.ibm.com> wrote:

> >>+			if (!s)
> >>+				return -ENOMEM;
> >>+			s->pid = pid;
> >>+			INIT_LIST_HEAD(&s->list);
> >>+
> >>+			down_write(sem);
> >>+			list_add(&s->list, head);
> >>+			up_write(sem);
> >>+
> >>+			if (cpu == mycpu)
> >>+				preempt_enable();
> >>    
> >>
> >
> >Actually, I don't understand the tricks which are going on with the local CPU here. 
> >What's it all for?
> >  
> >
> I was wanting to do a  get_cpu_var  for listener_list & sem
> for the current cpu and per_cpu otherwise (since thats what I thought 
> was the recommendation
> for accessing the local cpu's variable). Perhaps the preempt_disable is 
> uncalled for ?

Well we have a problem.  You want to grab this CPU's list, and then lock a
semaphore.  But taking a semaphore is a sleeping operation.

Fortunately, there's really no need to stay on-CPU at all.  When userspace
is setting or clearing entries in the map, userspace _told_ us which CPU to
manipulate, so this code can be running on any CPU at all.  So just go grab
the Nth entry in the array and acquire the lock.

And when the time comes to send some statistics, just use
raw_smp_processor_id() and don't use preempt_disable() at all.  If we end
up hopping over to another CPU, well at least we tried.  All we can do here
is to run raw_smp_processor_id() as early as possible to reduce the
possibility that we'll get a different CPU from the one which this task
really exited on.

IOW: in all cases we were provided with explicit CPU numbers from other
sources.  So no preemption disabling is required.


^ permalink raw reply	[flat|nested] 134+ messages in thread

* Re: [Patch][RFC] Disabling per-tgid stats on task exit in taskstats
  2006-07-03 21:11                                                                       ` Shailabh Nagar
  2006-07-03 21:41                                                                         ` Andrew Morton
@ 2006-07-04  0:54                                                                         ` Shailabh Nagar
  2006-07-04  1:01                                                                           ` Andrew Morton
  1 sibling, 1 reply; 134+ messages in thread
From: Shailabh Nagar @ 2006-07-04  0:54 UTC (permalink / raw)
  To: hadi
  Cc: Andrew Morton, pj, Valdis.Kletnieks, jlan, balbir, csturtiv,
	linux-kernel, netdev

Shailabh Nagar wrote:

> Andrew Morton wrote:
>
>> On Fri, 30 Jun 2006 23:37:10 -0400
>> Shailabh Nagar <nagar@watson.ibm.com> wrote:
>>
>>  
>>
>>>> Set aside the implementation details and ask "what is a good design"?
>>>>
>>>> A kernel-wide constant, whether determined at build-time or by a 
>>>> /proc poke
>>>> isn't a nice design.
>>>>
>>>> Can we permit userspace to send in a netlink message describing a 
>>>> cpumask? That's back-compatible.
>>>>
>>>>
>>>>     
>>>
>>> Yes, that should be doable. And passing in a cpumask is much better 
>>> since we no longer
>>> have to maintain mappings.
>>>
>>> So the strawman is:
>>> Listener bind()s to genetlink using its real pid.
>>> Sends a separate "registration" message with cpumask to listen to. 
>>> Kernel stores (real) pid and cpumask.
>>> During task exit, kernel goes through each registered listener 
>>> (small list) and decides which
>>> one needs to get this exit data and calls a genetlink_unicast to 
>>> each one that does need it.
>>>
>>> If number of listeners is small, the lookups should be swift enough. 
>>> If it grows large, we
>>> can consider a fancier lookup (but there I go again, delving into 
>>> implementation too early :-)
>>>   
>>
>>
>> We'll need a map.
>>
>> 1024 CPUs, 1024 listeners, 1000 exits/sec/CPU and we're up to a million
>> operations per second per CPU.  Meltdown.
>>
>> But it's a pretty simple map.  A per-cpu array of pointers to the 
>> head of a
>> linked list.  One lock for each CPU's list.
>>  
>>
> Here's a patch that implements the above ideas.
>
> A listener register's interest by specifying a cpumask in the
> cpulist format (comma separated ranges of cpus). The listener's pid
> is entered into per-cpu lists for those cpus and exit events from those
> cpus go to the listeners using netlink unicasts.
>
> Please comment.
>
> Andrew, this is not being proposed for inclusion yet since there is 
> atleast one more issue that needs to be resolved:
>
> What happens when a listener exits without doing deregistration
> (or if the listener attempts to register another cpumask while a current
> registration is still active).
>
( Jamal, your thoughts on this problem would be appreciated)

Problem is that we have a listener task which has "registered" with 
taskstats and caused
its pid to be stored in various per-cpu lists of listeners. Later, when 
some other task exits on a given cpu, its exit data is sent using 
genlmsg_unicast on each pid present on that cpu's list.

If the listener exits without doing a "deregister", its pid continues to 
be kept around, obviously not a good thing. So we need some way of 
detecting the situation (task is no longer listening on
these cpus events) that is efficient.

Two solutions come to mind:

1. During the exit of every task, check to see if it is already
"registered" with taskstats. If so, do a cleanup of its pid on various 
per-cpu lists.

2. Before doing a genlmsg_unicast to a pid on one of the per-cpu lists,
check whether that pid is still listening (or detect that genlmsg_unicast
fails with -ECONNREFUSED, a result of netlink_lookup failing for that
pid). If it is not, just delete
it from that cpu's list and continue.

1 is more desirable because it's the right place to catch this and 
happens relatively rarely
(few listener exits compared to all exits). However, how can we check 
whether a task/pid
has registered with taskstats earlier ? Again, two possibilities
- Maintain a list of registered listeners within taskstats and check that.
- try to leverage netlink's nl_pid_hash which maintains the same kind of 
info for each protocol.
Thus a netlink_lookup of the pid would save a lot of work.
However, the netlink layer's hashtable appears to be for the entire 
NETLINK_GENERIC
protocol and not just for the taskstats client of NETLINK_GENERIC. So 
even if a task has
deregistered with taskstats, as long as it has some other 
NETLINK_GENERIC socket open,
it will still show up as "connected" as far as netlink is concerned.

Jamal - is my interpretation correct ? Do I need to essentially 
replicate the pidhash at the
taskstats layer ? Thoughts on whether there's any way genetlink can 
provide support for this or
whether its desirable etc. (we appear to be the second user of genetlink 
- this may not be a
common need going forward).

1 has the disadvantage that if such a situation is detected, one has to 
iterate over all cpus in
the system, deleting that pid from any per-cpu list it happens to be in.
One could store the cpumask that the listener originally used to 
optimize this search. usual tradeoff of storage vs. time.

2 avoids the problem just mentioned since it delegates the task of 
cleanup to each cpu at the cost
of incurring an extra check for each listener for each exit on that cpu.
By storing the task_struct instead of the pid in the per-cpu lists, the 
check can be made quite
cheap.
But one problem with 2 is the issue of recycled task_structs and pids. 
Since the stale task on the
per-cpu listener list could have exited a while back, its possible its 
alive at the time of the check
and has even registered with a different interest list ! So it'll 
receive events it didn't register for.
I guess this again calls for us to maintain the listener list within 
taskstats explicitly (solution 1)
and explicitly catch the exit of the task/pid.

Thoughts ?

--Shailabh






^ permalink raw reply	[flat|nested] 134+ messages in thread

* Re: [Patch][RFC] Disabling per-tgid stats on task exit in taskstats
  2006-07-04  0:54                                                                         ` Shailabh Nagar
@ 2006-07-04  1:01                                                                           ` Andrew Morton
  2006-07-04 13:05                                                                             ` jamal
  0 siblings, 1 reply; 134+ messages in thread
From: Andrew Morton @ 2006-07-04  1:01 UTC (permalink / raw)
  To: Shailabh Nagar
  Cc: hadi, pj, Valdis.Kletnieks, jlan, balbir, csturtiv, linux-kernel, netdev

On Mon, 03 Jul 2006 20:54:37 -0400
Shailabh Nagar <nagar@watson.ibm.com> wrote:

> > What happens when a listener exits without doing deregistration
> > (or if the listener attempts to register another cpumask while a current
> > registration is still active).
> >
> ( Jamal, your thoughts on this problem would be appreciated)
> 
> Problem is that we have a listener task which has "registered" with 
> taskstats and caused
> its pid to be stored in various per-cpu lists of listeners. Later, when 
> some other task exits on a given cpu, its exit data is sent using 
> genlmsg_unicast on each pid present on that cpu's list.
> 
> If the listener exits without doing a "deregister", its pid continues to 
> be kept around, obviously not a good thing. So we need some way of 
> detecting the situation (task is no longer listening on
> these cpus events) that is efficient.

Also need to address the case where the listener has closed off his file
descriptor but continues to run.

So hooking into listener's exit() isn't appropriate - the teardown is
associated with the lifetime of the fd, not of the process.  If we do that,
exit() gets handled for free.  

^ permalink raw reply	[flat|nested] 134+ messages in thread

* Re: [Patch][RFC] Disabling per-tgid stats on task exit in taskstats
  2006-07-04  1:01                                                                           ` Andrew Morton
@ 2006-07-04 13:05                                                                             ` jamal
  2006-07-04 15:18                                                                               ` Shailabh Nagar
  0 siblings, 1 reply; 134+ messages in thread
From: jamal @ 2006-07-04 13:05 UTC (permalink / raw)
  To: Andrew Morton
  Cc: netdev, linux-kernel, csturtiv, balbir, jlan, Valdis.Kletnieks,
	pj, Shailabh Nagar

On Mon, 2006-03-07 at 18:01 -0700, Andrew Morton wrote:
> On Mon, 03 Jul 2006 20:54:37 -0400
> Shailabh Nagar <nagar@watson.ibm.com> wrote:
> 
> > > What happens when a listener exits without doing deregistration
> > > (or if the listener attempts to register another cpumask while a current
> > > registration is still active).
> > >
> > ( Jamal, your thoughts on this problem would be appreciated)
> > 
> > Problem is that we have a listener task which has "registered" with 
> > taskstats and caused
> > its pid to be stored in various per-cpu lists of listeners. Later, when 
> > some other task exits on a given cpu, its exit data is sent using 
> > genlmsg_unicast on each pid present on that cpu's list.
> > 
> > If the listener exits without doing a "deregister", its pid continues to 
> > be kept around, obviously not a good thing. So we need some way of 
> > detecting the situation (task is no longer listening on
> > these cpus events) that is efficient.
> 
> Also need to address the case where the listener has closed off his file
> descriptor but continues to run.
> 
> So hooking into listener's exit() isn't appropriate - the teardown is
> associated with the lifetime of the fd, not of the process.  If we do that,
> exit() gets handled for free.  

If you are always going to send unicast messages, then  -ECONNREFUSED
will tell you the listener has closed their fd - this doesn't mean it
has exited. Besides that one process could open several sockets. I know
that would not be the app you would write - but it doesnt stop other
people from doing it.
I think i may not follow what you are doing - for some reason i thought
you may have many listeners in user space and these messages get
multicast to them?
Does the user space program somehow communicate its pid to the kernel?

cheers,
jamal


^ permalink raw reply	[flat|nested] 134+ messages in thread

* Re: [Patch][RFC] Disabling per-tgid stats on task exit in taskstats
  2006-07-04 13:05                                                                             ` jamal
@ 2006-07-04 15:18                                                                               ` Shailabh Nagar
  2006-07-04 16:37                                                                                 ` Shailabh Nagar
  0 siblings, 1 reply; 134+ messages in thread
From: Shailabh Nagar @ 2006-07-04 15:18 UTC (permalink / raw)
  To: hadi
  Cc: Andrew Morton, netdev, linux-kernel, csturtiv, balbir, jlan,
	Valdis.Kletnieks, pj

jamal wrote:
> On Mon, 2006-03-07 at 18:01 -0700, Andrew Morton wrote:
> 
>>On Mon, 03 Jul 2006 20:54:37 -0400
>>Shailabh Nagar <nagar@watson.ibm.com> wrote:
>>
>>
>>>>What happens when a listener exits without doing deregistration
>>>>(or if the listener attempts to register another cpumask while a current
>>>>registration is still active).
>>>>
>>>
>>>( Jamal, your thoughts on this problem would be appreciated)
>>>
>>>Problem is that we have a listener task which has "registered" with 
>>>taskstats and caused
>>>its pid to be stored in various per-cpu lists of listeners. Later, when 
>>>some other task exits on a given cpu, its exit data is sent using 
>>>genlmsg_unicast on each pid present on that cpu's list.
>>>
>>>If the listener exits without doing a "deregister", its pid continues to 
>>>be kept around, obviously not a good thing. So we need some way of 
>>>detecting the situation (task is no longer listening on
>>>these cpus events) that is efficient.
>>
>>Also need to address the case where the listener has closed off his file
>>descriptor but continues to run.
>>
>>So hooking into listener's exit() isn't appropriate - the teardown is
>>associated with the lifetime of the fd, not of the process.  If we do that,
>>exit() gets handled for free.  
> 
> 
> If you are always going to send unicast messages, then  -ECONNREFUSED
> will tell you the listener has closed their fd - this doesnt meant it
> has exited. 

That's good. So we have at least one way of detecting the "closed fd without
deregistering" within taskstats itself.

> Besides that one process could open several sockets. I know
> that would not be the app you would write - but it doesnt stop other
> people from doing it.

As far as API is concerned, even a taskstats listener is not being
prevented from opening multiple sockets. As Andrew also pointed out,
everything needs to be done per-socket.

> I think i may not follow what you are doing - for some reason i thought
> you may have many listeners in user space and these messages get
> multicast to them?

That was the design earlier. In the past week, the design has changed to
one where there are still many listeners in user space but messages
get unicast to each of them. Earlier listeners would get messages generated
on task exit from every cpu, now they get it only from cpus for which
they have explicitly registered interest (via a cpumask passed in through
another genetlink command).

> Does the user space program somehow communicate its pid to the kernel?

Yes. When the listener registers interest in a set of cpus, as described
above, its (genl_info->pid) is being stored in the per-cpu list of
listeners for those cpus. When a task exits on one of those cpus, the
exit data is only sent via genetlink_unicast to those pids
(really, nl_pids) who are on that cpu's listener list.


Now that I think more about it, netlink is really maintaining a pidhash
of nl_pids, not process pids, right ? So if one userapp were to open
multiple sockets using NETLINK_GENERIC protocol (regardless of how many
of those are for the taskstats), each of them would have to use a
different nl_pid. Hence, it would be valid for the taskstats layer to use 
netlink_lookup() at any time to see if the corresponding socket were
closed ?


--Shailabh





^ permalink raw reply	[flat|nested] 134+ messages in thread

* Re: [Patch][RFC] Disabling per-tgid stats on task exit in taskstats
  2006-07-04 15:18                                                                               ` Shailabh Nagar
@ 2006-07-04 16:37                                                                                 ` Shailabh Nagar
  2006-07-04 19:24                                                                                   ` jamal
  0 siblings, 1 reply; 134+ messages in thread
From: Shailabh Nagar @ 2006-07-04 16:37 UTC (permalink / raw)
  To: Shailabh Nagar
  Cc: hadi, Andrew Morton, netdev, linux-kernel, csturtiv, balbir,
	jlan, Valdis.Kletnieks, pj

Shailabh Nagar wrote:
> jamal wrote:
> 
>> On Mon, 2006-03-07 at 18:01 -0700, Andrew Morton wrote:
>>
>>> On Mon, 03 Jul 2006 20:54:37 -0400
>>> Shailabh Nagar <nagar@watson.ibm.com> wrote:
>>>
>>>
>>>>> What happens when a listener exits without doing deregistration
>>>>> (or if the listener attempts to register another cpumask while a 
>>>>> current
>>>>> registration is still active).
>>>>>
>>>>
>>>> ( Jamal, your thoughts on this problem would be appreciated)
>>>>
>>>> Problem is that we have a listener task which has "registered" with 
>>>> taskstats and caused
>>>> its pid to be stored in various per-cpu lists of listeners. Later, 
>>>> when some other task exits on a given cpu, its exit data is sent 
>>>> using genlmsg_unicast on each pid present on that cpu's list.
>>>>
>>>> If the listener exits without doing a "deregister", its pid 
>>>> continues to be kept around, obviously not a good thing. So we need 
>>>> some way of detecting the situation (task is no longer listening on
>>>> these cpus events) that is efficient.
>>>
>>>
>>> Also need to address the case where the listener has closed off his file
>>> descriptor but continues to run.
>>>
>>> So hooking into listener's exit() isn't appropriate - the teardown is
>>> associated with the lifetime of the fd, not of the process.  If we do 
>>> that,
>>> exit() gets handled for free.  
>>
>>
>>
>> If you are always going to send unicast messages, then  -ECONNREFUSED
>> will tell you the listener has closed their fd - this doesnt meant it
>> has exited. 
> 
> 
> Thats good. So we have atleast one way of detecting the "closed fd without
> deregistering" within taskstats itself.
> 
>> Besides that one process could open several sockets. I know
>> that would not be the app you would write - but it doesnt stop other
>> people from doing it.
> 
> 
> As far as API is concerned, even a taskstats listener is not being
> prevented from opening multiple sockets. As Andrew also pointed out,
> everything needs to be done per-socket.
> 
>> I think i may not follow what you are doing - for some reason i thought
>> you may have many listeners in user space and these messages get
>> multicast to them?
> 
> 
> That was the design earlier. In the past week, the design has changed to
> one where there are still many listeners in user space but messages
> get unicast to each of them. Earlier listeners would get messages generated
> on task exit from every cpu, now they get it only from cpus for which
> they have explicitly registered interest (via a cpumask passed in through
> another genetlink command).
> 
>> Does the user space program somehow communicate its pid to the kernel?
> 
> 
> Yes. When the listener registers interest in a set of cpus, as described
> above, its (genl_info->pid) is being stored in the per-cpu list of
> listeners for those cpus. When a task exits on one of those cpus, the
> exit data is only sent via genetlink_unicast to those pids
> (really, nl_pids) who are on that cpu's listener list.
> 
> 
> Now that I think more about it, netlink is really maintaining a pidhash
> of nl_pids, not process pids, right ? So if one userapp were to open
> multiple sockets using NETLINK_GENERIC protocol (regardless of how many
> of those are for the taskstats), each of them would have to use a
> different nl_pid. Hence, it would be valid for the taskstats layer to 
> use netlink_lookup() at any time to see if the corresponding socket were
> closed ?
> 

Here's a strawman for the problem we're trying to solve: get
notification of the close of a NETLINK_GENERIC socket that had
been used to register interest for some cpus within taskstats.

 From looking at the netlink code, the way to go seems to be

- taskstats maintains a pidhash of nl_pids that are currently
registered to listen to at least one cpu. It also stores the
cpumask used.
- taskstats registers a notifier block within netlink_chain
and receives a callback on the NETLINK_URELEASE event, similar
to drivers/scsi/scsi_transport_iscsi.c: iscsi_rcv_nl_event()

- the callback checks to see that the protocol is NETLINK_GENERIC
and that the nl_pid for the socket is in taskstat's pidhash. If so, it
does a cleanup using the stored cpumask and releases the nl_pid
from the pidhash.

We can even do away with the deregister command altogether and
simply rely on this autocleanup.

--Shailabh

^ permalink raw reply	[flat|nested] 134+ messages in thread

* Re: [Patch][RFC] Disabling per-tgid stats on task exit in taskstats
  2006-07-04 16:37                                                                                 ` Shailabh Nagar
@ 2006-07-04 19:24                                                                                   ` jamal
  2006-07-05 14:09                                                                                     ` Shailabh Nagar
  0 siblings, 1 reply; 134+ messages in thread
From: jamal @ 2006-07-04 19:24 UTC (permalink / raw)
  To: Shailabh Nagar
  Cc: pj, Valdis.Kletnieks, jlan, balbir, csturtiv, linux-kernel,
	netdev, Andrew Morton

Shailabh,

On Tue, 2006-04-07 at 12:37 -0400, Shailabh Nagar wrote:
[..]
> Here's a strawman for the problem we're trying to solve: get
> notification of the close of a NETLINK_GENERIC socket that had
> been used to register interest for some cpus within taskstats.
> 
>  From looking at the netlink code, the way to go seems to be
> 
> - it maintains a pidhash of nl_pids that are currently
> registered to listen to atleast one cpu. It also stores the
> cpumask used.
> - taskstats registers a notifier block within netlink_chain
> and receives a callback on the NETLINK_URELEASE event, similar
> to drivers/scsci/scsi_transport_iscsi.c: iscsi_rcv_nl_event()
> 
> - the callback checks to see that the protocol is NETLINK_GENERIC
> and that the nl_pid for the socket is in taskstat's pidhash. If so, it
> does a cleanup using the stored cpumask and releases the nl_pid
> from the pidhash.
> 

Sounds quite reasonable.  I am beginning to wonder whether we should do
the NETLINK_URELEASE in general for NETLINK_GENERIC

> We can even do away with the deregister command altogether and
> simply rely on this autocleanup.

I think you may still need the register if you are going to allow
multiple sockets per listener process, no?
The other question is how do you correlate pid -> fd?

cheers,
jamal




^ permalink raw reply	[flat|nested] 134+ messages in thread

* Re: [Patch][RFC] Disabling per-tgid stats on task exit in taskstats
  2006-07-04  0:09                                                                           ` Shailabh Nagar
@ 2006-07-04 19:59                                                                             ` Paul Jackson
  0 siblings, 0 replies; 134+ messages in thread
From: Paul Jackson @ 2006-07-04 19:59 UTC (permalink / raw)
  To: Shailabh Nagar
  Cc: akpm, Valdis.Kletnieks, jlan, balbir, csturtiv, linux-kernel,
	hadi, netdev

Shailabh wrote:
> Perhaps I should use the the other ascii format for specifying cpumasks 
> since its more amenable
> to specifying an upper bound for the length of the ascii string and is 
> more compact ?

Eh - basically - I don't have a strong opinion either way.

I have a slight esthetic preference toward using list of ranges format
from shell scripts and shell prompts, and using the 32-bit hex words
from C code:

	17-26,44-47		# shell - list of ranges
	0000f000,07fe0000	# C - 32-bit hex words

Since the primary interface you are working with is C code, that would
mean I'd slightly prefer the 32-bit hex word variant.

From what I've seen neither of the reasons you gave for preferring
the 32-bit hex word format are persuasive (even though they both
lead to the same conclusion as I preferred ;):

    Which is more compact depends on that particular bit pattern
    you need to represent.  See for example the examples above.

    The lack of a perfect upper bound on the list of ranges format
    is a theoretical problem that I have never seen in practice.
    Only pathological constructs exceed six ascii characters per
    set bit.

-- 
                  I won't rest till it's the best ...
                  Programmer, Linux Scalability
                  Paul Jackson <pj@sgi.com> 1.925.600.0401

^ permalink raw reply	[flat|nested] 134+ messages in thread

* Re: [Patch][RFC] Disabling per-tgid stats on task exit in taskstats
  2006-07-03 21:41                                                                         ` Andrew Morton
  2006-07-04  0:13                                                                           ` Shailabh Nagar
@ 2006-07-04 20:19                                                                           ` Paul Jackson
  2006-07-04 20:22                                                                             ` Paul Jackson
  1 sibling, 1 reply; 134+ messages in thread
From: Paul Jackson @ 2006-07-04 20:19 UTC (permalink / raw)
  To: Andrew Morton
  Cc: nagar, Valdis.Kletnieks, jlan, balbir, csturtiv, linux-kernel,
	hadi, netdev

Andrew wrote:
> OK, so we're passing in an ASCII string.  Fair enough, I think.  Paul would
> know better.

Not sure if I know better - just got stronger opinions.

I like the ASCII here - but this is one of those "he who
writes the code gets to 

-- 
                  I won't rest till it's the best ...
                  Programmer, Linux Scalability
                  Paul Jackson <pj@sgi.com> 1.925.600.0401

^ permalink raw reply	[flat|nested] 134+ messages in thread

* Re: [Patch][RFC] Disabling per-tgid stats on task exit in taskstats
  2006-07-04 20:19                                                                           ` Paul Jackson
@ 2006-07-04 20:22                                                                             ` Paul Jackson
  0 siblings, 0 replies; 134+ messages in thread
From: Paul Jackson @ 2006-07-04 20:22 UTC (permalink / raw)
  To: Paul Jackson
  Cc: akpm, nagar, Valdis.Kletnieks, jlan, balbir, csturtiv,
	linux-kernel, hadi, netdev

pj wrote:
> writes the code gets to 

Never mind that last incomplete post - I hit Send
when I meant to hit Cancel.

-- 
                  I won't rest till it's the best ...
                  Programmer, Linux Scalability
                  Paul Jackson <pj@sgi.com> 1.925.600.0401

^ permalink raw reply	[flat|nested] 134+ messages in thread

* Re: [Patch][RFC] Disabling per-tgid stats on task exit in taskstats
  2006-07-04 19:24                                                                                   ` jamal
@ 2006-07-05 14:09                                                                                     ` Shailabh Nagar
  2006-07-05 20:25                                                                                       ` Chris Sturtivant
  0 siblings, 1 reply; 134+ messages in thread
From: Shailabh Nagar @ 2006-07-05 14:09 UTC (permalink / raw)
  To: hadi
  Cc: pj, Valdis.Kletnieks, jlan, balbir, csturtiv, linux-kernel,
	netdev, Andrew Morton

jamal wrote:
> Shailabh,
> 
> On Tue, 2006-04-07 at 12:37 -0400, Shailabh Nagar wrote:
> [..]
> 
>>Here's a strawman for the problem we're trying to solve: get
>>notification of the close of a NETLINK_GENERIC socket that had
>>been used to register interest for some cpus within taskstats.
>>
>> From looking at the netlink code, the way to go seems to be
>>
>>- it maintains a pidhash of nl_pids that are currently
>>registered to listen to atleast one cpu. It also stores the
>>cpumask used.
>>- taskstats registers a notifier block within netlink_chain
>>and receives a callback on the NETLINK_URELEASE event, similar
>>to drivers/scsci/scsi_transport_iscsi.c: iscsi_rcv_nl_event()
>>
>>- the callback checks to see that the protocol is NETLINK_GENERIC
>>and that the nl_pid for the socket is in taskstat's pidhash. If so, it
>>does a cleanup using the stored cpumask and releases the nl_pid
>>from the pidhash.
>>
> 
> 
> Sound quiet reasonable.  I am beginning to wonder whether we should do 
> do the NETLINK_URELEASE in general for NETLINK_GENERIC

I'd initially thought that might be useful but since NETLINK_GENERIC
is only "virtually" multiplexing the sockfd amongst each of its users,
I don't know what benefits a generic notifier at NETLINK_GENERIC layer
would bring (as opposed to each NETLINK_GENERIC user directly registering
its callback with netlink). Perhaps simplicity ?


>>We can even do away with the deregister command altogether and
>>simply rely on this autocleanup.
> 
> 
> I think if you may still need the register if you are going to allow
> multiple sockets per listener process, no?

The register command, yes. But an explicit deregister, as opposed to
auto cleanup on fd close, may not be used all that much :-)

> The other question is how do you correlate pid -> fd?

For the notifier callback, I thought netlink_release will
provide the nl_pid corresponding to the fd being closed ?
I can just do a search for that nl_pid in the taskstats-private pidhash.

The nl_pid gets into the pidhash using the genl_info->pid field
when the listener issues the register command.

Will that be correct ?

So here's the sequence of pids being used/hashed etc. Please let
me know if my assumptions are correct ?

1. Same listener thread opens 2 sockets

On sockfd1, does a bind() using
	sockaddr_nl.nl_pid = my_pid1
On sockfd2, does a bind() using
	sockaddr_nl.nl_pid = my_pid2

(one of the my_pids could be its process pid but doesn't have to be)

2. Listener supplies cpumasks on each of the sockets through a
register command sent on sockfd1.

In the kernel, when the command is received,
the genl_info->pid field contains my_pid1

my_pid1 is stored in a pidhash along with the corresponding cpumask.

cpumask is used to store the my_pid1 into per-cpu lists for each
cpu in the mask.

3. When an exit event happens on one of those cpus in the mask,
it is sent to this listener using
	genlmsg_unicast(...., my_pid1)


4. When the listener closes sockfd1, netlink_release() gets called
and that calls a taskstats notifier callback (say taskstats_cb) with
	struct netlink_notify n =
	{ .protocol =  NETLINK_GENERIC, .pid = my_pid1 }

and using the .pid within, taskstats_cb can do a lookup within its
pidhash. If its present, use the cpumask stored alongside to go
clean up my_pid1 stored in the listener list of each cpu in the mask.


--Shailabh

	



^ permalink raw reply	[flat|nested] 134+ messages in thread

* Re: [Patch][RFC] Disabling per-tgid stats on task exit in taskstats
  2006-07-03 15:02                                                                       ` Shailabh Nagar
  2006-07-03 15:55                                                                         ` Paul Jackson
  2006-07-03 16:31                                                                         ` Paul Jackson
@ 2006-07-05 17:20                                                                         ` Jay Lan
  2006-07-05 18:18                                                                           ` Shailabh Nagar
  2 siblings, 1 reply; 134+ messages in thread
From: Jay Lan @ 2006-07-05 17:20 UTC (permalink / raw)
  To: Shailabh Nagar
  Cc: Paul Jackson, akpm, Valdis.Kletnieks, balbir, csturtiv,
	linux-kernel, hadi, netdev

Shailabh Nagar wrote:

> Yes. If no one registers to listen on a particular CPU, data from tasks
> exiting on that cpu is not sent out at all.

Shailabh also wrote:

> During task exit, kernel goes through each registered listener (small 
> list) and decides which
> one needs to get this exit data and calls a genetlink_unicast to each 
> one that does need it.


Are we eliminating multicast taskstats data at exit time? A unicast
exit data with cpumask will do for me, but just like to be sure where
we are.

Thanks,
 - jay


^ permalink raw reply	[flat|nested] 134+ messages in thread

* Re: [Patch][RFC] Disabling per-tgid stats on task exit in taskstats
  2006-07-05 17:20                                                                         ` Jay Lan
@ 2006-07-05 18:18                                                                           ` Shailabh Nagar
  0 siblings, 0 replies; 134+ messages in thread
From: Shailabh Nagar @ 2006-07-05 18:18 UTC (permalink / raw)
  To: Jay Lan
  Cc: Paul Jackson, akpm, Valdis.Kletnieks, balbir, csturtiv,
	linux-kernel, hadi, netdev

Jay Lan wrote:
> Shailabh Nagar wrote:
> 
> 
>>Yes. If no one registers to listen on a particular CPU, data from tasks
>>exiting on that cpu is not sent out at all.
> 
> 
> Shailabh also wrote:
> 
> 
>>During task exit, kernel goes through each registered listener (small 
>>list) and decides which
>>one needs to get this exit data and calls a genetlink_unicast to each 
>>one that does need it.
> 
> 
> 
> Are we eliminating multicast taskstats data at exit time? 

Yes. Only unicasts to each listener now.

> A unicast
> exit data with cpumask will do for me, but just like to be sure where
> we are.


^ permalink raw reply	[flat|nested] 134+ messages in thread

* Re: [Patch][RFC] Disabling per-tgid stats on task exit in taskstats
  2006-07-05 14:09                                                                                     ` Shailabh Nagar
@ 2006-07-05 20:25                                                                                       ` Chris Sturtivant
  2006-07-05 20:32                                                                                         ` Shailabh Nagar
  0 siblings, 1 reply; 134+ messages in thread
From: Chris Sturtivant @ 2006-07-05 20:25 UTC (permalink / raw)
  To: Shailabh Nagar
  Cc: hadi, pj, Valdis.Kletnieks, jlan, balbir, linux-kernel, netdev,
	Andrew Morton

Shailabh Nagar wrote:
> So here's the sequence of pids being used/hashed etc. Please let
> me know if my assumptions are correct ?
>
> 1. Same listener thread opens 2 sockets
>
> On sockfd1, does a bind() using
> 	sockaddr_nl.nl_pid = my_pid1
> On sockfd2, does a bind() using
> 	sockaddr_nl.nl_pid = my_pid2
>
> (one of my_pid1's could be its process pid but doesn't have to be)
>   

For CSA, we are proposing to use a single (multi-threaded) daemon that 
combines both the userland components for job and CSA that used to be in 
the kernel.  In this case, the pid will be the same for two connections 
along with the cpu range.  Does what you're saying here mean that we 
should choose distinct values for my_pid1 and my_pid2 to avoid the two 
sockets looking the same?  I'm not too familiar with netlink, yet.

Best regards,


--Chris

-- 
-----------------------------------------------------------------
Chris Sturtivant, PhD,
Linux System Software,
SGI
(650) 933-1703
-----------------------------------------------------------------


^ permalink raw reply	[flat|nested] 134+ messages in thread

* Re: [Patch][RFC] Disabling per-tgid stats on task exit in taskstats
  2006-07-05 20:25                                                                                       ` Chris Sturtivant
@ 2006-07-05 20:32                                                                                         ` Shailabh Nagar
  0 siblings, 0 replies; 134+ messages in thread
From: Shailabh Nagar @ 2006-07-05 20:32 UTC (permalink / raw)
  To: Chris Sturtivant
  Cc: hadi, pj, Valdis.Kletnieks, jlan, balbir, linux-kernel, netdev,
	Andrew Morton

Chris Sturtivant wrote:
> Shailabh Nagar wrote:
> 
>> So here's the sequence of pids being used/hashed etc. Please let
>> me know if my assumptions are correct ?
>>
>> 1. Same listener thread opens 2 sockets
>>
>> On sockfd1, does a bind() using
>>     sockaddr_nl.nl_pid = my_pid1
>> On sockfd2, does a bind() using
>>     sockaddr_nl.nl_pid = my_pid2
>>
>> (one of my_pid1's could be its process pid but doesn't have to be)
>>   
> 
> 
> For CSA, we are proposing to use a single (multi-threaded) daemon that
> combines both the userland components for job and CSA that used to be in
> the kernel.  In this case, the pid will be the same for two connections
> along with the cpu range.  Does what you're saying here mean that we
> should choose distinct values for my_pid1 and my_pid2 to avoid the two
> sockets looking the same? 

Yes, that is my understanding and also what's mentioned in the bind()
section in
http://www.linuxjournal.com/article/7356

though I've yet to try it out myself (will do so shortly after
making the other suggested changes to the basic patch)

--Shailabh

> I'm not too familiar with netlink, yet.
> 
> Best regards,
> 
> 
> --Chris
> 


^ permalink raw reply	[flat|nested] 134+ messages in thread

end of thread, other threads:[~2006-07-05 20:33 UTC | newest]

Thread overview: 134+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2006-06-09  7:41 [Patch][RFC] Disabling per-tgid stats on task exit in taskstats Shailabh Nagar
2006-06-09  8:00 ` Andrew Morton
2006-06-09 10:51   ` Balbir Singh
2006-06-09 11:21     ` Andrew Morton
2006-06-09 13:20       ` Shailabh Nagar
2006-06-09 18:25         ` Jay Lan
2006-06-09 19:12           ` Shailabh Nagar
2006-06-09 15:36       ` Balbir Singh
2006-06-09 18:35         ` Jay Lan
2006-06-09 19:31           ` Shailabh Nagar
2006-06-09 21:56       ` Shailabh Nagar
2006-06-09 22:42         ` Jay Lan
2006-06-09 23:22           ` Andrew Morton
2006-06-09 23:47             ` Jay Lan
2006-06-09 23:56               ` Andrew Morton
2006-06-10 12:21               ` Shailabh Nagar
2006-06-12 18:31                 ` Jay Lan
2006-06-12 21:57                   ` Shailabh Nagar
2006-06-10 13:05               ` Shailabh Nagar
2006-06-12 18:54                 ` Jay Lan
2006-06-21 19:11         ` Jay Lan
2006-06-21 19:14           ` Jay Lan
2006-06-21 19:34             ` Shailabh Nagar
2006-06-21 23:35               ` Jay Lan
2006-06-21 23:45                 ` Shailabh Nagar
2006-06-23 17:14                 ` Shailabh Nagar
2006-06-23 18:19                   ` Jay Lan
2006-06-23 18:53                     ` Shailabh Nagar
2006-06-23 20:00                       ` Jay Lan
2006-06-23 20:16                         ` Shailabh Nagar
2006-06-23 20:36                           ` Jay Lan
2006-06-23 21:19                   ` Andrew Morton
2006-06-23 22:07                     ` Jay Lan
2006-06-23 23:47                       ` Andrew Morton
2006-06-24  2:59                         ` Shailabh Nagar
2006-06-24  4:39                           ` Andrew Morton
2006-06-24  5:59                             ` Shailabh Nagar
2006-06-26 17:33                               ` Jay Lan
2006-06-26 17:52                                 ` Shailabh Nagar
2006-06-26 17:55                                 ` Andrew Morton
2006-06-26 18:00                                   ` Shailabh Nagar
2006-06-26 18:12                                     ` Andrew Morton
2006-06-26 18:26                                       ` Jay Lan
2006-06-26 18:39                                         ` Andrew Morton
2006-06-26 18:49                                           ` Shailabh Nagar
2006-06-26 19:00                                           ` Jay Lan
2006-06-28 21:30                                           ` Jay Lan
2006-06-28 21:53                                             ` Andrew Morton
2006-06-28 22:02                                               ` Jay Lan
2006-06-29  8:40                                                 ` Paul Jackson
2006-06-29 12:30                                                   ` Valdis.Kletnieks
2006-06-29 16:44                                                     ` Paul Jackson
2006-06-29 18:01                                                       ` Andrew Morton
2006-06-29 18:07                                                         ` Paul Jackson
2006-06-29 18:26                                                         ` Paul Jackson
2006-06-29 19:15                                                           ` Shailabh Nagar
2006-06-29 19:41                                                             ` Paul Jackson
2006-06-29 21:42                                                               ` Shailabh Nagar
2006-06-29 21:54                                                                 ` Jay Lan
2006-06-29 22:09                                                                   ` Shailabh Nagar
2006-06-29 22:23                                                                 ` Paul Jackson
2006-06-30  0:15                                                                   ` Shailabh Nagar
2006-06-30  0:40                                                                     ` Paul Jackson
2006-06-30  1:00                                                                       ` Shailabh Nagar
2006-06-30  1:05                                                                         ` Paul Jackson
     [not found]                                                                   ` <44A46C6C.1090405@watson.ibm.com>
2006-06-30  0:38                                                                     ` Paul Jackson
2006-06-30  2:21                                                                       ` Paul Jackson
2006-06-30  2:46                                                                         ` Shailabh Nagar
2006-06-30  2:54                                                                           ` Paul Jackson
2006-06-30  3:02                                                                           ` Paul Jackson
2006-06-29 19:22                                                           ` Shailabh Nagar
2006-06-29 19:10                                                         ` Shailabh Nagar
2006-06-29 19:23                                                           ` Paul Jackson
2006-06-29 19:33                                                           ` Andrew Morton
2006-06-29 19:43                                                             ` Shailabh Nagar
2006-06-29 20:00                                                               ` Andrew Morton
2006-06-29 22:13                                                                 ` Shailabh Nagar
2006-06-29 23:00                                                                   ` jamal
2006-06-29 20:01                                                             ` Shailabh Nagar
2006-06-29 21:22                                                               ` Paul Jackson
2006-06-29 22:54                                                               ` jamal
2006-06-30  0:38                                                                 ` Shailabh Nagar
2006-06-30  1:05                                                                   ` Andrew Morton
2006-06-30  1:11                                                                     ` Shailabh Nagar
2006-06-30  1:30                                                                       ` jamal
2006-06-30  3:01                                                                         ` Shailabh Nagar
2006-06-30 12:45                                                                           ` jamal
2006-06-30  2:25                                                                     ` Paul Jackson
2006-06-30  2:35                                                                       ` Andrew Morton
2006-06-30  2:43                                                                         ` Paul Jackson
2006-06-29 19:33                                                         ` Jay Lan
2006-06-30 18:53                                                         ` Shailabh Nagar
2006-06-30 19:10                                                           ` Shailabh Nagar
2006-06-30 19:19                                                             ` Shailabh Nagar
2006-06-30 20:19                                                             ` jamal
2006-06-30 22:50                                                             ` Andrew Morton
2006-07-01  2:20                                                               ` Shailabh Nagar
2006-07-01  2:43                                                                 ` Andrew Morton
2006-07-01  3:37                                                                   ` Shailabh Nagar
2006-07-01  3:51                                                                     ` Andrew Morton
2006-07-03 21:11                                                                       ` Shailabh Nagar
2006-07-03 21:41                                                                         ` Andrew Morton
2006-07-04  0:13                                                                           ` Shailabh Nagar
2006-07-04  0:38                                                                             ` Andrew Morton
2006-07-04 20:19                                                                           ` Paul Jackson
2006-07-04 20:22                                                                             ` Paul Jackson
2006-07-04  0:54                                                                         ` Shailabh Nagar
2006-07-04  1:01                                                                           ` Andrew Morton
2006-07-04 13:05                                                                             ` jamal
2006-07-04 15:18                                                                               ` Shailabh Nagar
2006-07-04 16:37                                                                                 ` Shailabh Nagar
2006-07-04 19:24                                                                                   ` jamal
2006-07-05 14:09                                                                                     ` Shailabh Nagar
2006-07-05 20:25                                                                                       ` Chris Sturtivant
2006-07-05 20:32                                                                                         ` Shailabh Nagar
2006-07-03  4:53                                                                     ` Paul Jackson
2006-07-03 15:02                                                                       ` Shailabh Nagar
2006-07-03 15:55                                                                         ` Paul Jackson
2006-07-03 16:31                                                                         ` Paul Jackson
2006-07-04  0:09                                                                           ` Shailabh Nagar
2006-07-04 19:59                                                                             ` Paul Jackson
2006-07-05 17:20                                                                         ` Jay Lan
2006-07-05 18:18                                                                           ` Shailabh Nagar
2006-06-30 22:56                                                           ` Andrew Morton
2006-06-29 18:05                                                       ` Nick Piggin
2006-06-29 12:42                                                 ` Shailabh Nagar
2006-06-24  3:08                     ` Shailabh Nagar
2006-06-21 20:38           ` Andrew Morton
2006-06-21 21:31             ` Shailabh Nagar
2006-06-21 21:45               ` Jay Lan
2006-06-21 21:54                 ` Andrew Morton
2006-06-21 22:19                   ` Jay Lan
2006-06-21 21:59                 ` Shailabh Nagar
2006-06-09 15:55 ` Chris Sturtivant

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).