* [PATCH] new bitmap list format (for cpusets)
@ 2004-08-05 10:08 Paul Jackson
From: Paul Jackson @ 2004-08-05 10:08 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Christoph Hellwig, Jack Steiner, Jesse Barnes, Sylvain Jeaugey,
	Dan Higgins, linux-kernel, Matthew Dobson, Simon Derr,
	Andi Kleen, lse-tech, Paul Jackson, Dimitri Sivanich

A bitmap print and parse format that provides lists of ranges of
numbers, to be first used by cpusets (next patch).

Cpusets provide a way to manage subsets of CPUs and Memory Nodes
for scheduling and memory placement, via a new virtual file system,
usually mounted at /dev/cpuset.  Manipulation of cpusets can be done
directly via this file system, from the shell.

However, manipulating 512 bit cpumasks or 256 bit nodemasks (which
will get bigger) via hex mask strings is painful for humans.

The intention is to provide a format for the cpu and memory mask files
in /dev/cpuset that will stand the test of time.  This format is
supported by a couple of new lib/bitmap.c routines, for printing and
parsing these strings.  Wrappers for cpumask and nodemask are provided.

See the embedded comments, below in the patch, for more details of
the format.  The input format supports adding or removing specified
cpus or nodes, as well as entirely rewriting the mask.
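
For illustration only (not part of the patch), here is a rough userspace
approximation of the output side of this format.  The helper below is
hypothetical and merely mimics what bscnl_emit()/bitmap_scnlistprintf()
in the patch produce:

/*
 * Illustrative userspace sketch of the list output format
 * (hypothetical helper; the real code is bscnl_emit() and
 * bitmap_scnlistprintf() in the patch below).
 */
#include <stdio.h>

/* Print the set bits of 'mask' as a "0,3-5,9" style list. */
static void print_list(unsigned long mask)
{
	int bit, rbot = -1, rtop = -1, first = 1;

	for (bit = 0; bit <= 32; bit++) {
		if (bit < 32 && (mask & (1UL << bit))) {
			if (rbot < 0)
				rbot = bit;	/* start a new range */
			rtop = bit;		/* extend the current range */
		} else if (rbot >= 0) {
			if (!first)
				printf(",");
			if (rbot == rtop)
				printf("%d", rbot);
			else
				printf("%d-%d", rbot, rtop);
			first = 0;
			rbot = -1;
		}
	}
	printf("\n");
}

int main(void)
{
	print_list(0x3BUL);	/* bits 0,1,3,4,5 -> prints "0-1,3-5" */
	return 0;
}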

 include/linux/bitmap.h   |    8 ++
 include/linux/cpumask.h  |   22 ++++++-
 include/linux/nodemask.h |   22 ++++++-
 lib/bitmap.c             |  142 +++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 189 insertions(+), 5 deletions(-)

Signed-off-by: Paul Jackson <pj@sgi.com>

Index: 2.6.8-rc2-mm2/include/linux/bitmap.h
===================================================================
--- 2.6.8-rc2-mm2.orig/include/linux/bitmap.h	2004-08-04 19:29:15.000000000 -0700
+++ 2.6.8-rc2-mm2/include/linux/bitmap.h	2004-08-04 19:41:10.000000000 -0700
@@ -41,7 +41,9 @@
  * bitmap_shift_right(dst, src, n, nbits)	*dst = *src >> n
  * bitmap_shift_left(dst, src, n, nbits)	*dst = *src << n
  * bitmap_scnprintf(buf, len, src, nbits)	Print bitmap src to buf
- * bitmap_parse(ubuf, ulen, dst, nbits)		Parse bitmap dst from buf
+ * bitmap_parse(ubuf, ulen, dst, nbits)		Parse bitmap dst from user buf
+ * bitmap_scnlistprintf(buf, len, src, nbits)	Print bitmap src as list to buf
+ * bitmap_parselist(buf, dst, nbits)		Parse bitmap dst from list
  */
 
 /*
@@ -98,6 +100,10 @@ extern int bitmap_scnprintf(char *buf, u
 			const unsigned long *src, int nbits);
 extern int bitmap_parse(const char __user *ubuf, unsigned int ulen,
 			unsigned long *dst, int nbits);
+extern int bitmap_scnlistprintf(char *buf, unsigned int len,
+			const unsigned long *src, int nbits);
+extern int bitmap_parselist(const char *buf, unsigned long *maskp,
+			int nmaskbits);
 extern int bitmap_find_free_region(unsigned long *bitmap, int bits, int order);
 extern void bitmap_release_region(unsigned long *bitmap, int pos, int order);
 extern int bitmap_allocate_region(unsigned long *bitmap, int pos, int order);
Index: 2.6.8-rc2-mm2/include/linux/cpumask.h
===================================================================
--- 2.6.8-rc2-mm2.orig/include/linux/cpumask.h	2004-08-04 19:29:34.000000000 -0700
+++ 2.6.8-rc2-mm2/include/linux/cpumask.h	2004-08-04 20:35:10.000000000 -0700
@@ -10,6 +10,8 @@
  *
  * For details of cpumask_scnprintf() and cpumask_parse(),
  * see bitmap_scnprintf() and bitmap_parse() in lib/bitmap.c.
+ * For details of cpulist_scnprintf() and cpulist_parse(), see
+ * bitmap_scnlistprintf() and bitmap_parselist(), also in bitmap.c.
  *
  * The available cpumask operations are:
  *
@@ -46,6 +48,8 @@
  *
  * int cpumask_scnprintf(buf, len, mask) Format cpumask for printing
  * int cpumask_parse(ubuf, ulen, mask)	Parse ascii string as cpumask
+ * int cpulist_scnprintf(buf, len, mask) Format cpumask as list for printing
+ * int cpulist_parse(buf, map)		Parse ascii string as cpulist
  *
  * for_each_cpu_mask(cpu, mask)		for-loop cpu over mask
  *
@@ -268,14 +272,28 @@ static inline int __cpumask_scnprintf(ch
 	return bitmap_scnprintf(buf, len, srcp->bits, nbits);
 }
 
-#define cpumask_parse(ubuf, ulen, src) \
-			__cpumask_parse((ubuf), (ulen), &(src), NR_CPUS)
+#define cpumask_parse(ubuf, ulen, dst) \
+			__cpumask_parse((ubuf), (ulen), &(dst), NR_CPUS)
 static inline int __cpumask_parse(const char __user *buf, int len,
 					cpumask_t *dstp, int nbits)
 {
 	return bitmap_parse(buf, len, dstp->bits, nbits);
 }
 
+#define cpulist_scnprintf(buf, len, src) \
+			__cpulist_scnprintf((buf), (len), &(src), NR_CPUS)
+static inline int __cpulist_scnprintf(char *buf, int len,
+					const cpumask_t *srcp, int nbits)
+{
+	return bitmap_scnlistprintf(buf, len, srcp->bits, nbits);
+}
+
+#define cpulist_parse(buf, dst) __cpulist_parse((buf), &(dst), NR_CPUS)
+static inline int __cpulist_parse(const char *buf, cpumask_t *dstp, int nbits)
+{
+	return bitmap_parselist(buf, dstp->bits, nbits);
+}
+
 #if NR_CPUS > 1
 #define for_each_cpu_mask(cpu, mask)		\
 	for ((cpu) = first_cpu(mask);		\
Index: 2.6.8-rc2-mm2/include/linux/nodemask.h
===================================================================
--- 2.6.8-rc2-mm2.orig/include/linux/nodemask.h	2004-08-04 19:29:29.000000000 -0700
+++ 2.6.8-rc2-mm2/include/linux/nodemask.h	2004-08-04 20:28:50.000000000 -0700
@@ -10,6 +10,8 @@
  *
  * For details of nodemask_scnprintf() and nodemask_parse(),
  * see bitmap_scnprintf() and bitmap_parse() in lib/bitmap.c.
+ * For details of nodelist_scnprintf() and nodelist_parse(), see
+ * bitmap_scnlistprintf() and bitmap_parselist(), also in bitmap.c.
  *
  * The available nodemask operations are:
  *
@@ -46,6 +48,8 @@
  *
  * int nodemask_scnprintf(buf, len, mask) Format nodemask for printing
  * int nodemask_parse(ubuf, ulen, mask)	Parse ascii string as nodemask
+ * int nodelist_scnprintf(buf, len, mask) Format nodemask as list for printing
+ * int nodelist_parse(buf, map)		Parse ascii string as nodelist
  *
  * for_each_node_mask(node, mask)	for-loop node over mask
  *
@@ -271,14 +275,28 @@ static inline int __nodemask_scnprintf(c
 	return bitmap_scnprintf(buf, len, srcp->bits, nbits);
 }
 
-#define nodemask_parse(ubuf, ulen, src) \
-			__nodemask_parse((ubuf), (ulen), &(src), MAX_NUMNODES)
+#define nodemask_parse(ubuf, ulen, dst) \
+			__nodemask_parse((ubuf), (ulen), &(dst), MAX_NUMNODES)
 static inline int __nodemask_parse(const char __user *buf, int len,
 					nodemask_t *dstp, int nbits)
 {
 	return bitmap_parse(buf, len, dstp->bits, nbits);
 }
 
+#define nodelist_scnprintf(buf, len, src) \
+			__nodelist_scnprintf((buf), (len), &(src), MAX_NUMNODES)
+static inline int __nodelist_scnprintf(char *buf, int len,
+					const nodemask_t *srcp, int nbits)
+{
+	return bitmap_scnlistprintf(buf, len, srcp->bits, nbits);
+}
+
+#define nodelist_parse(buf, dst) __nodelist_parse((buf), &(dst), MAX_NUMNODES)
+static inline int __nodelist_parse(const char *buf, nodemask_t *dstp, int nbits)
+{
+	return bitmap_parselist(buf, dstp->bits, nbits);
+}
+
 #if MAX_NUMNODES > 1
 #define for_each_node_mask(node, mask)			\
 	for ((node) = first_node(mask);			\
Index: 2.6.8-rc2-mm2/lib/bitmap.c
===================================================================
--- 2.6.8-rc2-mm2.orig/lib/bitmap.c	2004-08-04 19:29:15.000000000 -0700
+++ 2.6.8-rc2-mm2/lib/bitmap.c	2004-08-04 21:44:41.000000000 -0700
@@ -291,6 +291,7 @@ EXPORT_SYMBOL(__bitmap_weight);
 #define nbits_to_hold_value(val)	fls(val)
 #define roundup_power2(val,modulus)	(((val) + (modulus) - 1) & ~((modulus) - 1))
 #define unhex(c)			(isdigit(c) ? (c - '0') : (toupper(c) - 'A' + 10))
+#define BASEDEC 10		/* fancier cpuset lists input in decimal */
 
 /**
  * bitmap_scnprintf - convert bitmap to an ASCII hex string.
@@ -409,6 +410,147 @@ int bitmap_parse(const char __user *ubuf
 }
 EXPORT_SYMBOL(bitmap_parse);
 
+/*
+ * bscnl_emit(buf, buflen, rbot, rtop, len)
+ *
+ * Helper routine for bitmap_scnlistprintf().  Write decimal number
+ * or range to buf, suppressing output past buf+buflen, with optional
+ * comma-prefix.  Return len of what would be written to buf, if it
+ * all fit.
+ */
+
+int bscnl_emit(char *buf, int buflen, int rbot, int rtop, int len)
+{
+	if (len)
+		len += scnprintf(buf + len, buflen - len, ",");
+	if (rbot == rtop)
+		len += scnprintf(buf + len, buflen - len, "%d", rbot);
+	else
+		len += scnprintf(buf + len, buflen - len, "%d-%d", rbot, rtop);
+	return len;
+}
+
+/**
+ * bitmap_scnlistprintf - convert bitmap to a list format ASCII string
+ * @buf: byte buffer into which string is placed
+ * @buflen: reserved size of @buf, in bytes
+ * @maskp: pointer to bitmap to convert
+ * @nmaskbits: size of bitmap, in bits
+ *
+ * Output format is a comma-separated list of decimal numbers and
+ * ranges.  Consecutively set bits are shown as two hyphen-separated
+ * decimal numbers, the smallest and largest bit numbers set in
+ * the range.  Output format is a compatible subset of the format
+ * accepted as input by bitmap_parselist().
+ *
+ * The return value is the number of characters which would be
+ * generated for the given input, excluding the trailing '\0', as
+ * per ISO C99.
+ */
+
+int bitmap_scnlistprintf(char *buf, unsigned int buflen,
+	const unsigned long *maskp, int nmaskbits)
+{
+	int len = 0;
+	/* current bit is 'cur', most recently seen range is [rbot, rtop] */
+	int cur, rbot, rtop;
+
+	rbot = cur = find_first_bit(maskp, nmaskbits);
+	while (cur < nmaskbits) {
+		rtop = cur;
+		cur = find_next_bit(maskp, nmaskbits, cur+1);
+		if (cur >= nmaskbits || cur > rtop + 1) {
+			len = bscnl_emit(buf, buflen, rbot, rtop, len);
+			rbot = cur;
+		}
+	}
+	return len;
+}
+EXPORT_SYMBOL(bitmap_scnlistprintf);
+
+/**
+ * bitmap_parselist - parses a more flexible format for inputting bit masks
+ * @buf: read nul-terminated user string from this buffer
+ * @maskp: write resulting mask here
+ * @nmaskbits: number of bits in mask to be written
+ *
+ * The input format supports a space separated list of one or more comma
+ * separated sequences of ascii decimal bit numbers and ranges.  Each
+ * sequence may be preceded by one of the prefix characters '=',
+ * '-', '+', or '!', which have the following meanings:
+ *    '=': rewrite the mask to have only the bits specified in this sequence
+ *    '-': turn off the bits specified in this sequence
+ *    '+': turn on the bits specified in this sequence
+ *    '!': same as '-'.
+ *
+ * If no such initial character is specified, then the default prefix '='
+ * is presumed.  The list is evaluated and applied in left to right order.
+ *
+ * Examples of input format:
+ *	0-4,9				# rewrites to 0,1,2,3,4,9
+ *	-9				# removes 9
+ *	+6-8				# adds 6,7,8
+ *	1-6 -0,2-4 +11-14,16-19 -14-16	# same as 1,5,6,11-13,17-19
+ *	1-6 -0,2-4 +11-14,16-19 =14-16	# same as just 14,15,16
+ *
+ * Possible errno's returned for invalid input strings are:
+ *      -EINVAL:   second number in range smaller than first
+ *      -ERANGE:   bit number specified too large for mask
+ *      -EINVAL:   invalid prefix char (not '=', '-', '+', or '!')
+ */
+
+int bitmap_parselist(const char *buf, unsigned long *maskp, int nmaskbits)
+{
+	char *p, *q;
+	int masklen = BITS_TO_LONGS(nmaskbits);
+
+	while ((p = strsep((char **)(&buf), " ")) != NULL) { /* blows const XXX */
+		char op = isdigit(*p) ? '=' : *p++;
+		unsigned long m[masklen];
+		int maskbytes = sizeof(m);
+		int i;
+
+		if (op == ' ')
+			continue;
+		memset(m, 0, maskbytes);
+
+		while ((q = strsep(&p, ",")) != NULL) {
+			unsigned a = simple_strtoul(q, 0, BASEDEC);
+			unsigned b = a;
+			char *cp = strchr(q, '-');
+			if (cp)
+				b = simple_strtoul(cp + 1, 0, BASEDEC);
+			if (!(a <= b))
+				return -EINVAL;
+			if (b >= nmaskbits)
+				return -ERANGE;
+			while (a <= b) {
+				set_bit(a, m);
+				a++;
+			}
+		}
+
+		switch (op) {
+			case '=':
+				memcpy(maskp, m, maskbytes);
+				break;
+			case '!':
+			case '-':
+				for (i = 0; i < masklen; i++)
+					maskp[i] &= ~m[i];
+				break;
+			case '+':
+				for (i = 0; i < masklen; i++)
+					maskp[i] |= m[i];
+				break;
+			default:
+				return -EINVAL;
+		}
+	}
+	return 0;
+}
+EXPORT_SYMBOL(bitmap_parselist);
+
 /**
  *	bitmap_find_free_region - find a contiguous aligned mem region
  *	@bitmap: an array of unsigned longs corresponding to the bitmap

-- 
                          I won't rest till it's the best ...
                          Programmer, Linux Scalability
                          Paul Jackson <pj@sgi.com> 1.650.933.1373


* [PATCH] cpusets - big numa cpu and memory placement
From: Paul Jackson @ 2004-08-05 10:10 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Christoph Hellwig, Jack Steiner, Jesse Barnes, Sylvain Jeaugey,
	Dan Higgins, linux-kernel, Matthew Dobson, Simon Derr,
	Andi Kleen, lse-tech, Paul Jackson, Dimitri Sivanich

Andrew,

I would like to propose the following patch for inclusion in your
2.6.9-*mm series, when that opens.  It provides an important facility
for high performance computing on large systems.  Simon Derr of Bull
(France) and I are the primary authors.

I offer it to lkml now, in order to invite continued feedback.
Thank you to several who have provided valuable feedback so far,
including Christoph and Andi (I make no claim that they endorse
this patch).

This is the third time I have posted cpusets on lkml.  The first two
times, a month or two ago, were more preliminary.  I believe that
the code is now in good enough shape to be considered for inclusion
in your kernels.

The one prerequisite patch for this cpuset patch was just posted
before this one.  That was a patch to provide a new bitmap list
format, of which cpusets is the first user.

Changes since July 2 (previous lkml posting):
  - The bitmap, cpumask and nodemask work on which the earlier
    patches depended are now included in your patches.
  - Locking around the cpuset struct simplified and rewritten.
  - Just one cpuset patch now (plus bitmap list format), not 8 of them.
  - Memory restriction in page_alloc and vmscan added (thanks, Andi).
  - Term 'strict' for cpusets that others can't use changed to the
    term 'exclusive' (to avoid collision with the use of the same
    word in Andi's numa work for the reverse meaning).
  - Superfluous 'top_cpuset' layer removed from visible mounted
    cpuset file system.
  - The /proc/<pid>/cpuset hook for displaying a tasks current
    cpuset path uses seq_file now.
  - Notify_on_release calls /sbin/cpuset_release_agent, not
    /sbin/hotplug.  [Hence no CONFIG_HOTPLUG dependency.]
  - kernel/cpuset.c cpuset_sprintf_list() code moved to lib/bitmap.c,
    and rewritten to be simpler.
  - kernel/cpuset.c cpuset_path() code simplified.

This patch has been built on top of 2.6.8-rc2-mm2, for several arch's,
with and without CONFIG_CPUSET.  No doubt you will be glad to know that
it has far fewer arch dependencies (none, that I know of) than the
dreaded cpumask patch.  It has been built, booted and tested in various
forms over the last several months by a few developers at SGI and Bull.

===

Cpusets provide a mechanism for assigning a set of CPUs and Memory
Nodes to a set of tasks.

Cpusets constrain the CPU and Memory placement of tasks to only
the processor and memory resources within a task's current cpuset.
They form a nested hierarchy visible in a virtual file system.
These are the essential hooks, beyond what is already present,
required to manage dynamic job placement on large systems.

Cpusets require small kernel hooks in init, exit, fork, mempolicy,
sched_setaffinity, page_alloc and vmscan.  And they require a "struct
cpuset" pointer and a "mems_allowed" nodemask_t (to go along with the
"cpus_allowed" cpumask_t that's already there) in each task struct.

These hooks:
  1) establish and propagate cpusets, 
  2) enforce CPU placement in sched_setaffinity,
  3) enforce Memory placement in mbind and sys_set_mempolicy,
  4) restrict page allocation and scanning to mems_allowed, and
  5) restrict migration and set_cpus_allowed to cpus_allowed.

The other required hook, restricting task scheduling to CPUs
in a task's cpus_allowed mask, is already present.

Cpusets extend the usefulness of the existing placement support that
was added to Linux 2.6 kernels: sched_setaffinity() for CPU placement,
and mbind and set_mempolicy for memory placement.  On smaller or
dedicated use systems, the existing calls are often sufficient.

On larger NUMA systems, running more than one performance-critical
job, it is necessary to be able to manage jobs in their entirety.
This includes providing a job with exclusive CPU and memory that no
other job can use, and being able to list all tasks currently in a
cpuset.

A given job running within a cpuset would likely use the existing
placement calls to manage its CPU and memory placement in more detail.
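
As a concrete illustration (a sketch only, not code from this patch),
a task in such a job might narrow its own CPU placement using the
existing glibc sched_setaffinity() wrapper; the CPU number chosen here
is just an example:

#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>

int main(void)
{
	cpu_set_t mask;

	CPU_ZERO(&mask);
	CPU_SET(2, &mask);	/* request CPU 2 only */
	/* With cpusets, such requests are filtered through cpus_allowed. */
	if (sched_setaffinity(0, sizeof(mask), &mask) != 0)
		perror("sched_setaffinity");
	return 0;
}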

Cpusets are named, nested sets of CPUs and Memory Nodes.  Each cpuset
is represented by a directory in the cpuset virtual file system,
normally mounted at /dev/cpuset.

Each cpuset directory provides the following files, which can be
read and written:

  cpus:
      List of CPUs allowed to tasks in that cpuset.
  
  mems:
      List of Memory Nodes allowed to tasks in that cpuset.
  
  tasks:
      List of pid's of tasks in that cpuset.
  
  cpu_exclusive:
      Flag (0 or 1) - if set, cpuset has exclusive use of
      its CPUs (no sibling or cousin cpuset may overlap CPUs).
  
  mem_exclusive:
      Flag (0 or 1) - if set, cpuset has exclusive use of
      its Memory Nodes (no sibling or cousin may overlap).
  
  notify_on_release:
      Flag (0 or 1) - if set, then /sbin/cpuset_release_agent
      will be invoked, with the name (/dev/cpuset relative path)
      of that cpuset in argv[1], when the last user of it (task
      or child cpuset) goes away.  This supports automatic
      cleanup of abandoned cpusets.

In addition one new filetype is added to the /proc file system:

  /proc/<pid>/cpuset:
      For each task (pid), list its cpuset path, relative to the
      root of the cpuset file system.  This file is read-only.

New cpusets are created using 'mkdir' (at the shell or in C).
Old ones are removed using 'rmdir'.  The above files are accessed
using read(2) and write(2) system calls, or shell commands such
as 'cat' and 'echo'.
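
A minimal C sketch of those same steps (not part of the patch; it
assumes the cpuset file system is already mounted at /dev/cpuset,
creates the new cpuset directly under the mount root, reuses the
"Charlie" name from the documentation below, and abbreviates error
handling):

#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <fcntl.h>
#include <sys/types.h>
#include <sys/stat.h>

/* Write a short string to a cpuset control file. */
static int write_file(const char *path, const char *buf)
{
	int fd = open(path, O_WRONLY);

	if (fd < 0)
		return -1;
	if (write(fd, buf, strlen(buf)) < 0) {
		close(fd);
		return -1;
	}
	return close(fd);
}

int main(void)
{
	char pid[32];

	mkdir("/dev/cpuset/Charlie", 0755);		/* create the cpuset */
	write_file("/dev/cpuset/Charlie/cpus", "2-3");	/* CPUs 2 and 3 */
	write_file("/dev/cpuset/Charlie/mems", "1");	/* Memory Node 1 */
	snprintf(pid, sizeof(pid), "%d", getpid());
	write_file("/dev/cpuset/Charlie/tasks", pid);	/* attach ourselves */
	return 0;
}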

The CPUs and Memory Nodes in a given cpuset are always a subset
of its parent.  The root cpuset has all possible CPUs and Memory
Nodes in the system.  A cpuset may be exclusive (cpu or memory)
only if its parent is similarly exclusive.

See further Documentation/cpusets.txt, at the top of the following
patch.

 Documentation/cpusets.txt |  381 +++++++++++
 fs/proc/base.c            |   19 
 include/linux/cpuset.h    |   61 +
 include/linux/sched.h     |    6 
 init/Kconfig              |   10 
 init/main.c               |    3 
 kernel/Makefile           |    1 
 kernel/cpuset.c           | 1477 ++++++++++++++++++++++++++++++++++++++++++++++
 kernel/exit.c             |    2 
 kernel/fork.c             |    3 
 kernel/sched.c            |    9 
 mm/mempolicy.c            |    9 
 mm/page_alloc.c           |   14 
 mm/vmscan.c               |   19 
 14 files changed, 2009 insertions(+), 5 deletions(-)

Signed-off-by: Paul Jackson <pj@sgi.com>

Index: 2.6.8-rc2-mm2/Documentation/cpusets.txt
===================================================================
--- 2.6.8-rc2-mm2.orig/Documentation/cpusets.txt	2003-03-14 05:07:09.000000000 -0800
+++ 2.6.8-rc2-mm2/Documentation/cpusets.txt	2004-08-05 01:44:59.000000000 -0700
@@ -0,0 +1,387 @@
+				CPUSETS
+				-------
+
+Copyright (C) 2004 BULL SA.
+Written by Simon.Derr@bull.net
+
+Portions Copyright (c) 2004 Silicon Graphics, Inc.
+Modified by Paul Jackson <pj@sgi.com>
+
+CONTENTS:
+=========
+
+1. Cpusets
+  1.1 What are cpusets ?
+  1.2 Why are cpusets needed ?
+  1.3 How are cpusets implemented ?
+  1.4 How do I use cpusets ?
+2. Usage Examples and Syntax
+  2.1 Basic Usage
+  2.2 Adding/removing cpus
+  2.3 Setting flags
+  2.4 Attaching processes
+3. Questions
+4. Contact
+
+1. Cpusets
+==========
+
+1.1 What are cpusets ?
+----------------------
+
+Cpusets provide a mechanism for assigning a set of CPUs and Memory
+Nodes to a set of tasks.
+
+Cpusets constrain the CPU and Memory placement of tasks to only
+the resources within a task's current cpuset.  They form a nested
+hierarchy visible in a virtual file system.  These are the essential
+hooks, beyond what is already present, required to manage dynamic
+job placement on large systems.
+
+Each task has a pointer to a cpuset.  Multiple tasks may reference
+the same cpuset.  Requests by a task, using the sched_setaffinity(2)
+system call to include CPUs in its CPU affinity mask, and using the
+mbind(2) and set_mempolicy(2) system calls to include Memory Nodes
+in its memory policy, are both filtered through that tasks cpuset,
+filtering out any CPUs or Memory Nodes not in that cpuset.  The
+scheduler will not schedule a task on a CPU that is not allowed in
+its cpus_allowed vector, and the kernel page allocator will not
+allocate a page on a node that is not allowed in the requesting tasks
+mems_allowed vector.
+
+If a cpuset is cpu or mem exclusive, no other cpuset, other than a direct
+ancestor or descendent, may share any of the same CPUs or Memory Nodes.
+
+User level code may create and destroy cpusets by name in the cpuset
+virtual file system, manage the attributes and permissions of these
+cpusets and which CPUs and Memory Nodes are assigned to each cpuset,
+specify and query to which cpuset a task is assigned, and list the
+task pids assigned to a cpuset.
+
+
+1.2 Why are cpusets needed ?
+----------------------------
+
+The management of large computer systems, with many processors (CPUs),
+complex memory cache hierarchies and multiple Memory Nodes having
+non-uniform access times (NUMA), presents additional challenges for
+the efficient scheduling and memory placement of processes.
+
+Frequently more modest sized systems can be operated with adequate
+efficiency just by letting the operating system automatically share
+the available CPU and Memory resources amongst the requesting tasks.
+
+But larger systems, which benefit more from careful processor and
+memory placement to reduce memory access times and contention,
+and which typically represent a larger investment for the customer,
+can benefit from explicitly placing jobs on properly sized subsets of
+the system.
+
+This can be especially valuable on:
+
+    * Web Servers running multiple instances of the same web application,
+    * Servers running different applications (for instance, a web server
+      and a database), or
+    * NUMA systems running large HPC applications with demanding
+      performance characteristics.
+
+These subsets, or "soft partitions", must be able to be dynamically
+adjusted, as the job mix changes, without impacting other concurrently
+executing jobs.
+
+The kernel cpuset patch provides the minimum essential kernel
+mechanisms required to efficiently implement such subsets.  It
+leverages existing CPU and Memory Placement facilities in the Linux
+kernel to avoid any additional impact on the critical scheduler or
+memory allocator code.
+
+
+1.3 How are cpusets implemented ?
+---------------------------------
+
+Cpusets provide a Linux kernel (2.6.7 and above) mechanism to constrain
+which CPUs and Memory Nodes are used by a process or set of processes.
+
+The Linux kernel already has a pair of mechanisms to specify on which
+CPUs a task may be scheduled (sched_setaffinity) and on which Memory
+Nodes it may obtain memory (mbind, set_mempolicy).
+
+Cpusets extend these two mechanisms as follows:
+
+ - Cpusets are sets of allowed CPUs and Memory Nodes, known to the
+   kernel.
+ - Each task in the system is attached to a cpuset, via a pointer
+   in the task structure to a reference counted cpuset structure.
+ - Calls to sched_setaffinity are filtered to just those CPUs
+   allowed in that task's cpuset.
+ - Calls to mbind and set_mempolicy are filtered to just
+   those Memory Nodes allowed in that task's cpuset.
+ - The "top_cpuset" contains all the systems CPUs and Memory
+   Nodes.
+ - For any cpuset, one can define child cpusets containing a subset
+   of the parent's CPU and Memory Node resources.
+ - The hierarchy of cpusets can be mounted at /dev/cpuset, for
+   browsing and manipulation from user space.
+ - A cpuset may be marked exclusive, which ensures that no other
+   cpuset (except direct ancestors and descendents) may contain
+   any overlapping CPUs or Memory Nodes.
+ - You can list all the tasks (by pid) attached to any cpuset.
+
+The implementation of cpusets requires a few simple hooks
+into the rest of the kernel, none in performance critical paths:
+
+ - in init/main.c, to initialize the top_cpuset at system boot.
+ - in fork and exit, to attach and detach a task from its cpuset.
+ - in sched_setaffinity, to mask the requested CPUs by what's
+   allowed in that task's cpuset.
+ - in sched.c migrate_all_tasks(), to keep migrating tasks within
+   the CPUs allowed by their cpuset, if possible.
+ - in the mbind and set_mempolicy system calls, to mask the requested
+   Memory Nodes by what's allowed in that task's cpuset.
+ - in page_alloc, to restrict memory to allowed nodes.
+ - in vmscan.c, to restrict page reclaim to the current cpuset.
+
+In addition, a new file system of type "cpuset" may be mounted,
+typically at /dev/cpuset, to enable browsing and modifying the cpusets
+presently known to the kernel.  No new system calls are added for
+cpusets - all support for querying and modifying cpusets is via
+this cpuset file system.
+
+Each task under /proc has an added file named 'cpuset', displaying
+the cpuset name, as the path relative to the root of the cpuset file
+system.
+
+Each cpuset is represented by a directory in the cpuset file system
+containing the following files describing that cpuset:
+
+ - cpus: list of CPUs in that cpuset
+ - mems: list of Memory Nodes in that cpuset
+ - cpu_exclusive flag: is cpu placement exclusive?
+ - mem_exclusive flag: is memory placement exclusive?
+ - tasks: list of tasks (by pid) attached to that cpuset
+
+New cpusets are created using the mkdir system call or shell
+command.  The properties of a cpuset, such as its flags, allowed
+CPUs and Memory Nodes, and attached tasks, are modified by writing
+to the appropriate file in that cpuset's directory, as listed above.
+
+The named hierarchical structure of nested cpusets allows partitioning
+a large system into nested, dynamically changeable, "soft-partitions".
+
+The attachment of each task, automatically inherited at fork by any
+children of that task, to a cpuset allows organizing the work load
+on a system into related sets of tasks such that each set is constrained
+to using the CPUs and Memory Nodes of a particular cpuset.  A task
+may be re-attached to any other cpuset, if allowed by the permissions
+on the necessary cpuset file system directories.
+
+Such management of a system "in the large" integrates smoothly with
+the detailed placement done on individual tasks and memory regions
+using the sched_setaffinity, mbind and set_mempolicy system calls.
+
+The following rules apply to each cpuset:
+
+ - Its CPUs and Memory Nodes must be a subset of its parent's.
+ - It can only be marked exclusive if its parent is.
+ - If its cpu or memory is exclusive, they may not overlap any sibling.
+
+These rules, and the natural hierarchy of cpusets, enable efficient
+enforcement of the exclusive guarantee, without having to scan all
+cpusets every time any of them changes to ensure nothing overlaps an
+exclusive cpuset.  Also, the use of a Linux virtual file system (vfs)
+to represent the cpuset hierarchy provides for a familiar permission
+and name space for cpusets, with a minimum of additional kernel code.
+
+
+1.4 How do I use cpusets ?
+--------------------------
+
+Be warned that cpusets work differently than you might expect.
+
+In order to avoid _any_ impact on existing critical scheduler and
+memory allocator code in the kernel, and to leverage the existing
+CPU and Memory placement facilities, putting a task in a particular
+cpuset does _not_ immediately affect its placement.
+
+It would have been possible (and initially cpusets were coded this
+way) to immediately change a task's cpus_allowed affinity mask based
+on what cpuset it was placed in.  The sched_setaffinity call can be
+applied to any requested task.
+
+But the way numa placement support (added to 2.6 kernels in April
+2004 by Andi Kleen) works, it is not possible for one task to change
+another task's Memory placement.  The mbind and set_mempolicy system
+calls only affect the current task.  There really wasn't a choice
+in this matter -- the mm's, vma's and zonelists that encode a task's
+Memory placement are complicated, and cannot be safely changed from
+outside the current task's context.
+
+So, cpuset placement only affects the future sched_setaffinity,
+mbind, and set_mempolicy system calls, by filtering out any CPUs
+and Memory Nodes that are not allowed in the specified task's cpuset.
+Well, almost all.  See also the migrate_all_tasks() hook, listed above.
+
+To start a new job that is to be contained within a cpuset, this means
+the steps are:
+
+ 1) mkdir /dev/cpuset
+ 2) mount -t cpuset none /dev/cpuset
+ 3) Create the new cpuset by doing mkdir's and write's (or echo's) in
+    the /dev/cpuset virtual file system.
+ 4) Start a task that will be the "founding father" of the new job.
+ 5) Attach that task to the new cpuset by writing its pid to the
+    /dev/cpuset tasks file for that cpuset.
+ 6) Have that task issue sched_setaffinity, mbind and set_mempolicy
+    system calls, specifying CPUs and Memory Nodes within its cpuset.
+    Anything it specifies outside will be ignored without complaint,
+    so if you request all CPUs and Memory Nodes in the system, you will
+    successfully get all that are available in your current cpuset.
+ 7) fork, exec or clone the job tasks from this founding father task.
+
+For example, the following sequence of commands will set up a cpuset
+named "Charlie", containing just CPUs 2 and 3, and Memory Node 1,
+and then start a subshell 'sh' in that cpuset:
+
+  mount -t cpuset none /dev/cpuset
+  cd /dev/cpuset/top_cpuset
+  mkdir Charlie
+  cd Charlie
+  /bin/echo 2-3 > cpus
+  /bin/echo 1 > mems
+  /bin/echo $$ > tasks
+  # 0xC is bitmask for CPUs 2-3
+  taskset 0xC numactl -m 1 sh
+  # The subshell 'sh' is now running in cpuset Charlie
+  # The next line should display 'top_cpuset/Charlie'
+  cat /proc/self/cpuset
+
+In the case that we want to force an existing job into a particular
+cpuset, or that we want to move the cpuset that a job is using,
+we will need some additional library code, not yet available as of
+this writing (July 2004), that will receive a particular signal,
+and reissue the necessary sched_setaffinity, mbind and set_mempolicy
+system calls from within the task's current context.
+
+In the case that a change of cpuset includes wanting to move already
+allocated memory pages, consider further the work of IWAMOTO
+Toshihiro <iwamoto@valinux.co.jp> for page remapping and memory
+hotremoval, which can be found at:
+
+  http://people.valinux.co.jp/~iwamoto/mh.html
+
+The integration of cpusets with such memory migration is not yet
+available.
+
+In the future, a C library interface to cpusets will likely be
+available.  For now, the only way to query or modify cpusets is
+via the cpuset file system, using the various cd, mkdir, echo, cat,
+rmdir commands from the shell, or their equivalent from C.
+
+The sched_setaffinity calls can also be done at the shell prompt using
+SGI's runon or Robert Love's taskset.  The mbind and set_mempolicy
+calls can be done at the shell prompt using the numactl command
+(part of Andi's numa package).
+
+2. Usage Examples and Syntax
+============================
+
+2.1 Basic Usage
+---------------
+
+Creating, modifying, and using cpusets can be done through the cpuset
+virtual filesystem.
+
+To mount it, type:
+# mount -t cpuset none /dev/cpuset
+
+Then under /dev/cpuset you can find a tree that corresponds to the
+tree of the cpusets in the system. For instance, /dev/cpuset/top_cpuset
+is the cpuset that holds the whole system.
+
+If you want to create a new cpuset under top_cpuset:
+# cd /dev/cpuset/top_cpuset
+# mkdir my_cpuset
+
+Now you want to do something with this cpuset.
+# cd my_cpuset
+
+In this directory you can find several files:
+# ls
+cpus  cpu_exclusive  mems  mem_exclusive  tasks
+
+Reading them will give you information about the state of this cpuset:
+the CPUs and Memory Nodes it can use, the processes that are using
+it, and its properties.  By writing to these files you can manipulate
+the cpuset.
+
+Set some flags:
+# /bin/echo 1 > cpu_exclusive
+
+Add some cpus:
+# /bin/echo 0-7 > cpus
+
+Now attach your shell to this cpuset:
+# /bin/echo $$ > tasks
+
+You can also create cpusets inside your cpuset by using mkdir in this
+directory.
+# mkdir my_sub_cs
+
+To remove a cpuset, just use rmdir:
+# rmdir my_sub_cs
+This will fail if the cpuset is in use (has cpusets inside, or has
+processes attached).
+
+2.2 Adding/removing cpus
+------------------------
+
+This is the syntax to use when writing in the cpus or mems files
+in cpuset directories:
+
+# /bin/echo 1-4 > cpus		-> set cpus list to cpus 1,2,3,4
+# /bin/echo 1,2,3,4 > cpus	-> set cpus list to cpus 1,2,3,4
+# /bin/echo +1 > cpus		-> add cpu 1 to the cpus list
+# /bin/echo -1-4 > cpus		-> remove cpus 1,2,3,4 from the cpus list
+# /bin/echo -1,2,3,4 > cpus	-> remove cpus 1,2,3,4 from the cpus list
+
+All these can be mixed together:
+# /bin/echo 1-7 -6 +9,10 > cpus	-> set cpus list to 1,2,3,4,5,7,9,10
+
+2.3 Setting flags
+-----------------
+
+The syntax is very simple:
+
+# /bin/echo 1 > cpu_exclusive 	-> set flag 'cpu_exclusive'
+# /bin/echo 0 > cpu_exclusive 	-> unset flag 'cpu_exclusive'
+
+2.4 Attaching processes
+-----------------------
+
+# /bin/echo PID > tasks
+
+Note that it is PID, not PIDs. You can only attach ONE task at a time.
+If you have several tasks to attach, you have to do it one after another:
+
+# /bin/echo PID1 > tasks
+# /bin/echo PID2 > tasks
+	...
+# /bin/echo PIDn > tasks
+
+
+3. Questions
+============
+
+Q: what's up with this '/bin/echo' ?
+A: bash's builtin 'echo' command does not check its calls to write()
+   for errors.  If you use it in the cpuset file system, you won't be
+   able to tell whether a command succeeded or failed.
+
+Q: When I attach processes, only the first one on the line really gets attached!
+A: We can only return one error code per call to write().  So you should
+   put only ONE pid per write.
+
+4. Contact
+==========
+
+Web: http://www.bullopensource.org/cpuset
Index: 2.6.8-rc2-mm2/fs/proc/base.c
===================================================================
--- 2.6.8-rc2-mm2.orig/fs/proc/base.c	2004-08-04 21:44:05.000000000 -0700
+++ 2.6.8-rc2-mm2/fs/proc/base.c	2004-08-04 21:44:49.000000000 -0700
@@ -32,6 +32,7 @@
 #include <linux/mount.h>
 #include <linux/security.h>
 #include <linux/ptrace.h>
+#include <linux/cpuset.h>
 
 /*
  * For hysterical raisins we keep the same inumbers as in the old procfs.
@@ -60,6 +61,9 @@ enum pid_directory_inos {
 	PROC_TGID_MAPS,
 	PROC_TGID_MOUNTS,
 	PROC_TGID_WCHAN,
+#ifdef CONFIG_CPUSETS
+	PROC_TGID_CPUSET,
+#endif
 #ifdef CONFIG_SECURITY
 	PROC_TGID_ATTR,
 	PROC_TGID_ATTR_CURRENT,
@@ -83,6 +87,9 @@ enum pid_directory_inos {
 	PROC_TID_MAPS,
 	PROC_TID_MOUNTS,
 	PROC_TID_WCHAN,
+#ifdef CONFIG_CPUSETS
+	PROC_TID_CPUSET,
+#endif
 #ifdef CONFIG_SECURITY
 	PROC_TID_ATTR,
 	PROC_TID_ATTR_CURRENT,
@@ -123,6 +130,9 @@ static struct pid_entry tgid_base_stuff[
 #ifdef CONFIG_KALLSYMS
 	E(PROC_TGID_WCHAN,     "wchan",   S_IFREG|S_IRUGO),
 #endif
+#ifdef CONFIG_CPUSETS
+	E(PROC_TGID_CPUSET,    "cpuset",  S_IFREG|S_IRUGO),
+#endif
 	{0,0,NULL,0}
 };
 static struct pid_entry tid_base_stuff[] = {
@@ -145,6 +155,9 @@ static struct pid_entry tid_base_stuff[]
 #ifdef CONFIG_KALLSYMS
 	E(PROC_TID_WCHAN,      "wchan",   S_IFREG|S_IRUGO),
 #endif
+#ifdef CONFIG_CPUSETS
+	E(PROC_TID_CPUSET,     "cpuset",  S_IFREG|S_IRUGO),
+#endif
 	{0,0,NULL,0}
 };
 
@@ -1376,6 +1389,12 @@ static struct dentry *proc_pident_lookup
 			ei->op.proc_read = proc_pid_wchan;
 			break;
 #endif
+#ifdef CONFIG_CPUSETS
+		case PROC_TID_CPUSET:
+		case PROC_TGID_CPUSET:
+			inode->i_fop = &proc_cpuset_operations;
+			break;
+#endif
 		default:
 			printk("procfs: impossible type (%d)",p->type);
 			iput(inode);
Index: 2.6.8-rc2-mm2/include/linux/cpuset.h
===================================================================
--- 2.6.8-rc2-mm2.orig/include/linux/cpuset.h	2003-03-14 05:07:09.000000000 -0800
+++ 2.6.8-rc2-mm2/include/linux/cpuset.h	2004-08-04 21:44:49.000000000 -0700
@@ -0,0 +1,61 @@
+#ifndef _LINUX_CPUSET_H
+#define _LINUX_CPUSET_H
+/*
+ *  cpuset interface
+ *
+ *  Copyright (C) 2003 BULL SA
+ *  Copyright (C) 2004 Silicon Graphics, Inc.
+ *
+ */
+
+#include <linux/sched.h>
+#include <linux/cpumask.h>
+#include <linux/nodemask.h>
+
+#ifdef CONFIG_CPUSETS
+
+extern int cpuset_init(void);
+extern void cpuset_fork(struct task_struct *p);
+extern void cpuset_exit(struct task_struct *p);
+extern const cpumask_t cpuset_cpus_allowed(const struct task_struct *p);
+extern const nodemask_t cpuset_mems_allowed(const struct task_struct *p);
+void cpuset_init_current_mems_allowed(void);
+void cpuset_update_current_mems_allowed(void);
+void cpuset_restrict_to_mems_allowed(unsigned long *nodes);
+int cpuset_zonelist_valid_mems_allowed(struct zonelist *zl);
+int cpuset_zone_allowed(struct zone *z);
+extern struct file_operations proc_cpuset_operations;
+
+#else /* !CONFIG_CPUSETS */
+
+static inline int cpuset_init(void) { return 0; }
+static inline void cpuset_fork(struct task_struct *p) {}
+static inline void cpuset_exit(struct task_struct *p) {}
+
+static inline const cpumask_t cpuset_cpus_allowed(struct task_struct *p)
+{
+	return cpu_possible_map;
+}
+
+static inline const nodemask_t cpuset_mems_allowed(struct task_struct *p)
+{
+	return node_possible_map;
+}
+
+static inline void cpuset_init_current_mems_allowed(void) {}
+static inline void cpuset_update_current_mems_allowed(void) {}
+static inline void cpuset_restrict_to_mems_allowed(unsigned long *nodes) {}
+
+static inline int cpuset_zonelist_valid_mems_allowed(struct zonelist *zl)
+{
+	return 1;
+}
+
+static inline int cpuset_zone_allowed(struct zone *z)
+{
+	return 1;
+}
+
+#endif /* !CONFIG_CPUSETS */
+
+#endif /* _LINUX_CPUSET_H */
Index: 2.6.8-rc2-mm2/include/linux/sched.h
===================================================================
--- 2.6.8-rc2-mm2.orig/include/linux/sched.h	2004-08-04 21:44:05.000000000 -0700
+++ 2.6.8-rc2-mm2/include/linux/sched.h	2004-08-04 21:44:49.000000000 -0700
@@ -13,6 +13,7 @@
 #include <linux/rbtree.h>
 #include <linux/thread_info.h>
 #include <linux/cpumask.h>
+#include <linux/nodemask.h>
 
 #include <asm/system.h>
 #include <asm/semaphore.h>
@@ -370,6 +371,7 @@ struct k_itimer {
 
 struct io_context;			/* See blkdev.h */
 void exit_io_context(void);
+struct cpuset;
 
 #define NGROUPS_SMALL		32
 #define NGROUPS_PER_BLOCK	((int)(PAGE_SIZE / sizeof(gid_t)))
@@ -551,6 +553,10 @@ struct task_struct {
 	struct rw_semaphore pagg_sem;
 #endif
 
+#ifdef CONFIG_CPUSETS
+	struct cpuset *cpuset;
+	nodemask_t mems_allowed;
+#endif
 };
 
 static inline pid_t process_group(struct task_struct *tsk)
Index: 2.6.8-rc2-mm2/init/Kconfig
===================================================================
--- 2.6.8-rc2-mm2.orig/init/Kconfig	2004-08-04 21:44:05.000000000 -0700
+++ 2.6.8-rc2-mm2/init/Kconfig	2004-08-04 21:44:49.000000000 -0700
@@ -278,6 +278,16 @@ config EPOLL
 	  Disabling this option will cause the kernel to be built without
 	  support for epoll family of system calls.
 
+config CPUSETS
+	bool "Cpuset support"
+	help
+	  This option will let you create and manage CPUSETs, which
+	  allow dynamically partitioning a system into sets of CPUs and
+	  Memory Nodes and assigning tasks to run only within those sets.
+	  This is primarily useful on large SMP or NUMA systems.
+
+	  Say N if unsure.
+
 source "drivers/block/Kconfig.iosched"
 
 config CC_OPTIMIZE_FOR_SIZE
Index: 2.6.8-rc2-mm2/init/main.c
===================================================================
--- 2.6.8-rc2-mm2.orig/init/main.c	2004-08-04 21:44:05.000000000 -0700
+++ 2.6.8-rc2-mm2/init/main.c	2004-08-04 21:44:49.000000000 -0700
@@ -41,6 +41,7 @@
 #include <linux/writeback.h>
 #include <linux/cpu.h>
 #include <linux/efi.h>
+#include <linux/cpuset.h>
 #include <linux/unistd.h>
 #include <linux/rmap.h>
 #include <linux/mempolicy.h>
@@ -568,6 +569,8 @@ asmlinkage void __init start_kernel(void
 #ifdef CONFIG_PROC_FS
 	proc_root_init();
 #endif
+	cpuset_init();
+
 	check_bugs();
 
 	/* Do the rest non-__init'ed, we're now alive */
Index: 2.6.8-rc2-mm2/kernel/Makefile
===================================================================
--- 2.6.8-rc2-mm2.orig/kernel/Makefile	2004-08-04 21:44:05.000000000 -0700
+++ 2.6.8-rc2-mm2/kernel/Makefile	2004-08-04 21:44:49.000000000 -0700
@@ -25,6 +25,7 @@ obj-$(CONFIG_IKCONFIG_PROC) += configs.o
 obj-$(CONFIG_STOP_MACHINE) += stop_machine.o
 obj-$(CONFIG_AUDIT) += audit.o
 obj-$(CONFIG_AUDITSYSCALL) += auditsc.o
+obj-$(CONFIG_CPUSETS) += cpuset.o
 
 ifneq ($(CONFIG_IA64),y)
 # According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
Index: 2.6.8-rc2-mm2/kernel/cpuset.c
===================================================================
--- 2.6.8-rc2-mm2.orig/kernel/cpuset.c	2003-03-14 05:07:09.000000000 -0800
+++ 2.6.8-rc2-mm2/kernel/cpuset.c	2004-08-04 21:44:49.000000000 -0700
@@ -0,0 +1,1477 @@
+/*
+ *  kernel/cpuset.c
+ *
+ *  Processor and Memory placement constraints for sets of tasks.
+ *
+ *  Copyright (C) 2003 BULL SA.
+ *  Copyright (C) 2004 Silicon Graphics, Inc.
+ *
+ *  Portions derived from Patrick Mochel's sysfs code.
+ *  sysfs is Copyright (c) 2001-3 Patrick Mochel
+ *  Portions Copyright (c) 2004 Silicon Graphics, Inc.
+ *
+ *  2003-10-10 Written by Simon Derr <simon.derr@bull.net>
+ *  2003-10-22 Updates by Stephen Hemminger.
+ *  2004 May-July Rework by Paul Jackson <pj@sgi.com>
+ *
+ *  This file is subject to the terms and conditions of the GNU General Public
+ *  License.  See the file COPYING in the main directory of the Linux
+ *  distribution for more details.
+ */
+
+#include <linux/config.h>
+#include <linux/cpu.h>
+#include <linux/cpumask.h>
+#include <linux/cpuset.h>
+#include <linux/err.h>
+#include <linux/errno.h>
+#include <linux/file.h>
+#include <linux/fs.h>
+#include <linux/init.h>
+#include <linux/interrupt.h>
+#include <linux/kernel.h>
+#include <linux/kmod.h>
+#include <linux/list.h>
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/mount.h>
+#include <linux/namei.h>
+#include <linux/pagemap.h>
+#include <linux/proc_fs.h>
+#include <linux/sched.h>
+#include <linux/seq_file.h>
+#include <linux/slab.h>
+#include <linux/smp_lock.h>
+#include <linux/spinlock.h>
+#include <linux/stat.h>
+#include <linux/string.h>
+#include <linux/time.h>
+#include <linux/backing-dev.h>
+
+#include <asm/uaccess.h>
+#include <asm/atomic.h>
+#include <asm/semaphore.h>
+
+#define CPUSET_SUPER_MAGIC 		0x27e0eb
+
+struct cpuset {
+	unsigned long flags;		/* "unsigned long" so bitops work */
+	cpumask_t cpus_allowed;		/* CPUs allowed to tasks in cpuset */
+	nodemask_t mems_allowed;	/* Memory Nodes allowed to tasks */
+
+	atomic_t count;			/* count tasks using this cpuset */
+
+	/*
+	 * We link our 'sibling' struct into our parents 'children'.
+	 * Our children link their 'sibling' into our 'children'.
+	 */
+	struct list_head sibling;	/* my parents children */
+	struct list_head children;	/* my children */
+
+	struct cpuset *parent;		/* my parent */
+	struct dentry *dentry;		/* cpuset fs entry */
+};
+
+/* bits in struct cpuset flags field */
+typedef enum {
+	CS_CPU_EXCLUSIVE,
+	CS_MEM_EXCLUSIVE,
+	CS_REMOVED,
+	CS_NOTIFY_ON_RELEASE
+} cpuset_flagbits_t;
+
+/* convenient tests for these bits */
+static inline int is_cpu_exclusive(const struct cpuset *cs)
+{
+	return !!test_bit(CS_CPU_EXCLUSIVE, &cs->flags);
+}
+
+static inline int is_mem_exclusive(const struct cpuset *cs)
+{
+	return !!test_bit(CS_MEM_EXCLUSIVE, &cs->flags);
+}
+
+static inline int is_removed(const struct cpuset *cs)
+{
+	return !!test_bit(CS_REMOVED, &cs->flags);
+}
+
+static inline int notify_on_release(const struct cpuset *cs)
+{
+	return !!test_bit(CS_NOTIFY_ON_RELEASE, &cs->flags);
+}
+
+static struct cpuset top_cpuset = {
+	.flags = ((1 << CS_CPU_EXCLUSIVE) | (1 << CS_MEM_EXCLUSIVE)),
+	.cpus_allowed = CPU_MASK_ALL,
+	.mems_allowed = NODE_MASK_ALL,
+	.count = ATOMIC_INIT(0),
+	.sibling = LIST_HEAD_INIT(top_cpuset.sibling),
+	.children = LIST_HEAD_INIT(top_cpuset.children),
+	.parent = NULL,
+	.dentry = NULL,
+};
+
+static struct vfsmount *cpuset_mount;
+static struct super_block *cpuset_sb = NULL;
+
+/*
+ * cpuset_sem should be held by anyone who is depending on the children
+ * or sibling lists of any cpuset, or performing non-atomic operations
+ * on the flags or *_allowed values of a cpuset, such as raising the
+ * CS_REMOVED flag bit iff it is not already raised, or reading and
+ * conditionally modifying the *_allowed values.  One kernel global
+ * cpuset semaphore should be sufficient - these things don't change
+ * that much.
+ *
+ * The code that modifies cpusets holds cpuset_sem across the entire
+ * operation, from cpuset_common_file_write() down, single threading
+ * all cpuset modifications (except for counter manipulations from
+ * fork and exit) across the system.  This presumes that cpuset
+ * modifications are rare - better kept simple and safe, even if slow.
+ *
+ * The code that reads cpusets, such as in cpuset_common_file_read()
+ * and below, only holds cpuset_sem across small pieces of code, such
+ * as when reading out possibly multi-word cpumasks and nodemasks, as
+ * the risks are less, and the desire for performance a little greater.
+ * The proc_cpuset_show() routine needs to hold cpuset_sem to ensure
+ * that no cs->dentry is NULL, as it walks up the cpuset tree to root.
+ *
+ * The hooks from fork and exit, cpuset_fork() and cpuset_exit(), don't
+ * (usually) grab cpuset_sem.  These are the two most performance
+ * critical pieces of code here.  The exception occurs on exit(),
+ * if the last task using a cpuset exits, and the cpuset was marked
+ * notify_on_release.  In that case, the cpuset_sem is taken, the
+ * path to the released cpuset calculated, and a usermode call made
+ * to /sbin/cpuset_release_agent with the name of the cpuset (path
+ * relative to the root of cpuset file system) as the argument.
+ *
+ * A cpuset can only be deleted if both its 'count' of using tasks is
+ * zero, and its list of 'children' cpusets is empty.  Since all tasks
+ * in the system use _some_ cpuset, and since there is always at least
+ * one task in the system (init, pid == 1), therefore, top_cpuset
+ * always has either children cpusets and/or using tasks.  So no need
+ * for any special hack to ensure that top_cpuset cannot be deleted.
+ */
+
+static DECLARE_MUTEX(cpuset_sem);
+
+/*
+ * A couple of forward declarations required, due to cyclic reference loop:
+ *  cpuset_mkdir -> cpuset_create -> cpuset_populate_dir -> cpuset_add_file
+ *  -> cpuset_create_file -> cpuset_dir_inode_operations -> cpuset_mkdir.
+ */
+
+static int cpuset_mkdir(struct inode *dir, struct dentry *dentry, int mode);
+static int cpuset_rmdir(struct inode *unused_dir, struct dentry *dentry);
+
+static struct backing_dev_info cpuset_backing_dev_info = {
+	.ra_pages = 0,		/* No readahead */
+	.memory_backed = 1,	/* Does not contribute to dirty memory */
+};
+
+static struct inode *cpuset_new_inode(mode_t mode)
+{
+	struct inode *inode = new_inode(cpuset_sb);
+
+	if (inode) {
+		inode->i_mode = mode;
+		inode->i_uid = current->fsuid;
+		inode->i_gid = current->fsgid;
+		inode->i_blksize = PAGE_CACHE_SIZE;
+		inode->i_blocks = 0;
+		inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+		inode->i_mapping->backing_dev_info = &cpuset_backing_dev_info;
+	}
+	return inode;
+}
+
+static void cpuset_diput(struct dentry *dentry, struct inode *inode)
+{
+	/* is dentry a directory ? if so, kfree() associated cpuset */
+	if (S_ISDIR(inode->i_mode)) {
+		struct cpuset *cs = (struct cpuset *)dentry->d_fsdata;
+		BUG_ON(!(is_removed(cs)));
+		kfree(cs);
+	}
+	iput(inode);
+}
+
+static struct dentry_operations cpuset_dops = {
+	.d_iput = cpuset_diput,
+};
+
+static struct dentry *cpuset_get_dentry(struct dentry *parent, const char *name)
+{
+	struct qstr qstr;
+	struct dentry *d;
+
+	qstr.name = name;
+	qstr.len = strlen(name);
+	qstr.hash = full_name_hash(name, qstr.len);
+	d = lookup_hash(&qstr, parent);
+	if (d)
+		d->d_op = &cpuset_dops;
+	return d;
+}
+
+static void remove_dir(struct dentry *d)
+{
+	struct dentry *parent = dget(d->d_parent);
+
+	d_delete(d);
+	simple_rmdir(parent->d_inode, d);
+	dput(parent);
+}
+
+/*
+ * NOTE : the dentry must have been dget()'ed
+ */
+static void cpuset_d_remove_dir(struct dentry *dentry)
+{
+	struct list_head *node;
+
+	spin_lock(&dcache_lock);
+	node = dentry->d_subdirs.next;
+	while (node != &dentry->d_subdirs) {
+		struct dentry *d = list_entry(node, struct dentry, d_child);
+		list_del_init(node);
+		if (d->d_inode) {
+			d = dget_locked(d);
+			spin_unlock(&dcache_lock);
+			d_delete(d);
+			simple_unlink(dentry->d_inode, d);
+			dput(d);
+			spin_lock(&dcache_lock);
+		}
+		node = dentry->d_subdirs.next;
+	}
+	list_del_init(&dentry->d_child);
+	spin_unlock(&dcache_lock);
+	remove_dir(dentry);
+}
+
+static struct super_operations cpuset_ops = {
+	.statfs = simple_statfs,
+	.drop_inode = generic_delete_inode,
+};
+
+static int cpuset_fill_super(struct super_block *sb, void *unused_data,
+							int unused_silent)
+{
+	struct inode *inode;
+	struct dentry *root;
+
+	sb->s_blocksize = PAGE_CACHE_SIZE;
+	sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
+	sb->s_magic = CPUSET_SUPER_MAGIC;
+	sb->s_op = &cpuset_ops;
+	cpuset_sb = sb;
+
+	inode = cpuset_new_inode(S_IFDIR | S_IRUGO | S_IXUGO | S_IWUSR);
+	if (inode) {
+		inode->i_op = &simple_dir_inode_operations;
+		inode->i_fop = &simple_dir_operations;
+		/* directories start off with i_nlink == 2 (for "." entry) */
+		inode->i_nlink++;
+	} else {
+		return -ENOMEM;
+	}
+
+	root = d_alloc_root(inode);
+	if (!root) {
+		iput(inode);
+		return -ENOMEM;
+	}
+	sb->s_root = root;
+	return 0;
+}
+
+static struct super_block *cpuset_get_sb(struct file_system_type *fs_type,
+					int flags, const char *unused_dev_name,
+					void *data)
+{
+	return get_sb_single(fs_type, flags, data, cpuset_fill_super);
+}
+
+static struct file_system_type cpuset_fs_type = {
+	.name = "cpuset",
+	.get_sb = cpuset_get_sb,
+	.kill_sb = kill_litter_super,
+};
+
+/* struct cftype:
+ *
+ * The files in the cpuset filesystem mostly have a very simple read/write
+ * handling, some common function will take care of it. Nevertheless some cases
+ * (read tasks) are special and therefore I define this structure for every
+ * kind of file.
+ *
+ *
+ * When reading/writing to a file:
+ *	- the cpuset to use in file->f_dentry->d_parent->d_fsdata
+ *	- the 'cftype' of the file is file->f_dentry->d_fsdata
+ */
+
+struct cftype {
+	char *name;
+	int private;
+	int (*open) (struct inode *inode, struct file *file);
+	ssize_t (*read) (struct file *file, char __user *buf, size_t nbytes,
+							loff_t *ppos);
+	int (*write) (struct file *file, const char *buf, size_t nbytes,
+							loff_t *ppos);
+	int (*release) (struct inode *inode, struct file *file);
+};
+
+static inline struct cpuset *__d_cs(struct dentry *dentry)
+{
+	return (struct cpuset *)dentry->d_fsdata;
+}
+
+static inline struct cftype *__d_cft(struct dentry *dentry)
+{
+	return (struct cftype *)dentry->d_fsdata;
+}
+
+/*
+ * Call with cpuset_sem held.  Writes path of cpuset into buf.
+ * Returns 0 on success, -errno on error.
+ */
+
+static int cpuset_path(const struct cpuset *cs, char *buf, int buflen)
+{
+	char *start;
+
+	start = buf + buflen;
+
+	*--start = '\0';
+	for (;;) {
+		int len = cs->dentry->d_name.len;
+		if ((start -= len) < buf)
+			return -ENAMETOOLONG;
+		memcpy(start, cs->dentry->d_name.name, len);
+		cs = cs->parent;
+		if (!cs)
+			break;
+		if (!cs->parent)
+			continue;
+		if (--start < buf)
+			return -ENAMETOOLONG;
+		*start = '/';
+	}
+	memmove(buf, start, buf + buflen - start);
+	return 0;
+}
+
+/*
+ * Notify userspace when a cpuset is released, by running
+ * /sbin/cpuset_release_agent with the name of the cpuset (path
+ * relative to the root of cpuset file system) as the argument.
+ *
+ * Most likely, this user command will try to rmdir this cpuset.
+ *
+ * This races with the possibility that some other task will be
+ * attached to this cpuset before it is removed, or that some other
+ * user task will 'mkdir' a child cpuset of this cpuset.  That's ok.
+ * The presumed 'rmdir' will fail quietly if this cpuset is no longer
+ * unused, and this cpuset will be reprieved from its death sentence,
+ * to continue to serve a useful existence.  Next time it's released,
+ * we will get notified again, if it still has 'notify_on_release' set.
+ *
+ * Note final arg to call_usermodehelper() is 0 - that means
+ * don't wait.  Since we are holding the global cpuset_sem here,
+ * and we are asking another thread (started from keventd) to rmdir a
+ * cpuset, we can't wait - or we'd deadlock with the removing thread
+ * on cpuset_sem.
+ */
+
+static int cpuset_release_agent(char *cpuset_str)
+{
+	char *argv[3], *envp[3];
+	int i;
+
+	i = 0;
+	argv[i++] = "/sbin/cpuset_release_agent";
+	argv[i++] = cpuset_str;
+	argv[i] = NULL;
+
+	i = 0;
+	/* minimal command environment */
+	envp[i++] = "HOME=/";
+	envp[i++] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin";
+	envp[i] = NULL;
+
+	return call_usermodehelper(argv[0], argv, envp, 0);
+}
+
+/*
+ * Either cs->count of using tasks transitioned to zero, or the
+ * cs->children list of child cpusets just became empty.  If this
+ * cs is notify_on_release() and now both the user count is zero and
+ * the list of children is empty, send notice to user land.
+ */
+
+static void check_for_release(struct cpuset *cs)
+{
+	if (notify_on_release(cs) && atomic_read(&cs->count) == 0 &&
+	    list_empty(&cs->children)) {
+		char *buf;
+
+		buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
+		if (!buf)
+			return;
+		if (cpuset_path(cs, buf, PAGE_SIZE) < 0)
+			goto out;
+		cpuset_release_agent(buf);
+	out:
+		kfree(buf);
+	}
+}
+
+/*
+ * is_cpuset_subset(p, q) - Is cpuset p a subset of cpuset q?
+ *
+ * One cpuset is a subset of another if all its allowed CPUs and
+ * Memory Nodes are a subset of the other, and its exclusive flags
+ * are only set if the other's are set.
+ */
+
+static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q)
+{
+	return	cpus_subset(p->cpus_allowed, q->cpus_allowed) &&
+		nodes_subset(p->mems_allowed, q->mems_allowed) &&
+		is_cpu_exclusive(p) <= is_cpu_exclusive(q) &&
+		is_mem_exclusive(p) <= is_mem_exclusive(q);
+}
+
+/*
+ * validate_change() - Used to validate that any proposed cpuset change
+ *		       follows the structural rules for cpusets.
+ *
+ * If we replaced the flag and mask values of the current cpuset
+ * (cur) with those values in the trial cpuset (trial), would
+ * our various subset and exclusive rules still be valid?  Presumes
+ * cpuset_sem held.
+ *
+ * 'cur' is the address of an actual, in-use cpuset.  Operations
+ * such as list traversal that depend on the actual address of the
+ * cpuset in the list must use cur below, not trial.
+ *
+ * 'trial' is the address of bulk structure copy of cur, with
+ * perhaps one or more of the fields cpus_allowed, mems_allowed,
+ * or flags changed to new, trial values.
+ *
+ * Return 0 if valid, -errno if not.
+ */
+
+static int validate_change(const struct cpuset *cur, const struct cpuset *trial)
+{
+	struct cpuset *c, *par = cur->parent;
+
+	/*
+	 * Don't mess with Big Daddy - top_cpuset must remain maximal.
+	 * And besides, the rest of this routine blows chunks if par == 0.
+	 */
+	if (cur == &top_cpuset)
+		return -EPERM;
+
+	/* Any in-use cpuset must have at least ONE cpu and mem */
+	if (atomic_read(&trial->count) > 1) {
+		if (cpus_empty(trial->cpus_allowed))
+			return -ENOSPC;
+		if (nodes_empty(trial->mems_allowed))
+			return -ENOSPC;
+	}
+
+	/* We must be a subset of our parent cpuset */
+	if (!is_cpuset_subset(trial, par))
+		return -EACCES;
+
+	/* Each of our child cpusets must be a subset of us */
+	list_for_each_entry(c, &cur->children, sibling) {
+		if (!is_cpuset_subset(c, trial))
+			return -EBUSY;
+	}
+
+	/* If either I or some sibling (!= me) is exclusive, we can't overlap */
+	list_for_each_entry(c, &par->children, sibling) {
+		if ((is_cpu_exclusive(trial) || is_cpu_exclusive(c)) &&
+		    c != cur &&
+		    cpus_intersects(trial->cpus_allowed, c->cpus_allowed)
+		) {
+			return -EINVAL;
+		}
+		if ((is_mem_exclusive(trial) || is_mem_exclusive(c)) &&
+		    c != cur &&
+		    nodes_intersects(trial->mems_allowed, c->mems_allowed)
+		) {
+			return -EINVAL;
+		}
+	}
+
+	return 0;
+}
+
+static int update_cpumask(struct cpuset *cs, char *buf)
+{
+	struct cpuset trialcs;
+	int retval;
+
+	trialcs = *cs;
+	retval = cpulist_parse(buf, trialcs.cpus_allowed);
+	if (retval < 0)
+		return retval;
+	retval = validate_change(cs, &trialcs);
+	if (retval == 0)
+		cs->cpus_allowed = trialcs.cpus_allowed;
+	return retval;
+}
+
+static int update_nodemask(struct cpuset *cs, char *buf)
+{
+	struct cpuset trialcs;
+	int retval;
+
+	trialcs = *cs;
+	retval = nodelist_parse(buf, trialcs.mems_allowed);
+	if (retval < 0)
+		return retval;
+	retval = validate_change(cs, &trialcs);
+	if (retval == 0)
+		cs->mems_allowed = trialcs.mems_allowed;
+	return retval;
+}
+
+/*
+ * update_flag - read a 0 or a 1 in a file and update associated flag
+ * bit:	the bit to update (CS_CPU_EXCLUSIVE, CS_MEM_EXCLUSIVE,
+ *						CS_NOTIFY_ON_RELEASE)
+ * cs:	the cpuset to update
+ * buf:	the buffer where we read the 0 or 1
+ */
+
+static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, char *buf)
+{
+	int turning_on;
+	struct cpuset trialcs;
+	int err;
+
+	turning_on = (simple_strtoul(buf, NULL, 10) != 0);
+
+	trialcs = *cs;
+	if (turning_on)
+		set_bit(bit, &trialcs.flags);
+	else
+		clear_bit(bit, &trialcs.flags);
+
+	err = validate_change(cs, &trialcs);
+	if (err == 0) {
+		if (turning_on)
+			set_bit(bit, &cs->flags);
+		else
+			clear_bit(bit, &cs->flags);
+	}
+	return err;
+}
+
+static int attach_task(struct cpuset *cs, char *buf)
+{
+	pid_t pid;
+	struct task_struct *tsk;
+	struct cpuset *oldcs;
+
+	if (sscanf(buf, "%d", &pid) != 1)
+		return -EIO;
+	if (cpus_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed))
+		return -ENOSPC;
+
+	if (pid) {
+		read_lock(&tasklist_lock);
+
+		tsk = find_task_by_pid(pid);
+		if (!tsk) {
+			read_unlock(&tasklist_lock);
+			return -ESRCH;
+		}
+
+		get_task_struct(tsk);
+		read_unlock(&tasklist_lock);
+
+		if ((current->euid) && (current->euid != tsk->uid)
+		    && (current->euid != tsk->suid)) {
+			put_task_struct(tsk);
+			return -EACCES;
+		}
+	} else {
+		tsk = current;
+		get_task_struct(tsk);
+	}
+
+	task_lock(tsk);
+	oldcs = tsk->cpuset;
+	if (!oldcs) {
+		task_unlock(tsk);
+		put_task_struct(tsk);
+		return -ESRCH;
+	}
+	atomic_inc(&cs->count);
+	tsk->cpuset = cs;
+	task_unlock(tsk);
+
+	put_task_struct(tsk);
+	if (atomic_dec_and_test(&oldcs->count))
+		check_for_release(oldcs);
+	return 0;
+}
+
+/* The various types of files and directories in a cpuset file system */
+
+typedef enum {
+	FILE_ROOT,
+	FILE_DIR,
+	FILE_CPULIST,
+	FILE_MEMLIST,
+	FILE_CPU_EXCLUSIVE,
+	FILE_MEM_EXCLUSIVE,
+	FILE_NOTIFY_ON_RELEASE,
+	FILE_TASKLIST,
+} cpuset_filetype_t;
+
+static ssize_t cpuset_common_file_write(struct file *file, const char *userbuf,
+					size_t nbytes, loff_t *unused_ppos)
+{
+	struct cpuset *cs = __d_cs(file->f_dentry->d_parent);
+	struct cftype *cft = __d_cft(file->f_dentry);
+	cpuset_filetype_t type = cft->private;
+	char *buffer;
+	int retval = 0;
+
+	/* Crude upper limit on largest legitimate cpulist user might write. */
+	if (nbytes > 100 + 6 * NR_CPUS)
+		return -E2BIG;
+
+	/* +1 for nul-terminator */
+	if ((buffer = kmalloc(nbytes + 1, GFP_KERNEL)) == 0)
+		return -ENOMEM;
+
+	if (copy_from_user(buffer, userbuf, nbytes)) {
+		retval = -EFAULT;
+		goto out1;
+	}
+	buffer[nbytes] = 0;	/* nul-terminate */
+
+	down(&cpuset_sem);
+
+	if (is_removed(cs)) {
+		retval = -ENODEV;
+		goto out2;
+	}
+
+	switch (type) {
+	case FILE_CPULIST:
+		retval = update_cpumask(cs, buffer);
+		break;
+	case FILE_MEMLIST:
+		retval = update_nodemask(cs, buffer);
+		break;
+	case FILE_CPU_EXCLUSIVE:
+		retval = update_flag(CS_CPU_EXCLUSIVE, cs, buffer);
+		break;
+	case FILE_MEM_EXCLUSIVE:
+		retval = update_flag(CS_MEM_EXCLUSIVE, cs, buffer);
+		break;
+	case FILE_NOTIFY_ON_RELEASE:
+		retval = update_flag(CS_NOTIFY_ON_RELEASE, cs, buffer);
+		break;
+	case FILE_TASKLIST:
+		retval = attach_task(cs, buffer);
+		break;
+	default:
+		retval = -EINVAL;
+		goto out2;
+	}
+
+	if (retval == 0)
+		retval = nbytes;
+out2:
+	up(&cpuset_sem);
+out1:
+	kfree(buffer);
+	return retval;
+}
+
+static ssize_t cpuset_file_write(struct file *file, const char *buf,
+						size_t nbytes, loff_t *ppos)
+{
+	ssize_t retval = 0;
+	struct cftype *cft = __d_cft(file->f_dentry);
+	if (!cft)
+		return -ENODEV;
+
+	/* special function ? */
+	if (cft->write)
+		retval = cft->write(file, buf, nbytes, ppos);
+	else
+		retval = cpuset_common_file_write(file, buf, nbytes, ppos);
+
+	return retval;
+}
+
+/*
+ * These ascii lists should be read in a single call, by using a user
+ * buffer large enough to hold the entire map.  If read in smaller
+ * chunks, there is no guarantee of atomicity.  Since the display format
+ * used, list of ranges of sequential numbers, is variable length,
+ * and since these maps can change value dynamically, one could read
+ * gibberish by doing partial reads while a list was changing.
+ * A single large read to a buffer that crosses a page boundary is
+ * ok, because the result being copied to user land is not recomputed
+ * across a page fault.
+ */
+
+static int cpuset_sprintf_cpulist(char *page, struct cpuset *cs)
+{
+	cpumask_t mask;
+
+	down(&cpuset_sem);
+	mask = cs->cpus_allowed;
+	up(&cpuset_sem);
+
+	return cpulist_scnprintf(page, PAGE_SIZE, mask);
+}
+
+static int cpuset_sprintf_memlist(char *page, struct cpuset *cs)
+{
+	nodemask_t mask;
+
+	down(&cpuset_sem);
+	mask = cs->mems_allowed;
+	up(&cpuset_sem);
+
+	return nodelist_scnprintf(page, PAGE_SIZE, mask);
+}
+
+static ssize_t cpuset_common_file_read(struct file *file, char __user *buf,
+				size_t nbytes, loff_t *ppos)
+{
+	struct cftype *cft = __d_cft(file->f_dentry);
+	struct cpuset *cs = __d_cs(file->f_dentry->d_parent);
+	cpuset_filetype_t type = cft->private;
+	char *page;
+	ssize_t retval = 0;
+	char *s;
+	char *start;
+	size_t n;
+
+	if (!(page = (char *)__get_free_page(GFP_KERNEL)))
+		return -ENOMEM;
+
+	s = page;
+
+	switch (type) {
+	case FILE_CPULIST:
+		s += cpuset_sprintf_cpulist(s, cs);
+		break;
+	case FILE_MEMLIST:
+		s += cpuset_sprintf_memlist(s, cs);
+		break;
+	case FILE_CPU_EXCLUSIVE:
+		*s++ = is_cpu_exclusive(cs) ? '1' : '0';
+		break;
+	case FILE_MEM_EXCLUSIVE:
+		*s++ = is_mem_exclusive(cs) ? '1' : '0';
+		break;
+	case FILE_NOTIFY_ON_RELEASE:
+		*s++ = notify_on_release(cs) ? '1' : '0';
+		break;
+	default:
+		retval = -EINVAL;
+		goto out;
+	}
+	*s++ = '\n';
+	*s = '\0';
+
+	start = page + *ppos;
+	n = s - start;
+	retval = n - copy_to_user(buf, start, min(n, nbytes));
+	*ppos += retval;
+out:
+	free_page((unsigned long)page);
+	return retval;
+}
+
+static ssize_t cpuset_file_read(struct file *file, char *buf, size_t nbytes,
+								loff_t *ppos)
+{
+	ssize_t retval = 0;
+	struct cftype *cft = __d_cft(file->f_dentry);
+	if (!cft)
+		return -ENODEV;
+
+	/* special function ? */
+	if (cft->read)
+		retval = cft->read(file, buf, nbytes, ppos);
+	else
+		retval = cpuset_common_file_read(file, buf, nbytes, ppos);
+
+	return retval;
+}
+
+static int cpuset_file_open(struct inode *inode, struct file *file)
+{
+	int err;
+	struct cftype *cft;
+
+	err = generic_file_open(inode, file);
+	if (err)
+		return err;
+
+	cft = __d_cft(file->f_dentry);
+	if (!cft)
+		return -ENODEV;
+	if (cft->open)
+		err = cft->open(inode, file);
+	else
+		err = 0;
+
+	return err;
+}
+
+static int cpuset_file_release(struct inode *inode, struct file *file)
+{
+	struct cftype *cft = __d_cft(file->f_dentry);
+	if (cft->release)
+		return cft->release(inode, file);
+	return 0;
+}
+
+static struct file_operations cpuset_file_operations = {
+	.read = cpuset_file_read,
+	.write = cpuset_file_write,
+	.llseek = generic_file_llseek,
+	.open = cpuset_file_open,
+	.release = cpuset_file_release,
+};
+
+static struct inode_operations cpuset_dir_inode_operations = {
+	.lookup = simple_lookup,
+	.mkdir = cpuset_mkdir,
+	.rmdir = cpuset_rmdir,
+};
+
+static int cpuset_create_file(struct dentry *dentry, int mode)
+{
+	struct inode *inode;
+
+	if (!dentry)
+		return -ENOENT;
+	if (dentry->d_inode)
+		return -EEXIST;
+
+	inode = cpuset_new_inode(mode);
+	if (!inode)
+		return -ENOMEM;
+
+	if (S_ISDIR(mode)) {
+		inode->i_op = &cpuset_dir_inode_operations;
+		inode->i_fop = &simple_dir_operations;
+
+		/* start off with i_nlink == 2 (for "." entry) */
+		inode->i_nlink++;
+	} else if (S_ISREG(mode)) {
+		inode->i_size = 0;
+		inode->i_fop = &cpuset_file_operations;
+	}
+
+	d_instantiate(dentry, inode);
+	dget(dentry);	/* Extra count - pin the dentry in core */
+	return 0;
+}
+
+/*
+ *	cpuset_create_dir - create a directory for an object.
+ *	cs: 	the cpuset we create the directory for.
+ *		It must have a valid ->parent field,
+ *		and we will fill in its ->dentry field.
+ *	name:	The name to give to the cpuset directory. Will be copied.
+ *	mode:	mode to set on new directory.
+ */
+
+static int cpuset_create_dir(struct cpuset *cs, const char *name, int mode)
+{
+	struct dentry *dentry = NULL;
+	struct dentry *parent;
+	int error = 0;
+
+	parent = cs->parent->dentry;
+	dentry = cpuset_get_dentry(parent, name);
+	if (IS_ERR(dentry))
+		return PTR_ERR(dentry);
+	error = cpuset_create_file(dentry, S_IFDIR | mode);
+	if (!error) {
+		dentry->d_fsdata = cs;
+		parent->d_inode->i_nlink++;
+		cs->dentry = dentry;
+	}
+	dput(dentry);
+
+	return error;
+}
+
+/* MUST be called with dir->d_inode->i_sem held */
+
+static int cpuset_add_file(struct dentry *dir, const struct cftype *cft)
+{
+	struct dentry *dentry;
+	int error;
+
+	dentry = cpuset_get_dentry(dir, cft->name);
+	if (!IS_ERR(dentry)) {
+		error = cpuset_create_file(dentry, 0644 | S_IFREG);
+		if (!error)
+			dentry->d_fsdata = (void *)cft;
+		dput(dentry);
+	} else
+		error = PTR_ERR(dentry);
+	return error;
+}
+
+/*
+ * Stuff for reading the 'tasks' file.
+ *
+ * Reading this file can return large amounts of data if a cpuset has
+ * *lots* of attached tasks. So it may need several calls to read(),
+ * but we cannot guarantee that the information we produce is correct
+ * unless we produce it entirely atomically.
+ *
+ * Upon first file read(), a struct ctr_struct is allocated, that
+ * will have a pointer to an array (also allocated here).  The struct
+ * ctr_struct * is stored in file->private_data.  Its resources will
+ * be freed by release() when the file is closed.  The array is used
+ * to sprintf the PIDs and then used by read().
+ */
+
+/* cpusets_tasks_read array */
+
+struct ctr_struct {
+	int *array;
+	int count;
+};
+
+static struct ctr_struct *cpuset_tasks_mkctr(struct file *file)
+{
+	struct cpuset *cs = __d_cs(file->f_dentry->d_parent);
+	struct ctr_struct *ctr;
+	pid_t *array;
+	int n, max;
+	pid_t i, j, last;
+	struct task_struct *g, *p;
+
+	ctr = kmalloc(sizeof(*ctr), GFP_KERNEL);
+	if (!ctr)
+		return NULL;
+
+	/*
+	 * If cpuset gets more users after we read count, we won't have
+	 * enough space - tough.  This race is indistinguishable to the
+	 * caller from the case that the additional cpuset users didn't
+	 * show up until sometime later on.  Grabbing cpuset_sem would
+	 * not help, because cpuset_fork() doesn't grab cpuset_sem.
+	 */
+
+	max = atomic_read(&cs->count);
+	array = kmalloc(max * sizeof(pid_t), GFP_KERNEL);
+	if (!array) {
+		kfree(ctr);
+		return NULL;
+	}
+
+	n = 0;
+	read_lock(&tasklist_lock);
+	do_each_thread(g, p) {
+		if (p->cpuset == cs) {
+			array[n++] = p->pid;
+			if (unlikely(n == max))
+				goto array_full;
+		}
+	}
+	while_each_thread(g, p);
+array_full:
+	read_unlock(&tasklist_lock);
+
+	/* stupid bubble sort */
+	for (i = 0; i < n - 1; i++) {
+		for (j = 0; j < n - 1 - i; j++)
+			if (array[j + 1] < array[j]) {
+				pid_t tmp = array[j];
+				array[j] = array[j + 1];
+				array[j + 1] = tmp;
+			}
+	}
+
+	/*
+	 * Collapse sorted array by grouping consecutive pids.
+	 * A range of pids is encoded by negating its second (upper) pid.
+	 * Read from array[i]; write to array[j]; j <= i always.
+	 */
+	last = array[0];  /* any value != array[0] - 1 */
+	j = -1;
+	for (i = 0; i < n; i++) {
+		pid_t curr = array[i];
+		/* consecutive pids ? */
+		if (curr - last == 1) {
+			/* move destination index if it has not been done */
+			if (array[j] > 0)
+				j++;
+			array[j] = -curr;
+		} else
+			array[++j] = curr;
+		last = curr;
+	}
+
+	ctr->array = array;
+	ctr->count = j + 1;
+	file->private_data = (void *)ctr;
+	return ctr;
+}
+
+/* printf one pid from an array.  The formatting differs depending on
+ * whether the pid is positive or negative (the second pid of a range),
+ * and on whether it is the first or the last pid in the array.
+ */
+static int array_pid_sprintf(char *buf, pid_t *array, int idx, int last)
+{
+	pid_t v = array[idx];
+	int l = 0;
+
+	if (v < 0) {		/* second pid of a range of pids */
+		v = -v;
+		buf[l++] = '-';
+	} else {		/* first pid of a range, or not a range */
+		if (idx)	/* comma only if it's not the first */
+			buf[l++] = ',';
+	}
+	l += sprintf(buf + l, "%d", v);
+	/* newline after last record */
+	if (idx == last)
+		l += sprintf(buf + l, "\n");
+	return l;
+}
+
+static ssize_t cpuset_tasks_read(struct file *file, char __user *buf,
+						size_t nbytes, loff_t *ppos)
+{
+	struct ctr_struct *ctr = (struct ctr_struct *)file->private_data;
+	int *array, nr_pids, i;
+	size_t len, lastlen = 0;
+	char *page;
+
+	/* allocate buffer and fill it on first call to read() */
+	if (!ctr) {
+		ctr = cpuset_tasks_mkctr(file);
+		if (!ctr)
+			return -ENOMEM;
+	}
+
+	array = ctr->array;
+	nr_pids = ctr->count;
+
+	if (!(page = (char *)__get_free_page(GFP_KERNEL)))
+		return -ENOMEM;
+
+	i = *ppos;		/* index of pid being printed */
+	len = 0;		/* length of data sprintf'ed in the page */
+
+	while ((len < PAGE_SIZE - 10) && (i < nr_pids) && (len < nbytes)) {
+		lastlen = array_pid_sprintf(page + len, array, i++, nr_pids - 1);
+		len += lastlen;
+	}
+
+	/* if we wrote too much, remove last record */
+	if (len > nbytes) {
+		len -= lastlen;
+		i--;
+	}
+
+	*ppos = i;
+
+	if (copy_to_user(buf, page, len))
+		len = -EFAULT;
+	free_page((unsigned long)page);
+	return len;
+}
+
+static int cpuset_tasks_release(struct inode *unused_inode, struct file *file)
+{
+	struct ctr_struct *ctr;
+
+	/* we have nothing to do if no read-access is needed */
+	if (!(file->f_mode & FMODE_READ))
+		return 0;
+
+	ctr = (struct ctr_struct *)file->private_data;
+	kfree(ctr->array);
+	kfree(ctr);
+	return 0;
+}
+
+/*
+ * for the common functions, 'private' gives the type of file
+ */
+
+static struct cftype cft_tasks = {
+	.name = "tasks",
+	.read = cpuset_tasks_read,
+	.release = cpuset_tasks_release,
+	.private = FILE_TASKLIST,
+};
+
+static struct cftype cft_cpus = {
+	.name = "cpus",
+	.private = FILE_CPULIST,
+};
+
+static struct cftype cft_mems = {
+	.name = "mems",
+	.private = FILE_MEMLIST,
+};
+
+static struct cftype cft_cpu_exclusive = {
+	.name = "cpu_exclusive",
+	.private = FILE_CPU_EXCLUSIVE,
+};
+
+static struct cftype cft_mem_exclusive = {
+	.name = "mem_exclusive",
+	.private = FILE_MEM_EXCLUSIVE,
+};
+
+static struct cftype cft_notify_on_release = {
+	.name = "notify_on_release",
+	.private = FILE_NOTIFY_ON_RELEASE,
+};
+
+/* MUST be called with ->d_inode->i_sem held */
+static int cpuset_populate_dir(struct dentry *cs_dentry)
+{
+	int err;
+
+	if ((err = cpuset_add_file(cs_dentry, &cft_cpus)) < 0)
+		return err;
+	if ((err = cpuset_add_file(cs_dentry, &cft_mems)) < 0)
+		return err;
+	if ((err = cpuset_add_file(cs_dentry, &cft_cpu_exclusive)) < 0)
+		return err;
+	if ((err = cpuset_add_file(cs_dentry, &cft_mem_exclusive)) < 0)
+		return err;
+	if ((err = cpuset_add_file(cs_dentry, &cft_notify_on_release)) < 0)
+		return err;
+	if ((err = cpuset_add_file(cs_dentry, &cft_tasks)) < 0)
+		return err;
+	return 0;
+}
+
+/*
+ *	cpuset_create - create a cpuset
+ *	parent:	cpuset that will be parent of the new cpuset.
+ *	name:		name of the new cpuset. Will be strcpy'ed.
+ *	mode:		mode to set on new inode
+ *
+ *	Must be called with the semaphore on the parent inode held
+ */
+
+static long cpuset_create(struct cpuset *parent, const char *name, int mode)
+{
+	struct cpuset *cs;
+	int err;
+
+	cs = kmalloc(sizeof(*cs), GFP_KERNEL);
+	if (!cs)
+		return -ENOMEM;
+
+	down(&cpuset_sem);
+	cs->flags = 0;
+	if (notify_on_release(parent))
+		set_bit(CS_NOTIFY_ON_RELEASE, &cs->flags);
+	cs->cpus_allowed = parent->cpus_allowed;
+	cs->mems_allowed = parent->mems_allowed;
+	atomic_set(&cs->count, 0);
+	INIT_LIST_HEAD(&cs->sibling);
+	INIT_LIST_HEAD(&cs->children);
+
+	cs->parent = parent;
+
+	list_add(&cs->sibling, &cs->parent->children);
+
+	err = cpuset_create_dir(cs, name, mode);
+	if (err < 0)
+		goto err;
+	err = cpuset_populate_dir(cs->dentry);
+	/* If err < 0, we have a half-filled directory - oh well ;) */
+	up(&cpuset_sem);
+	return 0;
+err:
+	list_del(&cs->sibling);
+	up(&cpuset_sem);
+	kfree(cs);
+	return err;
+}
+
+static int cpuset_mkdir(struct inode *dir, struct dentry *dentry, int mode)
+{
+	struct dentry *d_parent = dentry->d_parent;
+	struct cpuset *c_parent = (struct cpuset *)d_parent->d_fsdata;
+
+	/* the vfs holds inode->i_sem already */
+	return cpuset_create(c_parent, dentry->d_name.name, mode | S_IFDIR);
+}
+
+static int cpuset_rmdir(struct inode *unused_dir, struct dentry *dentry)
+{
+	struct cpuset *cs = (struct cpuset *)dentry->d_fsdata;
+	struct dentry *d;
+	struct cpuset *parent;
+
+	/* the vfs holds both inode->i_sem already */
+
+	down(&cpuset_sem);
+	if (atomic_read(&cs->count) > 0) {
+		up(&cpuset_sem);
+		return -EBUSY;
+	}
+	if (!list_empty(&cs->children)) {
+		up(&cpuset_sem);
+		return -EBUSY;
+	}
+	spin_lock(&cs->dentry->d_lock);
+	parent = cs->parent;
+	set_bit(CS_REMOVED, &cs->flags);
+	list_del(&cs->sibling);	/* delete my sibling from parent->children */
+	if (list_empty(&parent->children))
+		check_for_release(parent);
+	d = dget(cs->dentry);
+	cs->dentry = NULL;
+	spin_unlock(&d->d_lock);
+	cpuset_d_remove_dir(d);
+	dput(d);
+	up(&cpuset_sem);
+	return 0;
+}
+
+/**
+ * cpuset_init - initialize cpusets at system boot
+ *
+ * Description: Initialize top_cpuset and the cpuset internal file system.
+ **/
+
+int __init cpuset_init(void)
+{
+	struct dentry *root;
+	int err;
+
+	top_cpuset.cpus_allowed = cpu_possible_map;
+	top_cpuset.mems_allowed = node_possible_map;
+
+	init_task.cpuset = &top_cpuset;
+
+	err = register_filesystem(&cpuset_fs_type);
+	if (err < 0)
+		goto out;
+	cpuset_mount = kern_mount(&cpuset_fs_type);
+	if (IS_ERR(cpuset_mount)) {
+		printk(KERN_ERR "cpuset: could not mount!\n");
+		err = PTR_ERR(cpuset_mount);
+		cpuset_mount = NULL;
+		goto out;
+	}
+	root = cpuset_mount->mnt_sb->s_root;
+	root->d_fsdata = &top_cpuset;
+	root->d_inode->i_nlink++;
+	top_cpuset.dentry = root;
+	root->d_inode->i_op = &cpuset_dir_inode_operations;
+	err = cpuset_populate_dir(root);
+out:
+	return err;
+}
+
+/**
+ * cpuset_fork - attach newly forked task to its parent's cpuset.
+ * @tsk: pointer to task_struct of the newly forked task.
+ *
+ * Description: By default, on fork, a task inherits its
+ * parent's cpuset.  The pointer to the shared cpuset is
+ * automatically copied in fork.c by dup_task_struct().
+ * This cpuset_fork() routine need only increment the usage
+ * counter in that cpuset.
+ **/
+
+void cpuset_fork(struct task_struct *tsk)
+{
+	atomic_inc(&tsk->cpuset->count);
+}
+
+/**
+ * cpuset_exit - detach cpuset from exiting task
+ * @tsk: pointer to task_struct of exiting process
+ *
+ * Description: Detach cpuset from @tsk and release it.
+ *
+ **/
+
+void cpuset_exit(struct task_struct *tsk)
+{
+	struct cpuset *cs;
+
+	task_lock(tsk);
+	cs = tsk->cpuset;
+	tsk->cpuset = NULL;
+	task_unlock(tsk);
+
+	if (atomic_dec_and_test(&cs->count)) {
+		down(&cpuset_sem);	
+		check_for_release(cs);
+		up(&cpuset_sem);
+	}
+}
+
+/**
+ * cpuset_cpus_allowed - return cpus_allowed mask from a task's cpuset.
+ * @tsk: pointer to task_struct from which to obtain cpuset->cpus_allowed.
+ *
+ * Description: Returns the cpumask_t cpus_allowed of the cpuset
+ * attached to the specified @tsk.
+ **/
+
+const cpumask_t cpuset_cpus_allowed(const struct task_struct *tsk)
+{
+	cpumask_t mask;
+
+	down(&cpuset_sem);
+	task_lock((struct task_struct *)tsk);
+	if (tsk->cpuset)
+		mask = tsk->cpuset->cpus_allowed;
+	else
+		mask = CPU_MASK_ALL;
+	task_unlock((struct task_struct *)tsk);
+	up(&cpuset_sem);
+
+	return mask;
+}
+
+/**
+ * cpuset_mems_allowed - return mems_allowed mask from a task's cpuset.
+ * @tsk: pointer to task_struct from which to obtain cpuset->mems_allowed.
+ *
+ * Description: Returns the nodemask_t mems_allowed of the cpuset
+ * attached to the specified @tsk.
+ **/
+
+const nodemask_t cpuset_mems_allowed(const struct task_struct *tsk)
+{
+	nodemask_t mask;
+
+	down(&cpuset_sem);
+	task_lock((struct task_struct *)tsk);
+	if (tsk->cpuset)
+		mask = tsk->cpuset->mems_allowed;
+	else
+		mask = NODE_MASK_ALL;
+	task_unlock((struct task_struct *)tsk);
+	up(&cpuset_sem);
+
+	return mask;
+}
+
+void cpuset_init_current_mems_allowed(void)
+{
+	current->mems_allowed = NODE_MASK_ALL;
+}
+
+void cpuset_update_current_mems_allowed(void)
+{
+	current->mems_allowed = cpuset_mems_allowed(current);
+}
+
+void cpuset_restrict_to_mems_allowed(unsigned long *nodes)
+{
+	bitmap_and(nodes, nodes, nodes_addr(current->mems_allowed),
+							MAX_NUMNODES);
+}
+
+/*
+ * Are any of the nodes on zonelist zl allowed in current->mems_allowed?
+ */
+int cpuset_zonelist_valid_mems_allowed(struct zonelist *zl)
+{
+	int i;
+
+	for (i = 0; zl->zones[i]; i++) {
+		int nid = zl->zones[i]->zone_pgdat->node_id;
+
+		if (node_isset(nid, current->mems_allowed))
+			return 1;
+	}
+	return 0;
+}
+
+/*
+ * Is 'current' valid, and is zone z allowed in current->mems_allowed?
+ */
+int cpuset_zone_allowed(struct zone *z)
+{
+	return in_interrupt() ||
+		node_isset(z->zone_pgdat->node_id, current->mems_allowed);
+}
+
+/*
+ * proc_cpuset_show()
+ *  - Print task's cpuset path into seq_file.
+ *  - Used for /proc/<pid>/cpuset.
+ */
+
+static int proc_cpuset_show(struct seq_file *m, void *v)
+{
+	struct cpuset *cs;
+	struct task_struct *tsk;
+	char *buf;
+	int retval = 0;
+
+	buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
+	if (!buf)
+		return -ENOMEM;
+
+	tsk = m->private;
+	down(&cpuset_sem);
+	task_lock(tsk);
+	cs = tsk->cpuset;
+	task_unlock(tsk);
+	if (!cs) {
+		retval = -EINVAL;
+		goto out;
+	}
+
+	retval = cpuset_path(cs, buf, PAGE_SIZE);
+	if (retval < 0)
+		goto out;
+	seq_puts(m, buf);
+	seq_putc(m, '\n');
+out:
+	up(&cpuset_sem);
+	kfree(buf);
+	return retval;
+}
+
+static int cpuset_open(struct inode *inode, struct file *file)
+{
+	struct task_struct *tsk = PROC_I(inode)->task;
+	return single_open(file, proc_cpuset_show, tsk);
+}
+
+struct file_operations proc_cpuset_operations = {
+	.open		= cpuset_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+};
+EXPORT_SYMBOL(proc_cpuset_operations);
Index: 2.6.8-rc2-mm2/kernel/exit.c
===================================================================
--- 2.6.8-rc2-mm2.orig/kernel/exit.c	2004-08-04 21:44:05.000000000 -0700
+++ 2.6.8-rc2-mm2/kernel/exit.c	2004-08-04 21:44:49.000000000 -0700
@@ -29,6 +29,7 @@
 #include <asm/unistd.h>
 #include <asm/pgtable.h>
 #include <asm/mmu_context.h>
+#include <linux/cpuset.h>
 
 extern void sem_exit (void);
 extern struct task_struct *child_reaper;
@@ -829,6 +830,7 @@ asmlinkage NORET_TYPE void do_exit(long 
 	__exit_fs(tsk);
 	exit_namespace(tsk);
 	exit_thread();
+	cpuset_exit(tsk);
 
 	if (tsk->signal->leader)
 		disassociate_ctty(1);
Index: 2.6.8-rc2-mm2/kernel/fork.c
===================================================================
--- 2.6.8-rc2-mm2.orig/kernel/fork.c	2004-08-04 21:44:05.000000000 -0700
+++ 2.6.8-rc2-mm2/kernel/fork.c	2004-08-04 21:44:49.000000000 -0700
@@ -38,6 +38,7 @@
 #include <linux/audit.h>
 #include <linux/rmap.h>
 #include <linux/pagg.h>
+#include <linux/cpuset.h>
 
 #include <asm/pgtable.h>
 #include <asm/pgalloc.h>
@@ -1106,6 +1107,8 @@ struct task_struct *copy_process(unsigne
 	if (p->ptrace & PT_PTRACED)
 		__ptrace_link(p, current->parent);
 
+	cpuset_fork(p);
+
 	attach_pid(p, PIDTYPE_PID, p->pid);
 	if (thread_group_leader(p)) {
 		attach_pid(p, PIDTYPE_TGID, p->tgid);
Index: 2.6.8-rc2-mm2/kernel/sched.c
===================================================================
--- 2.6.8-rc2-mm2.orig/kernel/sched.c	2004-08-04 21:44:05.000000000 -0700
+++ 2.6.8-rc2-mm2/kernel/sched.c	2004-08-04 21:44:49.000000000 -0700
@@ -43,6 +43,7 @@
 #include <linux/percpu.h>
 #include <linux/perfctr.h>
 #include <linux/kthread.h>
+#include <linux/cpuset.h>
 #include <asm/tlb.h>
 
 #include <asm/unistd.h>
@@ -2537,7 +2538,7 @@ out_unlock:
 asmlinkage long sys_sched_setaffinity(pid_t pid, unsigned int len,
 				      unsigned long __user *user_mask_ptr)
 {
-	cpumask_t new_mask;
+	cpumask_t new_mask, cpus_allowed;
 	int retval;
 	task_t *p;
 
@@ -2570,6 +2571,8 @@ asmlinkage long sys_sched_setaffinity(pi
 			!capable(CAP_SYS_NICE))
 		goto out_unlock;
 
+	cpus_allowed = cpuset_cpus_allowed(p);
+	cpus_and(new_mask, new_mask, cpus_allowed);
 	retval = set_cpus_allowed(p, new_mask);
 
 out_unlock:
@@ -3138,7 +3141,9 @@ static void migrate_all_tasks(int src_cp
 		if (dest_cpu == NR_CPUS)
 			dest_cpu = any_online_cpu(tsk->cpus_allowed);
 		if (dest_cpu == NR_CPUS) {
-			cpus_setall(tsk->cpus_allowed);
+			tsk->cpus_allowed = cpuset_cpus_allowed(tsk);
+			if (!cpus_intersects(tsk->cpus_allowed, cpu_online_map))
+				cpus_setall(tsk->cpus_allowed);
 			dest_cpu = any_online_cpu(tsk->cpus_allowed);
 
 			/* Don't tell them about moving exiting tasks
Index: 2.6.8-rc2-mm2/mm/mempolicy.c
===================================================================
--- 2.6.8-rc2-mm2.orig/mm/mempolicy.c	2004-08-04 21:44:05.000000000 -0700
+++ 2.6.8-rc2-mm2/mm/mempolicy.c	2004-08-04 21:44:49.000000000 -0700
@@ -67,6 +67,7 @@
 #include <linux/sched.h>
 #include <linux/mm.h>
 #include <linux/nodemask.h>
+#include <linux/cpuset.h>
 #include <linux/gfp.h>
 #include <linux/slab.h>
 #include <linux/string.h>
@@ -164,6 +165,10 @@ static int get_nodes(unsigned long *node
 	if (copy_from_user(nodes, nmask, nlongs*sizeof(unsigned long)))
 		return -EFAULT;
 	nodes[nlongs-1] &= endmask;
+	/* Update current mems_allowed */
+	cpuset_update_current_mems_allowed();
+	/* Ignore nodes not set in current->mems_allowed */
+	cpuset_restrict_to_mems_allowed(nodes);
 	return mpol_check_policy(mode, nodes);
 }
 
@@ -574,8 +579,10 @@ static struct zonelist *zonelist_policy(
 		break;
 	case MPOL_BIND:
 		/* Lower zones don't get a policy applied */
+		/* Careful: current->mems_allowed might have moved */
 		if (gfp >= policy_zone)
-			return policy->v.zonelist;
+			if (cpuset_zonelist_valid_mems_allowed(policy->v.zonelist))
+				return policy->v.zonelist;
 		/*FALL THROUGH*/
 	case MPOL_INTERLEAVE: /* should not happen */
 	case MPOL_DEFAULT:
Index: 2.6.8-rc2-mm2/mm/page_alloc.c
===================================================================
--- 2.6.8-rc2-mm2.orig/mm/page_alloc.c	2004-08-04 21:44:05.000000000 -0700
+++ 2.6.8-rc2-mm2/mm/page_alloc.c	2004-08-04 21:44:49.000000000 -0700
@@ -31,6 +31,7 @@
 #include <linux/topology.h>
 #include <linux/sysctl.h>
 #include <linux/cpu.h>
+#include <linux/cpuset.h>
 #include <linux/nodemask.h>
 
 #include <asm/tlbflush.h>
@@ -626,6 +627,9 @@ __alloc_pages(unsigned int gfp_mask, uns
 	for (i = 0; zones[i] != NULL; i++) {
 		struct zone *z = zones[i];
 
+		if (!cpuset_zone_allowed(z))
+			continue;
+
 		min = (1<<order) + z->protection[alloc_type];
 
 		/*
@@ -653,6 +657,9 @@ __alloc_pages(unsigned int gfp_mask, uns
 	for (i = 0; zones[i] != NULL; i++) {
 		struct zone *z = zones[i];
 
+		if (!cpuset_zone_allowed(z))
+			continue;
+
 		min = (1<<order) + z->protection[alloc_type];
 
 		if (gfp_mask & __GFP_HIGH)
@@ -678,6 +685,9 @@ rebalance:
 		for (i = 0; zones[i] != NULL; i++) {
 			struct zone *z = zones[i];
 
+			if (!cpuset_zone_allowed(z))
+				continue;
+
 			page = buffered_rmqueue(z, order, gfp_mask);
 			if (page) {
 				zone_statistics(zonelist, z);
@@ -704,6 +714,9 @@ rebalance:
 	for (i = 0; zones[i] != NULL; i++) {
 		struct zone *z = zones[i];
 
+		if (!cpuset_zone_allowed(z))
+			continue;
+
 		min = (1UL << order) + z->protection[alloc_type];
 
 		if (z->free_pages >= min ||
@@ -1315,6 +1328,7 @@ void __init build_all_zonelists(void)
 	for(i = 0 ; i < numnodes ; i++)
 		build_zonelists(NODE_DATA(i));
 	printk("Built %i zonelists\n", numnodes);
+	cpuset_init_current_mems_allowed();
 }
 
 /*
Index: 2.6.8-rc2-mm2/mm/vmscan.c
===================================================================
--- 2.6.8-rc2-mm2.orig/mm/vmscan.c	2004-08-04 21:44:05.000000000 -0700
+++ 2.6.8-rc2-mm2/mm/vmscan.c	2004-08-04 21:44:49.000000000 -0700
@@ -31,6 +31,7 @@
 #include <linux/rmap.h>
 #include <linux/topology.h>
 #include <linux/cpu.h>
+#include <linux/cpuset.h>
 #include <linux/notifier.h>
 #include <linux/rwsem.h>
 
@@ -874,6 +875,9 @@ shrink_caches(struct zone **zones, struc
 	for (i = 0; zones[i] != NULL; i++) {
 		struct zone *zone = zones[i];
 
+		if (!cpuset_zone_allowed(zone))
+			continue;
+
 		zone->temp_priority = sc->priority;
 		if (zone->prev_priority > sc->priority)
 			zone->prev_priority = sc->priority;
@@ -917,6 +921,9 @@ int try_to_free_pages(struct zone **zone
 	for (i = 0; zones[i] != NULL; i++) {
 		struct zone *zone = zones[i];
 
+		if (!cpuset_zone_allowed(zone))
+			continue;
+
 		zone->temp_priority = DEF_PRIORITY;
 		lru_pages += zone->nr_active + zone->nr_inactive;
 	}
@@ -958,8 +965,14 @@ int try_to_free_pages(struct zone **zone
 	if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY))
 		out_of_memory(gfp_mask);
 out:
-	for (i = 0; zones[i] != 0; i++)
-		zones[i]->prev_priority = zones[i]->temp_priority;
+	for (i = 0; zones[i] != 0; i++) {
+		struct zone *zone = zones[i];
+
+		if (!cpuset_zone_allowed(zone))
+			continue;
+
+		zone->prev_priority = zone->temp_priority;
+	}
 	return ret;
 }
 
@@ -1167,6 +1180,8 @@ void wakeup_kswapd(struct zone *zone)
 {
 	if (zone->free_pages > zone->pages_low)
 		return;
+	if (!cpuset_zone_allowed(zone))
+		return;
 	if (!waitqueue_active(&zone->zone_pgdat->kswapd_wait))
 		return;
 	wake_up_interruptible(&zone->zone_pgdat->kswapd_wait);

-- 
                          I won't rest till it's the best ...
                          Programmer, Linux Scalability
                          Paul Jackson <pj@sgi.com> 1.650.933.1373

^ permalink raw reply	[flat|nested] 233+ messages in thread

* Re: [Lse-tech] [PATCH] new bitmap list format (for cpusets)
  2004-08-05 10:08 [PATCH] new bitmap list format (for cpusets) Paul Jackson
  2004-08-05 10:10 ` [PATCH] cpusets - big numa cpu and memory placement Paul Jackson
@ 2004-08-05 20:47 ` Martin J. Bligh
  2004-08-05 21:45   ` Paul Jackson
  2004-08-09  8:01   ` Paul Jackson
  2004-08-11 13:11 ` Dinakar Guniguntala
  2 siblings, 2 replies; 233+ messages in thread
From: Martin J. Bligh @ 2004-08-05 20:47 UTC (permalink / raw)
  To: Paul Jackson, Andrew Morton
  Cc: Christoph Hellwig, Jack Steiner, Jesse Barnes, Sylvain Jeaugey,
	Dan Higgins, linux-kernel, Matthew Dobson, Simon Derr,
	Andi Kleen, lse-tech, Dimitri Sivanich

Can't we just do this up in userspace, with some manipulation tools 
if you really have that many CPUs? I'm not convinced it makes sense
to make the kernel interface that complicated ...

m.

--On Thursday, August 05, 2004 03:08:47 -0700 Paul Jackson <pj@sgi.com> wrote:

> A bitmap print and parse format that provides lists of ranges of
> numbers, to be first used for by cpusets (next patch).
> 
> Cpusets provide a way to manage subsets of CPUs and Memory Nodes
> for scheduling and memory placement, via a new virtual file system,
> usually mounted at /dev/cpuset.  Manipulation of cpusets can be done
> directly via this file system, from the shell.
> 
> However, manipulating 512 bit cpumasks or 256 bit nodemasks (which
> will get bigger) via hex mask strings is painful for humans.
> 
> The intention is to provide a format for the cpu and memory mask files
> in /dev/cpusets that will stand the test of time.  This format is
> supported by a couple of new lib/bitmap.c routines, for printing and
> parsing these strings.  Wrappers for cpumask and nodemask are provided.
> 
> See the embedded comments, below in the patch, for more details of
> the format.  The input format supports adding or removing specified
> cpus or nodes, as well as entirely rewriting the mask.
> 
>  include/linux/bitmap.h   |    8 ++
>  include/linux/cpumask.h  |   22 ++++++-
>  include/linux/nodemask.h |   22 ++++++-
>  lib/bitmap.c             |  142 +++++++++++++++++++++++++++++++++++++++++++++++
>  4 files changed, 189 insertions(+), 5 deletions(-)
> 
> Signed-off-by: Paul Jackson <pj@sgi.com>
> 
> Index: 2.6.8-rc2-mm2/include/linux/bitmap.h
> ===================================================================
> --- 2.6.8-rc2-mm2.orig/include/linux/bitmap.h	2004-08-04 19:29:15.000000000 -0700
> +++ 2.6.8-rc2-mm2/include/linux/bitmap.h	2004-08-04 19:41:10.000000000 -0700
> @@ -41,7 +41,9 @@
>   * bitmap_shift_right(dst, src, n, nbits)	*dst = *src >> n
>   * bitmap_shift_left(dst, src, n, nbits)	*dst = *src << n
>   * bitmap_scnprintf(buf, len, src, nbits)	Print bitmap src to buf
> - * bitmap_parse(ubuf, ulen, dst, nbits)		Parse bitmap dst from buf
> + * bitmap_parse(ubuf, ulen, dst, nbits)		Parse bitmap dst from user buf
> + * bitmap_scnlistprintf(buf, len, src, nbits)	Print bitmap src as list to buf
> + * bitmap_parselist(buf, dst, nbits)		Parse bitmap dst from list
>   */
>  
>  /*
> @@ -98,6 +100,10 @@ extern int bitmap_scnprintf(char *buf, u
>  			const unsigned long *src, int nbits);
>  extern int bitmap_parse(const char __user *ubuf, unsigned int ulen,
>  			unsigned long *dst, int nbits);
> +extern int bitmap_scnlistprintf(char *buf, unsigned int len,
> +			const unsigned long *src, int nbits);
> +extern int bitmap_parselist(const char *buf, unsigned long *maskp,
> +			int nmaskbits);
>  extern int bitmap_find_free_region(unsigned long *bitmap, int bits, int order);
>  extern void bitmap_release_region(unsigned long *bitmap, int pos, int order);
>  extern int bitmap_allocate_region(unsigned long *bitmap, int pos, int order);
> Index: 2.6.8-rc2-mm2/include/linux/cpumask.h
> ===================================================================
> --- 2.6.8-rc2-mm2.orig/include/linux/cpumask.h	2004-08-04 19:29:34.000000000 -0700
> +++ 2.6.8-rc2-mm2/include/linux/cpumask.h	2004-08-04 20:35:10.000000000 -0700
> @@ -10,6 +10,8 @@
>   *
>   * For details of cpumask_scnprintf() and cpumask_parse(),
>   * see bitmap_scnprintf() and bitmap_parse() in lib/bitmap.c.
> + * For details of cpulist_scnprintf() and cpulist_parse(), see
> + * bitmap_scnlistprintf() and bitmap_parselist(), also in bitmap.c.
>   *
>   * The available cpumask operations are:
>   *
> @@ -46,6 +48,8 @@
>   *
>   * int cpumask_scnprintf(buf, len, mask) Format cpumask for printing
>   * int cpumask_parse(ubuf, ulen, mask)	Parse ascii string as cpumask
> + * int cpulist_scnprintf(buf, len, mask) Format cpumask as list for printing
> + * int cpulist_parse(buf, map)		Parse ascii string as cpulist
>   *
>   * for_each_cpu_mask(cpu, mask)		for-loop cpu over mask
>   *
> @@ -268,14 +272,28 @@ static inline int __cpumask_scnprintf(ch
>  	return bitmap_scnprintf(buf, len, srcp->bits, nbits);
>  }
>  
> -#define cpumask_parse(ubuf, ulen, src) \
> -			__cpumask_parse((ubuf), (ulen), &(src), NR_CPUS)
> +#define cpumask_parse(ubuf, ulen, dst) \
> +			__cpumask_parse((ubuf), (ulen), &(dst), NR_CPUS)
>  static inline int __cpumask_parse(const char __user *buf, int len,
>  					cpumask_t *dstp, int nbits)
>  {
>  	return bitmap_parse(buf, len, dstp->bits, nbits);
>  }
>  
> +#define cpulist_scnprintf(buf, len, src) \
> +			__cpulist_scnprintf((buf), (len), &(src), NR_CPUS)
> +static inline int __cpulist_scnprintf(char *buf, int len,
> +					const cpumask_t *srcp, int nbits)
> +{
> +	return bitmap_scnlistprintf(buf, len, srcp->bits, nbits);
> +}
> +
> +#define cpulist_parse(buf, dst) __cpulist_parse((buf), &(dst), NR_CPUS)
> +static inline int __cpulist_parse(const char *buf, cpumask_t *dstp, int nbits)
> +{
> +	return bitmap_parselist(buf, dstp->bits, nbits);
> +}
> +
>  #if NR_CPUS > 1
>  #define for_each_cpu_mask(cpu, mask)		\
>  	for ((cpu) = first_cpu(mask);		\
> Index: 2.6.8-rc2-mm2/include/linux/nodemask.h
> ===================================================================
> --- 2.6.8-rc2-mm2.orig/include/linux/nodemask.h	2004-08-04 19:29:29.000000000 -0700
> +++ 2.6.8-rc2-mm2/include/linux/nodemask.h	2004-08-04 20:28:50.000000000 -0700
> @@ -10,6 +10,8 @@
>   *
>   * For details of nodemask_scnprintf() and nodemask_parse(),
>   * see bitmap_scnprintf() and bitmap_parse() in lib/bitmap.c.
> + * For details of nodelist_scnprintf() and nodelist_parse(), see
> + * bitmap_scnlistprintf() and bitmap_parselist(), also in bitmap.c.
>   *
>   * The available nodemask operations are:
>   *
> @@ -46,6 +48,8 @@
>   *
>   * int nodemask_scnprintf(buf, len, mask) Format nodemask for printing
>   * int nodemask_parse(ubuf, ulen, mask)	Parse ascii string as nodemask
> + * int nodelist_scnprintf(buf, len, mask) Format nodemask as list for printing
> + * int nodelist_parse(buf, map)		Parse ascii string as nodelist
>   *
>   * for_each_node_mask(node, mask)	for-loop node over mask
>   *
> @@ -271,14 +275,28 @@ static inline int __nodemask_scnprintf(c
>  	return bitmap_scnprintf(buf, len, srcp->bits, nbits);
>  }
>  
> -#define nodemask_parse(ubuf, ulen, src) \
> -			__nodemask_parse((ubuf), (ulen), &(src), MAX_NUMNODES)
> +#define nodemask_parse(ubuf, ulen, dst) \
> +			__nodemask_parse((ubuf), (ulen), &(dst), MAX_NUMNODES)
>  static inline int __nodemask_parse(const char __user *buf, int len,
>  					nodemask_t *dstp, int nbits)
>  {
>  	return bitmap_parse(buf, len, dstp->bits, nbits);
>  }
>  
> +#define nodelist_scnprintf(buf, len, src) \
> +			__nodelist_scnprintf((buf), (len), &(src), MAX_NUMNODES)
> +static inline int __nodelist_scnprintf(char *buf, int len,
> +					const nodemask_t *srcp, int nbits)
> +{
> +	return bitmap_scnlistprintf(buf, len, srcp->bits, nbits);
> +}
> +
> +#define nodelist_parse(buf, dst) __nodelist_parse((buf), &(dst), MAX_NUMNODES)
> +static inline int __nodelist_parse(const char *buf, nodemask_t *dstp, int nbits)
> +{
> +	return bitmap_parselist(buf, dstp->bits, nbits);
> +}
> +
>  #if MAX_NUMNODES > 1
>  #define for_each_node_mask(node, mask)			\
>  	for ((node) = first_node(mask);			\
> Index: 2.6.8-rc2-mm2/lib/bitmap.c
> ===================================================================
> --- 2.6.8-rc2-mm2.orig/lib/bitmap.c	2004-08-04 19:29:15.000000000 -0700
> +++ 2.6.8-rc2-mm2/lib/bitmap.c	2004-08-04 21:44:41.000000000 -0700
> @@ -291,6 +291,7 @@ EXPORT_SYMBOL(__bitmap_weight);
>  #define nbits_to_hold_value(val)	fls(val)
>  #define roundup_power2(val,modulus)	(((val) + (modulus) - 1) & ~((modulus) - 1))
>  #define unhex(c)			(isdigit(c) ? (c - '0') : (toupper(c) - 'A' + 10))
> +#define BASEDEC 10		/* fancier cpuset lists input in decimal */
>  
>  /**
>   * bitmap_scnprintf - convert bitmap to an ASCII hex string.
> @@ -409,6 +410,147 @@ int bitmap_parse(const char __user *ubuf
>  }
>  EXPORT_SYMBOL(bitmap_parse);
>  
> +/*
> + * bscnl_emit(buf, buflen, rbot, rtop, bp)
> + *
> + * Helper routine for bitmap_scnlistprintf().  Write decimal number
> + * or range to buf, suppressing output past buf+buflen, with optional
> + * comma-prefix.  Return len of what would be written to buf, if it
> + * all fit.
> + */
> +
> +int bscnl_emit(char *buf, int buflen, int rbot, int rtop, int len)
> +{
> +	if (len)
> +		len += scnprintf(buf + len, buflen - len, ",");
> +	if (rbot == rtop)
> +		len += scnprintf(buf + len, buflen - len, "%d", rbot);
> +	else
> +		len += scnprintf(buf + len, buflen - len, "%d-%d", rbot, rtop);
> +	return len;
> +}
> +
> +/**
> + * bitmap_scnlistprintf - convert bitmap to a list format ASCII string
> + * @buf: byte buffer into which string is placed
> + * @buflen: reserved size of @buf, in bytes
> + * @maskp: pointer to bitmap to convert
> + * @nmaskbits: size of bitmap, in bits
> + *
> + * Output format is a comma-separated list of decimal numbers and
> + * ranges.  Consecutively set bits are shown as two hyphen-separated
> + * decimal numbers, the smallest and largest bit numbers set in
> + * the range.  Output format is a compatible subset of the format
> + * accepted as input by bitmap_parselist().
> + *
> + * The return value is the number of characters which would be
> + * generated for the given input, excluding the trailing '\0', as
> + * per ISO C99.
> + */
> +
> +int bitmap_scnlistprintf(char *buf, unsigned int buflen,
> +	const unsigned long *maskp, int nmaskbits)
> +{
> +	int len = 0;
> +	/* current bit is 'cur', most recently seen range is [rbot, rtop] */
> +	int cur, rbot, rtop;
> +
> +	rbot = cur = find_first_bit(maskp, nmaskbits);
> +	while (cur < nmaskbits) {
> +		rtop = cur;
> +		cur = find_next_bit(maskp, nmaskbits, cur+1);
> +		if (cur >= nmaskbits || cur > rtop + 1) {
> +			len = bscnl_emit(buf, buflen, rbot, rtop, len);
> +			rbot = cur;
> +		}
> +	}
> +	return len;
> +}
> +EXPORT_SYMBOL(bitmap_scnlistprintf);
> +
> +/**
> + * bitmap_parselist - parses a more flexible format for inputting bit masks
> + * @buf: read nul-terminated user string from this buffer
> + * @mask: write resulting mask here
> + * @nmaskbits: number of bits in mask to be written
> + *
> + * The input format supports a space separated list of one or more comma
> + * separated sequences of ascii decimal bit numbers and ranges.  Each
> + * sequence may be preceded by one of the prefix characters '=',
> + * '-', '+', or '!', which have the following meanings:
> + *    '=': rewrite the mask to have only the bits specified in this sequence
> + *    '-': turn off the bits specified in this sequence
> + *    '+': turn on the bits specified in this sequence
> + *    '!': same as '-'.
> + *
> + * If no such initial character is specified, then the default prefix '='
> + * is presumed.  The list is evaluated and applied in left to right order.
> + *
> + * Examples of input format:
> + *	0-4,9				# rewrites to 0,1,2,3,4,9
> + *	-9				# removes 9
> + *	+6-8				# adds 6,7,8
> + *	1-6 -0,2-4 +11-14,16-19 -14-16	# same as 1,5,6,11-13,17-19
> + *	1-6 -0,2-4 +11-14,16-19 =14-16	# same as just 14,15,16
> + *
> + * Possible errno's returned for invalid input strings are:
> + *      -EINVAL:   second number in range smaller than first
> + *      -ERANGE:   bit number specified too large for mask
> + *      -EINVAL: invalid prefix char (not '=', '-', '+', or '!')
> + */
> +
> +int bitmap_parselist(const char *buf, unsigned long *maskp, int nmaskbits)
> +{
> +	char *p, *q;
> +	int masklen = BITS_TO_LONGS(nmaskbits);
> +
> +	while ((p = strsep((char **)(&buf), " ")) != NULL) { /* blows const XXX */
> +		char op = isdigit(*p) ? '=' : *p++;
> +		unsigned long m[masklen];
> +		int maskbytes = sizeof(m);
> +		int i;
> +
> +		if (op == ' ')
> +			continue;
> +		memset(m, 0, maskbytes);
> +
> +		while ((q = strsep(&p, ",")) != NULL) {
> +			unsigned a = simple_strtoul(q, 0, BASEDEC);
> +			unsigned b = a;
> +			char *cp = strchr(q, '-');
> +			if (cp)
> +				b = simple_strtoul(cp + 1, 0, BASEDEC);
> +			if (!(a <= b))
> +				return -EINVAL;
> +			if (b >= nmaskbits)
> +				return -ERANGE;
> +			while (a <= b) {
> +				set_bit(a, m);
> +				a++;
> +			}
> +		}
> +
> +		switch (op) {
> +			case '=':
> +				memcpy(maskp, m, maskbytes);
> +				break;
> +			case '!':
> +			case '-':
> +				for (i = 0; i < masklen; i++)
> +					maskp[i] &= ~m[i];
> +				break;
> +			case '+':
> +				for (i = 0; i < masklen; i++)
> +					maskp[i] |= m[i];
> +				break;
> +			default:
> +				return -EINVAL;
> +		}
> +	}
> +	return 0;
> +}
> +EXPORT_SYMBOL(bitmap_parselist);
> +
>  /**
>   *	bitmap_find_free_region - find a contiguous aligned mem region
>   *	@bitmap: an array of unsigned longs corresponding to the bitmap
> 
> -- 
>                           I won't rest till it's the best ...
>                           Programmer, Linux Scalability
>                           Paul Jackson <pj@sgi.com> 1.650.933.1373
> 
> 
> -------------------------------------------------------
> This SF.Net email is sponsored by OSTG. Have you noticed the changes on
> Linux.com, ITManagersJournal and NewsForge in the past few weeks? Now,
> one more big change to announce. We are now OSTG- Open Source Technology
> Group. Come see the changes on the new OSTG site. www.ostg.com
> _______________________________________________
> Lse-tech mailing list
> Lse-tech@lists.sourceforge.net
> https://lists.sourceforge.net/lists/listinfo/lse-tech
> 
> 



^ permalink raw reply	[flat|nested] 233+ messages in thread

* Re: [Lse-tech] [PATCH] cpusets - big numa cpu and memory placement
  2004-08-05 10:10 ` [PATCH] cpusets - big numa cpu and memory placement Paul Jackson
@ 2004-08-05 20:55   ` Martin J. Bligh
  2004-08-06  2:05     ` Paul Jackson
  0 siblings, 1 reply; 233+ messages in thread
From: Martin J. Bligh @ 2004-08-05 20:55 UTC (permalink / raw)
  To: Paul Jackson, Andrew Morton
  Cc: Christoph Hellwig, Jack Steiner, Jesse Barnes, Sylvain Jeaugey,
	Dan Higgins, linux-kernel, Matthew Dobson, Simon Derr,
	Andi Kleen, lse-tech, Dimitri Sivanich

> Cpusets extend the usefulness of the existing placement support that
> was added to Linux 2.6 kernels: sched_setaffinity() for CPU placement,
> and mbind and set_mempolicy for memory placement.  On smaller or
> dedicated use systems, the existing calls are often sufficient.
> 
> On larger NUMA systems, running more than one, performance critical,
> job, it is necessary to be able to manage jobs in their entirety.
> This includes providing a job with exclusive CPU and memory that no
> other job can use, and being able to list all tasks currently in a
> cpuset.

I'm not sure I understand the rationale behind this ... perhaps you could
explain it further. We already have mechanisms to bind a process to 
particular CPUs or a node's memory.

To provide exclusivity seems valuable ... ie to stop the default 
allocations using node X's memory, or CPU Y, and potentially even to
migrate existing users off that.

But that'd seem to be a whole lot simpler than this patch ... what else
are we gaining from CPU sets? The patch is massive, so hard to see exactly
what you're doing ... is the point to add back virtualized memory and 
CPU numbering sets specific to each process or group of them, a la 
cpumemsets thing you were posting a year or two ago?

M.



^ permalink raw reply	[flat|nested] 233+ messages in thread

* Re: [Lse-tech] [PATCH] new bitmap list format (for cpusets)
  2004-08-05 20:47 ` [Lse-tech] [PATCH] new bitmap list format (for cpusets) Martin J. Bligh
@ 2004-08-05 21:45   ` Paul Jackson
       [not found]     ` <Pine.A41.4.53.0408060930100.20680@isabelle.frec.bull.fr>
  2004-08-09  8:01   ` Paul Jackson
  1 sibling, 1 reply; 233+ messages in thread
From: Paul Jackson @ 2004-08-05 21:45 UTC (permalink / raw)
  To: Martin J. Bligh
  Cc: akpm, hch, steiner, jbarnes, sylvain.jeaugey, djh, linux-kernel,
	colpatch, Simon.Derr, ak, lse-tech, sivanich

Martin asks:
> Can't we just do this up in userspace, ...

Aha - I was expecting this question.  Howdy.

We could, I suppose (do this fancy bitmap formatting in userland).

It's certainly been a pleasure to Simon Derr, Sylvain Jeaugey and
myself, over the last six months, to be able to easily manipulate
these big masks using classic Unix commands like cat and echo.

The ability to atomically update a mask is unique to this interface.
The existing bitmap_parse/bitmap_scnprintf interface only allows
for a complete rewrite, not atomically adding or removing a node.
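
For illustration only (this snippet is not part of the patch): a user
program could apply such a partial update by writing a '+' or '-'
prefixed list to a cpuset's 'cpus' file.  The sketch assumes the cpuset
file system is mounted at /dev/cpuset and that a cpuset named "foo"
already exists; both the mount point and the name are made-up examples.

/*
 * Sketch: atomically add cpus 6-8 to an existing cpuset "foo" by
 * writing a '+' prefixed list to its 'cpus' file.
 */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/dev/cpuset/foo/cpus", O_WRONLY);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	/* "+6-8" turns on bits 6,7,8 without rewriting the rest of the mask */
	if (write(fd, "+6-8", strlen("+6-8")) < 0)
		perror("write");
	close(fd);
	return 0;
}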

However, I am not aware of a reason why we need the atomic update.

Simon ... could you comment on this, and perhaps better motivate
          this new bitmap list format?

-- 
                          I won't rest till it's the best ...
                          Programmer, Linux Scalability
                          Paul Jackson <pj@sgi.com> 1.650.933.1373

^ permalink raw reply	[flat|nested] 233+ messages in thread

* Re: [Lse-tech] [PATCH] cpusets - big numa cpu and memory placement
  2004-08-05 20:55   ` [Lse-tech] " Martin J. Bligh
@ 2004-08-06  2:05     ` Paul Jackson
  2004-08-06  3:24       ` Martin J. Bligh
  0 siblings, 1 reply; 233+ messages in thread
From: Paul Jackson @ 2004-08-06  2:05 UTC (permalink / raw)
  To: Martin J. Bligh
  Cc: akpm, hch, steiner, jbarnes, sylvain.jeaugey, djh, linux-kernel,
	colpatch, Simon.Derr, ak, lse-tech, sivanich

Martin wrote:
> I'm not sure I understand the rationale behind this ...

Thank-you for your question, Martin.

Unlike the first patch in this set (the fancier bitmap format),
this cpuset patch is important to us, as you likely suspected.

I hope I can do it justice.

> is the point to add back virtualized memory and 
> CPU numbering sets specific to each process or group of them,
> a la cpumemsets thing you were posting a year or two ago?

To answer the easy question first, no.  No virtual numbering anymore. We
might do some virtualizing in user library code, but so far as this
patch and the kernel are aware, cpu number 17 is cpu number 17, all the
time, using the same cpu and node numberings as used in the other kernel
API's (setaffinity, mbind and set_mempolicy) and in the kernel cpumasks
and nodemasks.

The bulk of this patch comes from providing named, nested placement (cpu
and memory) regions -- cpusets, with a file system style namespace and
permission model.
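
To make the intended usage concrete, here is a rough userspace sketch
(not part of the patch); the mount point /dev/cpuset and the cpuset
name "dept_a" are arbitrary examples:

/*
 * Sketch: create a child cpuset, give it cpus 1-4 and memory node 1,
 * then move the current task into it.
 */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <unistd.h>

static int write_str(const char *path, const char *val)
{
	int fd = open(path, O_WRONLY);

	if (fd < 0)
		return -1;
	if (write(fd, val, strlen(val)) < 0) {
		close(fd);
		return -1;
	}
	return close(fd);
}

int main(void)
{
	char pid[32];

	if (mkdir("/dev/cpuset/dept_a", 0755) < 0)
		perror("mkdir");
	write_str("/dev/cpuset/dept_a/cpus", "1-4");
	write_str("/dev/cpuset/dept_a/mems", "1");
	snprintf(pid, sizeof(pid), "%d", getpid());
	write_str("/dev/cpuset/dept_a/tasks", pid);
	return 0;
}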

The need for supporting such a model comes in managing large systems,
when they are divided and subdivided into subsets of cpu and memory
resources, dedicated to departments, groups, jobs, threads.  Especially
on NUMA systems, maintaining processor-memory locality is important.
This locality must be maintained in a hierarchical fashion.

The VP of Information Systems is not going to be personally placing the
8 parallel threads of the weather simulator run by someone in one of his
departments.  He can agree that that department gets exclusive use of
half of the machine over the weekends, because that's what they budgeted
for.  Then it gets pushed down.

Imagine, say, a large system that is shared by several departments, with
shifting resources between them.  At any point in time, each department
has certain dedicated resources (cpu and memory) allocated to them. 
Within a department, they may be running multiple large applications - a
database server, a web server, a big simulation, other large HPC apps. 
Some of these may be very performance critical, and require their own
dedicated resources.  In many cases, the customer will be running some
form of batch manager software to help administer the work load.

The result is a hierarchy of these regions, which require, I claim, a
kernel supported hierarchical name space, with permissions, to which
tasks are attached, and which own subsets of the system's cpu and memory.

On most _any_ numa systems running a mixed and shifting load, this
ability to manage the systems use, to control placement and minimize
interaction, is essential to stable, repeatable performance.  On smaller
or dedicated use systems, the existing calls are entirely sufficient. 
On larger, nested use systems, the critical numa resources of processor
and memory need to be managed in a nested fashion.

The existing cpu and memory placement facilities, added in 2.6,
sched_setaffinity (for cpus) and mbind/set_mempolicy (for memory) are
just the right thing for an individual task to manage in detail its
placement across the resources available to it (the online cpus and
nodes if CONFIG_CPUSET is disabled, or within the cpuset if cpusets
are enabled).
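
(For reference, per-task placement with the existing interface looks
roughly like the following; this is only an illustration, using the
glibc cpu_set_t wrapper for sched_setaffinity(2), and is not part of
the patch.)

#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>

int main(void)
{
	cpu_set_t mask;

	/* pin the calling task to cpus 0 and 1 */
	CPU_ZERO(&mask);
	CPU_SET(0, &mask);
	CPU_SET(1, &mask);
	if (sched_setaffinity(0, sizeof(mask), &mask) < 0)
		perror("sched_setaffinity");
	return 0;
}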

But they do not provide the named hierarchy with kernel enforced
permissions and resource support required to sanely manage the
largest multi-use systems.

Three additional ways to approach this patch:

 1) The first file in the patch, Documentation/cpusets.txt,
    describes this facility, and its purpose.

 2) Look at the hooks in the rest of the kernel.  I have spent much
    time minimizing these hooks, so that they are few in number,
    placed as best I could in low maintenance code in the kernel,
    and vanish if CONFIG_CPUSETS is disabled.  But in addition
    to evaluating the risk and impact of the hooks, you can get
    a further sense of how cpusets works from these hooks.
    These hooks are listed in Documentation/cpusets.txt.

 3) Look at the include/linux/cpusets.h header file.  It shows
    the tiny interface with the rest of the kernel, which
    pretty much evaporates if CONFIG_CPUSET is disabled.

By way of analogy, when I had an 8 inch floppy disk drive, I didn't need
much of a file system.  Initially, I didn't even need subdirectories, just
a list of files.  But as storage grew, and became a shared resource on
corporate systems, a hierarchical file system, with names, permissions
and sufficient hooks for managing the storage resource, became
essential.

Now, as big iron is growing from tens, to hundreds, soon to thousands,
of processors and memory nodes, their users require a manageable
resource hierarchy of the essential compute resources.

I understand that it is the proper job of a kernel to present the
essential resources of a system to user code, in a sensibly named and
controlled fashion, without imposing policy.  For nested resources, a
file system is a good fit, both for the names, and the associated
permission model.  It took more code (ifdef'd CONFIG_CPUSET, almost
entirely in the kernel/cpuset.c file), doing it this way.  But it is
the natural model to use, when it fits, as in this case.

Certainly, for the class of customer that SGI has on its big Irix
systems, we have already seen that this sort of facility is essential
for certain customer sites.  I hesitate to say "Irix" here, because
the Irix kernel code is another world, not directly useful in Linux.

Fortunately, Simon and Sylvain of Bull (France) determined, sometime last
year, that they had the same large system cpu/memory management needs,
and Simon wrote this initial kernel code, entirely untainted with Irix
experience so far as I know.  Their focus is apparently more commercial
systems, whereas SGI's focus is more HPC apps.  But the facilities
needed are the same.

My primary contribution has been in removing code, and doing what I
could to learn how to best adapt it to Linux, in a way that meets our
needs, with the most minimal of impact on others (~zero runtime if not
configured, very low maintenance load on the kernel source).  As Simon
and Sylvain can attest, I have thrown away a lot of the code and features
they wanted, in order to reduce the kernel footprint.  The cpu and node
renumbering you remembered was one of the things I threw out.  And I
have rewritten much more code, as I have learned the coding style that
is most comfortable within the Linux kernel.  The long term health and
maintainability of the Linux kernel is important to myself and my
employer.

If there is further explanation I can provide, or if there is design or
code change you see that is important to including cpusets in the
kernel, I welcome your input.  Or nits and details, whatever.  For me,
SGI, and Bull, this one is a biggie.  I anticipate for others as well,
as more companies venture into big iron Linux.

-- 
                          I won't rest till it's the best ...
                          Programmer, Linux Scalability
                          Paul Jackson <pj@sgi.com> 1.650.933.1373

^ permalink raw reply	[flat|nested] 233+ messages in thread

* Re: [Lse-tech] [PATCH] cpusets - big numa cpu and memory placement
  2004-08-06  2:05     ` Paul Jackson
@ 2004-08-06  3:24       ` Martin J. Bligh
  2004-08-06  8:31         ` Paul Jackson
  2004-08-06 15:30         ` Erich Focht
  0 siblings, 2 replies; 233+ messages in thread
From: Martin J. Bligh @ 2004-08-06  3:24 UTC (permalink / raw)
  To: Paul Jackson
  Cc: akpm, hch, steiner, jbarnes, sylvain.jeaugey, djh, linux-kernel,
	colpatch, Simon.Derr, ak, lse-tech, sivanich

>> is the point to add back virtualized memory and 
>> CPU numbering sets specific to each process or group of them,
>> a la cpumemsets thing you were posting a year or two ago?
> 
> To answer the easy question first, no.  No virtual numbering anymore. We
> might do some virtualizing in user library code, but so far as this
> patch and the kernel are aware, cpu number 17 is cpu number 17, all the
> time, using the same cpu and node numberings as used in the other kernel
> API's (setaffinity, mbind and set_mempolicy) and in the kernel cpumasks
> and nodemasks.

OK, good ;-) I don't think the kernel should have to deal with that stuff.
Sorry, it's just a little difficult to dive into a large patch without 
a higher level idea of what it's trying to do (which after your last email, 
I think I have a much better grasp on).
 
...

> The existing cpu and memory placement facilities, added in 2.6,
> sched_setaffinity (for cpus) and mbind/set_mempolicy (for memory) are
> just the right thing for an individual task to manage in detail its
> placement across the resources available to it (the online cpus and
> nodes if CONFIG_CPUSET is disabled, or within the cpuset if cpusets
> are enabled).

I agree that the current mechanisms are not wholly sufficient - the
most obvious failing being that whilst you can bind a process to a
resource, there's very little support for making a resource exclusively
available to a process or set thereof.
 
> But they do not provide the named hierarchy with kernel enforced
> permissions and resource support required to sanely manage the
> largest multi-use systems.

Right ... but I'm kind of shocked by the size of the patch to fix what
seems like a fairly simple problem. The other thing that seems to glare
at me is the overlap between what you have here and PAGG/CKRM. Does
either cpusets depend on PAGG/CKRM or vice versa? They seem to have 
similar goals, and it'd be strange to have two independent mechanisms.

> Three additional ways to approach this patch:
> 
>  1) The first file in the patch, Documentation/cpusets.txt,
>     describes this facility, and its purpose.
> 
>  2) Look at the hooks in the rest of the kernel.  I have spent much
>     time minimizing these hooks, so that they are few in number,
>     placed as best I could in low maintenance code in the kernel,
>     and vanish if CONFIG_CPUSETS is disabled.  But in addition
>     to evaluating the risk and impact of the hooks, you can get
>     a further sense of how cpusets works from these hooks.
>     These hooks are listed in Documentation/cpusets.txt.
> 
>  3) Look at the include/linux/cpusets.h header file.  It shows
>     the tiny interface with the rest of the kernel, which
>     pretty much evaporates if CONFIG_CPUSET is disabled.

Thanks ... that'll help me. I'll try to look through it in some more
detail.

M.

^ permalink raw reply	[flat|nested] 233+ messages in thread

* Re: [Lse-tech] [PATCH] cpusets - big numa cpu and memory placement
  2004-08-06  3:24       ` Martin J. Bligh
@ 2004-08-06  8:31         ` Paul Jackson
  2004-08-06 15:30         ` Erich Focht
  1 sibling, 0 replies; 233+ messages in thread
From: Paul Jackson @ 2004-08-06  8:31 UTC (permalink / raw)
  To: Martin J. Bligh
  Cc: akpm, hch, steiner, jbarnes, sylvain.jeaugey, djh, linux-kernel,
	colpatch, Simon.Derr, ak, lse-tech, sivanich

Martin wrote:
> Sorry, it's just a little difficult to dive into a large
> patch without a higher level idea ...

No need to apologize.  I welcome your review.  I was well aware that the
next step for this patch was the "why would I want this ..." explanation.
Let me know if there is more I can explain.


> I don't think the kernel should have to deal with that
> [cpu and node number virtualization] stuff.

I agree, now.


> The other thing that seems to glare at me is the overlap
> between what you have here and PAGG/CKRM.  Does
> either cpusets depend on PAGG/CKRM or vice versa? 

None of these three (cpusets, PAGG, or CKRM) depends on the others, with
the possible exception that perhaps CKRM could make use of PAGG (whether it
does or not, or whether it should, I don't know - ask them).

Cpusets control _where_ a process can run and allocate.  The central
construct of cpusets is essentially a "soft partition" -- a set of CPUs
and Memory Nodes.  These can be arranged in a hierarchy, with names,
permissions, and a couple of control bits.  Tasks can be moved between
cpusets, as allowed by the permission model.  You can attach a task to a
different cpuset if (1) you can access that cpuset (search permission to
some directory beneath /dev/cpuset), (2) write that cpuset's "tasks"
file, and (3) have essentially kill rights on the task being placed.
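For example, a minimal userspace sketch (not part of the patch, and
assuming the cpuset file system is mounted at /dev/cpuset and a cpuset
directory named "batch" already exists) that moves the calling process
into that cpuset:

	#include <stdio.h>
	#include <unistd.h>

	int main(void)
	{
		/* Needs search permission on the path, write permission
		 * on "tasks", and kill-equivalent rights on the target
		 * task (here, ourselves). */
		FILE *f = fopen("/dev/cpuset/batch/tasks", "w");

		if (!f) {
			perror("/dev/cpuset/batch/tasks");
			return 1;
		}
		fprintf(f, "%d\n", (int)getpid());
		return fclose(f) ? 1 : 0;
	}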

Just as your basic file system provides a hierarchical model for
organizing your data files (places to put data), similarly cpusets
provides a hierarchical model for organizing the nodes on your big numa
box (places to run tasks).

CKRM tracks _how_ much of various interesting resources tasks are using,
both measuring and limiting such usage.  It provides a way to manage
some of the shared system resources, such as CPU time, memory pages, I/O
and incoming network bandwidth based on user defined groups of tasks
called classes (quoting from http://ckrm.sourceforge.net/ ;).  Unlike
cpusets and most of the rest of the kernel, CKRM doesn't just manage
individual tasks, one task at a time, but manages based on a dynamically
determined resource class it assigns to various kernel objects in
addition to tasks.

Cpusets provides a rich model of just the CPU and Memory resources, but
only manages tasks, using the traditional simple task pointer to a
shared reference counted object.
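Roughly speaking (an illustrative sketch only; these field and type
names are made up and not taken from the patch), the structure being
described looks like:

	/* Illustrative only: one shared, reference counted object per
	 * cpuset, and a single pointer to it from each task. */
	struct example_cpuset {
		unsigned long cpus_allowed[16];	/* CPUs members may run on */
		unsigned long mems_allowed[8];	/* memory nodes they may use */
		int count;			/* references from tasks and children */
		struct example_cpuset *parent;	/* hierarchy link */
	};

	struct example_task {
		/* ... usual per-task state ... */
		struct example_cpuset *cpuset;	/* the one cpuset this task uses */
	};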

CKRM provides a rich structure for classifying a variety of kernel
objects, not just tasks, and managing their use, but it doesn't have a
particularly fancy model of any one of these resources (so far as I
know anyway ...).

PAGG is just a mechanism that is useful for job accounting and resource
management.  It's just the hooks - for an inescapable job container and
hooks for loadable modules to be called on key events in the life of
tasks in that container, such as fork and exit.  PAGG provides a useful
mechanism for certain kinds of resource management and system accounting
modules, but is itself not a resource manager.

Hopefully, I haven't misrepresented CKRM and PAGG too much.

-- 
                          I won't rest till it's the best ...
                          Programmer, Linux Scalability
                          Paul Jackson <pj@sgi.com> 1.650.933.1373

^ permalink raw reply	[flat|nested] 233+ messages in thread

* Re: [Lse-tech] [PATCH] new bitmap list format (for cpusets)
       [not found]     ` <Pine.A41.4.53.0408060930100.20680@isabelle.frec.bull.fr>
@ 2004-08-06 10:14       ` Paul Jackson
  0 siblings, 0 replies; 233+ messages in thread
From: Paul Jackson @ 2004-08-06 10:14 UTC (permalink / raw)
  To: Simon Derr
  Cc: mbligh, akpm, hch, steiner, jbarnes, sylvain.jeaugey, djh,
	linux-kernel, colpatch, Simon.Derr, ak, lse-tech, sivanich

Simon wrote:
> it really would have been a shame to lose this ease of use 

I agree with Simon -- this formatting makes learning, using and messing
around with cpusets much easier.  No need to learn yet another limited
use utility that tends to insulate one from what's going on.  Direct and
easy kernel control is good.

It doesn't matter much one way or the other to user level C code.  That
code has some function or library routine to convert the ascii file to
and from the preferred internal bitmap either way, whether the file
format is fixed length hex or variable length decimal lists.
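For instance, a hypothetical userspace helper (not from the patch) that
parses a list string such as "0,3,5,8-15" into a bit array might look
like this:

	#include <stdlib.h>
	#include <string.h>

	#define BITS_PER_LONG	(8 * sizeof(unsigned long))

	/* Fill mask (nbits wide) from a list string; 0 on success, -1 on error. */
	static int parse_list(const char *buf, unsigned long *mask, int nbits)
	{
		size_t nlongs = (nbits + BITS_PER_LONG - 1) / BITS_PER_LONG;

		memset(mask, 0, nlongs * sizeof(unsigned long));
		while (*buf && *buf != '\n') {
			char *end;
			long a = strtol(buf, &end, 10), b;

			if (end == buf)
				return -1;
			b = a;
			if (*end == '-') {		/* a range, e.g. "8-15" */
				buf = end + 1;
				b = strtol(buf, &end, 10);
				if (end == buf)
					return -1;
			}
			if (a < 0 || b < a || b >= nbits)
				return -1;
			for (; a <= b; a++)
				mask[a / BITS_PER_LONG] |= 1UL << (a % BITS_PER_LONG);
			if (*end == ',')
				end++;
			else if (*end != '\0' && *end != '\n')
				return -1;
			buf = end;
		}
		return 0;
	}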

If it is just an issue of compiled kernel text size, we could conditionally
compile the 837 bytes of text (on an i386 build close at hand: 459 bytes
for bitmap_parselist, 199 bytes for bitmap_scnlistprintf and 179 for
bscnl_emit).  So far only cpusets uses this format.  But usually it's
more a matter of kernel source and future maintenance.

I appreciate that it is easy to be against everyone's kernel bloat,
except my own ;).

I view the tradeoff as kernel developer time versus the time of users of
cpusets.  I think that the users of cpusets will save more time if they
have this more friendly, less error prone, format, than the kernel
hackers will spend maintaining the formatting code.  However, I'm
comparing two small numbers with a lot of uncertainty on each number, so
... who knows.  I try to optimize for minimum total expenditure of human
brain power.  As anyone can tell from listening to the news, that's the
resource we are most critically short of ;).

So I vote to keep it.  But if it goes down the other way, that's not
a big problem.

Oops - one buglet I see - I forgot to mark the bscnl_emit() helper routine
static, in lib/bitmap.c.

-- 
                          I won't rest till it's the best ...
                          Programmer, Linux Scalability
                          Paul Jackson <pj@sgi.com> 1.650.933.1373

^ permalink raw reply	[flat|nested] 233+ messages in thread

* Re: [Lse-tech] [PATCH] cpusets - big numa cpu and memory placement
  2004-08-06  3:24       ` Martin J. Bligh
  2004-08-06  8:31         ` Paul Jackson
@ 2004-08-06 15:30         ` Erich Focht
  2004-08-06 15:35           ` Martin J. Bligh
  2004-08-07  6:10           ` Paul Jackson
  1 sibling, 2 replies; 233+ messages in thread
From: Erich Focht @ 2004-08-06 15:30 UTC (permalink / raw)
  To: Martin J. Bligh
  Cc: lse-tech, Paul Jackson, akpm, hch, steiner, jbarnes,
	sylvain.jeaugey, djh, linux-kernel, colpatch, Simon.Derr, ak,
	sivanich

> > The existing cpu and memory placement facilities, added in 2.6,
> > sched_setaffinity (for cpus) and mbind/set_mempolicy (for memory) are
> > just the right thing for an individual task to manage in detail its
> > placement across the resources available to it (the online cpus and
> > nodes if CONFIG_CPUSET is disabled, or within the cpuset if cpusets
> > are enabled).
> 
> I agree that the current mechanisms are not wholly sufficient - the
> most obvious failing being that whilst you can bind a process to a
> resource, there's very little support for making a resource exclusively
> available to a process or set thereof.

For the record, we (NEC) are also a potential user of this patch on
the TX-7 NUMA machines. For our 2.4 kernels we are currently using
something with similar functionality but only two hierarchy levels. 
I would very much welcome the inclusion of cpusets. The patch got much
leaner compared to the early days, a big part of it consists of
documentation (good!) and the user interface (also very nice, although
it duplicates some code). The rest is just needed. Besides: it's
encapsulated enough and doesn't hurt others. (BTW: I could imagine
using this on quad-opterons, too...)

> Right ... but I'm kind of shocked by the size of the patch to fix what
> seems like a fairly simple problem. The other thing that seems to glare
> at me is the overlap between what you have here and PAGG/CKRM. Does
> either cpusets depend on PAGG/CKRM or vice versa? They seem to have 
> similar goals, and it'd be strange to have two independent mechanisms.

There's no relation to PAGG but I think cpusets and CKRM should be
made to come together. One of CKRM's user interfaces is a filesystem
with the file-tree representing the class hierarchy. It's the same for
cpusets. I'd vote for cpusets going in soon. CKRM could be extended by
a cpusets controller which should be pretty trivial when using the
infrastructure of this patch. It simply needs to create classes
(cpusets) and attach processes to them. The enforcement of resources
happens automatically. When CKRM is mature to enter the kernel, one
could drop /dev/cpusets in favor of the CKRM way of doing it.

Regards,
Erich



^ permalink raw reply	[flat|nested] 233+ messages in thread

* Re: [Lse-tech] [PATCH] cpusets - big numa cpu and memory placement
  2004-08-06 15:30         ` Erich Focht
@ 2004-08-06 15:35           ` Martin J. Bligh
  2004-08-06 15:48             ` Hubertus Franke
                               ` (3 more replies)
  2004-08-07  6:10           ` Paul Jackson
  1 sibling, 4 replies; 233+ messages in thread
From: Martin J. Bligh @ 2004-08-06 15:35 UTC (permalink / raw)
  To: Erich Focht
  Cc: lse-tech, Paul Jackson, akpm, hch, steiner, jbarnes,
	sylvain.jeaugey, djh, linux-kernel, colpatch, Simon.Derr, ak,
	sivanich

> There's no relation to PAGG but I think cpusets and CKRM should be
> made to come together. One of CKRM's user interfaces is a filesystem
> with the file-tree representing the class hierarchy. It's the same for
> cpusets. 

OK, that makes sense ...

> I'd vote for cpusets going in soon. CKRM could be extended by
> a cpusets controller which should be pretty trivial when using the
> infrastructure of this patch. It simply needs to create classes
> (cpusets) and attach processes to them. The enforcement of resources
> happens automatically. When CKRM is mature to enter the kernel, one
> could drop /dev/cpusets in favor of the CKRM way of doing it.

But I think that's dangerous. It's very hard to get rid of existing user
interfaces ... I'd much rather we sorted out what we're doing BEFORE
putting either in the kernel.

M.


^ permalink raw reply	[flat|nested] 233+ messages in thread

* Re: [Lse-tech] [PATCH] cpusets - big numa cpu and memory placement
  2004-08-06 15:35           ` Martin J. Bligh
@ 2004-08-06 15:48             ` Hubertus Franke
  2004-08-07  6:30               ` Paul Jackson
  2004-08-07  6:45               ` Paul Jackson
  2004-08-06 15:49             ` Hubertus Franke
                               ` (2 subsequent siblings)
  3 siblings, 2 replies; 233+ messages in thread
From: Hubertus Franke @ 2004-08-06 15:48 UTC (permalink / raw)
  To: Martin J. Bligh
  Cc: Erich Focht, lse-tech, Paul Jackson, akpm, hch, steiner, jbarnes,
	sylvain.jeaugey, djh, linux-kernel, colpatch, Simon.Derr, ak,
	sivanich



Martin J. Bligh wrote:
>>There's no relation to PAGG but I think cpusets and CKRM should be
>>made to come together. One of CKRM's user interfaces is a filesystem
>>with the file-tree representing the class hierarchy. It's the same for
>>cpusets. 
> 
> 
> OK, that makes sense ...
> 
> 
>>I'd vote for cpusets going in soon. CKRM could be extended by
>>a cpusets controller which should be pretty trivial when using the
>>infrastructure of this patch. It simply needs to create classes
>>(cpusets) and attach processes to them. The enforcement of resources
>>happens automatically. When CKRM is mature to enter the kernel, one
>>could drop /dev/cpusets in favor of the CKRM way of doing it.
> 
> 
> But I think that's dangerous. It's very hard to get rid of existing user
> interfaces ... I'd much rather we sorted out what we're doing BEFORE
> putting either in the kernel.
> 
> M.
> 

We, CKRM, can put this on our stack, once we have settled how we are 
going to address the structural requirements that came out of the kernel 
summit.

As indicated above, this would mean to create a resource controller
and assign mask to them, which is not what we have done so far, as
our current controllers are more share focused. This should be a good 
exercise.

While we are on the topic, do you envision these sets to be somewhat 
hierarchical or simply a flat hierarchy ?

-- Hubertus Franke


^ permalink raw reply	[flat|nested] 233+ messages in thread

* Re: [Lse-tech] [PATCH] cpusets - big numa cpu and memory placement
  2004-08-06 15:35           ` Martin J. Bligh
  2004-08-06 15:48             ` Hubertus Franke
@ 2004-08-06 15:49             ` Hubertus Franke
  2004-08-06 15:52             ` Hubertus Franke
  2004-08-06 15:55             ` Erich Focht
  3 siblings, 0 replies; 233+ messages in thread
From: Hubertus Franke @ 2004-08-06 15:49 UTC (permalink / raw)
  To: Martin J. Bligh
  Cc: Erich Focht, lse-tech, Paul Jackson, akpm, hch, steiner, jbarnes,
	sylvain.jeaugey, djh, linux-kernel, colpatch, Simon.Derr, ak,
	sivanich



Martin J. Bligh wrote:
>>There's no relation to PAGG but I think cpusets and CKRM should be
>>made to come together. One of CKRM's user interfaces is a filesystem
>>with the file-tree representing the class hierarchy. It's the same for
>>cpusets. 
> 
> 
> OK, that makes sense ...
> 
> 
>>I'd vote for cpusets going in soon. CKRM could be extended by
>>a cpusets controller which should be pretty trivial when using the
>>infrastructure of this patch. It simply needs to create classes
>>(cpusets) and attach processes to them. The enforcement of resources
>>happens automatically. When CKRM is mature to enter the kernel, one
>>could drop /dev/cpusets in favor of the CKRM way of doing it.
> 
> 
> But I think that's dangerous. It's very hard to get rid of existing user
> interfaces ... I'd much rather we sorted out what we're doing BEFORE
> putting either in the kernel.
> 
> M.
> 

We, CKRM, can put this on our stack, once we have settled how we are 
going to address the structural requirements that came out of the kernel 
summit.

As indicated above, this would mean to create a resource controller
and assign mask to them, which is not what we have done so far, as
our current controllers are more share focused. This should be a good 
exercise.

While we are on the topic, do you envision these sets to be somewhat 
hierarchical or simply a flat hierarchy ?

-- Hubertus Franke


^ permalink raw reply	[flat|nested] 233+ messages in thread

* Re: [Lse-tech] [PATCH] cpusets - big numa cpu and memory placement
  2004-08-06 15:35           ` Martin J. Bligh
  2004-08-06 15:48             ` Hubertus Franke
  2004-08-06 15:49             ` Hubertus Franke
@ 2004-08-06 15:52             ` Hubertus Franke
  2004-08-06 15:55             ` Erich Focht
  3 siblings, 0 replies; 233+ messages in thread
From: Hubertus Franke @ 2004-08-06 15:52 UTC (permalink / raw)
  To: Martin J. Bligh
  Cc: Erich Focht, lse-tech, Paul Jackson, akpm, hch, steiner, jbarnes,
	sylvain.jeaugey, djh, linux-kernel, colpatch, Simon.Derr, ak,
	sivanich



Martin J. Bligh wrote:
>>There's no relation to PAGG but I think cpusets and CKRM should be
>>made to come together. One of CKRM's user interfaces is a filesystem
>>with the file-tree representing the class hierarchy. It's the same for
>>cpusets. 
> 
> 
> OK, that makes sense ...
> 
> 
>>I'd vote for cpusets going in soon. CKRM could be extended by
>>a cpusets controller which should be pretty trivial when using the
>>infrastructure of this patch. It simply needs to create classes
>>(cpusets) and attach processes to them. The enforcement of resources
>>happens automatically. When CKRM is mature to enter the kernel, one
>>could drop /dev/cpusets in favor of the CKRM way of doing it.
> 
> 
> But I think that's dangerous. It's very hard to get rid of existing user
> interfaces ... I'd much rather we sorted out what we're doing BEFORE
> putting either in the kernel.
> 
> M.
> 

We, CKRM, can put this on our stack, once we have settled how we are 
going to address the structural requirements that came out of the kernel 
summit.

As indicated above, this would mean to create a resource controller
and assign mask to them, which is not what we have done so far, as
our current controllers are more share focused. This should be a good 
exercise.

While we are on the topic, do you envision these sets to be somewhat 
hierarchical or simply a flat hierarchy ?

-- Hubertus Franke


^ permalink raw reply	[flat|nested] 233+ messages in thread

* Re: [Lse-tech] [PATCH] cpusets - big numa cpu and memory placement
  2004-08-06 15:35           ` Martin J. Bligh
                               ` (2 preceding siblings ...)
  2004-08-06 15:52             ` Hubertus Franke
@ 2004-08-06 15:55             ` Erich Focht
  3 siblings, 0 replies; 233+ messages in thread
From: Erich Focht @ 2004-08-06 15:55 UTC (permalink / raw)
  To: Martin J. Bligh
  Cc: lse-tech, Paul Jackson, akpm, hch, steiner, jbarnes,
	sylvain.jeaugey, djh, linux-kernel, colpatch, Simon.Derr, ak,
	sivanich

On Friday 06 August 2004 17:35, Martin J. Bligh wrote:
> > I'd vote for cpusets going in soon. CKRM could be extended by
> > a cpusets controller which should be pretty trivial when using the
> > infrastructure of this patch. It simply needs to create classes
> > (cpusets) and attach processes to them. The enforcement of resources
> > happens automatically. When CKRM is mature to enter the kernel, one
> > could drop /dev/cpusets in favor of the CKRM way of doing it.
> 
> But I think that's dangerous. It's very hard to get rid of existing user
> interfaces ... I'd much rather we sorted out what we're doing BEFORE
> putting either in the kernel.

So the user interfaces should be adapted before? I think this is
simple, and then the elimination of /dev/cpusets in favor of /rcfs is
just deletion of code plus a symbolic link. The classes and cpusets
are both directories. The files in cpusets are: 
 - cpus: list of CPUs in that cpuset
 - mems: list of Memory Nodes in that cpuset
 - cpu_exclusive flag: is cpu placement exclusive?
 - mem_exclusive flag: is memory placement exclusive?
 - tasks: list of tasks (by pid) attached to that cpuset
The files in a CKRM class directory:
 - stats   : statistics (not needed for cpusets)
 - shares  : could contain cpus, mems, cpu_exclusive, mem_exclusive
 - members : same as reading /dev/cpusets/.../tasks
 - target  : same as writing /dev/cpusets/.../tasks

Changing the "shares" would mean something like
  echo "cpus +6-10" > .../shares

Just an idea...

Regards,
Erich


^ permalink raw reply	[flat|nested] 233+ messages in thread

* Re: [Lse-tech] [PATCH] cpusets - big numa cpu and memory placement
  2004-08-06 15:30         ` Erich Focht
  2004-08-06 15:35           ` Martin J. Bligh
@ 2004-08-07  6:10           ` Paul Jackson
  2004-08-07 15:22             ` Erich Focht
  2004-08-08 19:58             ` Shailabh Nagar
  1 sibling, 2 replies; 233+ messages in thread
From: Paul Jackson @ 2004-08-07  6:10 UTC (permalink / raw)
  To: Erich Focht
  Cc: mbligh, lse-tech, akpm, hch, steiner, jbarnes, sylvain.jeaugey,
	djh, linux-kernel, colpatch, Simon.Derr, ak, sivanich

Erich Focht wrote:
> we (NEC) are also a potential user of this patch

Good - welcome.


> I think cpusets and CKRM should be
> made to come together. One of CKRM's user interfaces is a filesystem
> with the file-tree representing the class hierarchy. It's the same for
> cpusets.

Hmmm ... this suggestion worries me, for a couple of reasons.

Just because cpusets and CKRM both have a hierarchy represented in a
file system doesn't mean it is, or can be, the same file system.  Not
all trees are the same.

Perhaps someone more expert in CKRM can help here.  The cpuset hierarchy
has some strict semantics:
 1) Any cpuset's CPUs and Memory must be a subset of its parent's.
 2) A cpuset may be exclusive for CPU or Memory only if its parent is.
 3) A CPU or Memory exclusive cpuset may not overlap its siblings.

See the routine kernel/cpuset.c:validate_change() for the exact
coding of these rules.
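A rough userspace sketch of those three checks, using single-word masks
purely for illustration (the real code operates on full cpumasks and
nodemasks and walks the actual sibling list):

	/* Return 0 if the proposed cpus/flag are acceptable, -1 otherwise. */
	static int toy_validate(unsigned long parent_cpus, int parent_exclusive,
				const unsigned long *sibling_cpus, int nsiblings,
				unsigned long new_cpus, int new_exclusive)
	{
		int i;

		/* 1) CPUs must be a subset of the parent's CPUs. */
		if (new_cpus & ~parent_cpus)
			return -1;
		/* 2) May be CPU exclusive only if the parent is. */
		if (new_exclusive && !parent_exclusive)
			return -1;
		/* 3) An exclusive cpuset may not overlap any sibling. */
		if (new_exclusive)
			for (i = 0; i < nsiblings; i++)
				if (new_cpus & sibling_cpus[i])
					return -1;
		return 0;
	}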

If we followed your suggestion, Erich, would these rules still hold?
I can't imagine that the CKRM folks have any existing hierarchies with
these particular rules.  They would need to if we went this way.

On the flip side, what additional rules, if any, would CKRM impose
on this hierarchy?

The other reason that this suggestion worries me is a bit more
philosophical.  I'm sure that for all the other, well known,
resources that CKRM manages, no one is proposing replacing whatever
existing names and mechanisms exist for those resources, such as
bandwidth, compute cycles, memory, ...  Rather I presume that CKRM
provides an additional resource management layer on top of the
existing resources, which retain their classic names and apparatus.

What you seem to be suggesting here, especially with this nice
picture from your next post:

        The files in cpusets are:
         - cpus: list of CPUs in that cpuset
         - mems: list of Memory Nodes in that cpuset
         - cpu_exclusive flag: is cpu placement exclusive?
         - mem_exclusive flag: is memory placement exclusive?
         - tasks: list of tasks (by pid) attached to that cpuset
        The files in a CKRM class directory:
         - stats   : statistics (not needed for cpusets)
         - shares  : could contain cpus, mems, cpu_exclusive, mem_exclusive
         - members : same as reading /dev/cpusets/.../tasks
         - target  : same as writing /dev/cpusets/.../tasks

        Changing the "shares" would mean something like
          echo "cpus +6-10" > .../shares

would remove the cpuset specific interface forever, leaving it only
visible via a more generic "shares, members, target" interface suitable
for abstract resource management.

I am afraid that this would make it harder for new users of cpusets to
figure them out.  Just cpusets by themselves add a new and strange layer
of abstraction, that will require a little bit of head scratching (as
Martin Bligh can testify to, from recent experience ;) for those
administering and managing the big iron where cpusets will be useful. 

To add yet another layer of abstractions on top of that, from the CKRM
world, might send quite a few users into mental overload, doing the
usual stupid things we all do when we have given up on understanding and
are just thrashing about, trying to get something to work.

I think we are onto something useful here, the hierarchical organizing
of compute resources of CPU and Memory, which will become increasingly
relevant in the coming years, with bigger machines and more complex
compute and memory architectures.

I'd hate to see cpusets hidden behind resource management terms from day
one.

And, looking at it from the CKRM side (not sure I can, I'll try ...)
would it not seem a bit odd to a CKRM user that just one of the resource
types managed, these cpusets, had no apparent existence outside of the
CKRM hierarchy, unlike all the other resources, which existed a priori,
and, I presume, continue their independent existence?

Obviously, I could use a little CKRM expertise here.

But my inclination is to continue to view these two projects as separate,
with the potential that CKRM will someday add cpusets to the resource types
that it can manage.

Thank-you.

-- 
                          I won't rest till it's the best ...
                          Programmer, Linux Scalability
                          Paul Jackson <pj@sgi.com> 1.650.933.1373

^ permalink raw reply	[flat|nested] 233+ messages in thread

* Re: [Lse-tech] [PATCH] cpusets - big numa cpu and memory placement
  2004-08-06 15:48             ` Hubertus Franke
@ 2004-08-07  6:30               ` Paul Jackson
  2004-08-07  6:45               ` Paul Jackson
  1 sibling, 0 replies; 233+ messages in thread
From: Paul Jackson @ 2004-08-07  6:30 UTC (permalink / raw)
  To: Hubertus Franke
  Cc: mbligh, efocht, lse-tech, akpm, hch, steiner, jbarnes,
	sylvain.jeaugey, djh, linux-kernel, colpatch, Simon.Derr, ak,
	sivanich

Hubertus wrote:
> As indicated above, this would mean to create a resource controller
> and assign mask to them, which is not what we have done so far, as
> our current controllers are more share focused.

Could you explain this a bit?  In particular, the phrases
"assign mask" and "share focused" went wizzing right on
past me.

-- 
                          I won't rest till it's the best ...
                          Programmer, Linux Scalability
                          Paul Jackson <pj@sgi.com> 1.650.933.1373

^ permalink raw reply	[flat|nested] 233+ messages in thread

* Re: [Lse-tech] [PATCH] cpusets - big numa cpu and memory placement
  2004-08-06 15:48             ` Hubertus Franke
  2004-08-07  6:30               ` Paul Jackson
@ 2004-08-07  6:45               ` Paul Jackson
  1 sibling, 0 replies; 233+ messages in thread
From: Paul Jackson @ 2004-08-07  6:45 UTC (permalink / raw)
  To: Hubertus Franke
  Cc: mbligh, efocht, lse-tech, akpm, hch, steiner, jbarnes,
	sylvain.jeaugey, djh, linux-kernel, colpatch, Simon.Derr, ak,
	sivanich

Hubertus asked:
> While we are on the topic, do you envision these sets to be somewhat 
> hierarchical or simply a flat hierarchy ?

I'm not sure what you mean by the distinction between a "somewhat"
hierarchy and a "simply flat" hierarchy ... I'll guess you're asking how
deep we envision these sets being.

I'd envision they start out just one or two deep, then over time they
tend to reflect the several layer deep organizational structure of the
institution paying for the big iron, _plus_ another layer or two to
handle the cpu/memory placement needs of more complex applications.

With occasional examples a few times that deep.

-- 
                          I won't rest till it's the best ...
                          Programmer, Linux Scalability
                          Paul Jackson <pj@sgi.com> 1.650.933.1373

^ permalink raw reply	[flat|nested] 233+ messages in thread

* Re: [Lse-tech] [PATCH] cpusets - big numa cpu and memory placement
  2004-08-07  6:10           ` Paul Jackson
@ 2004-08-07 15:22             ` Erich Focht
  2004-08-07 18:59               ` Paul Jackson
                                 ` (3 more replies)
  2004-08-08 19:58             ` Shailabh Nagar
  1 sibling, 4 replies; 233+ messages in thread
From: Erich Focht @ 2004-08-07 15:22 UTC (permalink / raw)
  To: Paul Jackson
  Cc: mbligh, lse-tech, akpm, hch, steiner, jbarnes, sylvain.jeaugey,
	djh, linux-kernel, colpatch, Simon.Derr, ak, sivanich

On Saturday 07 August 2004 08:10, Paul Jackson wrote:
> > I think cpusets and CKRM should be
> > made to come together. One of CKRM's user interfaces is a filesystem
> > with the file-tree representing the class hierarchy. It's the same for
> > cpusets.
> 
> Hmmm ... this suggestion worries me, for a couple of reasons.
> 
> Just because cpusets and CKRM both have a hierarchy represented in a
> file system doesn't mean it is, or can be, the same file system.  Not
> all trees are the same.

Cpusets are a complex resource which needs to be managed. You already
provided an interface for management but on the horizon there is this
CKRM thing... I really don't care too much about the interface as long
as it is comfortable (advocating for your bitset manipulation routines
here ;-). CKRM will some day come in and maybe try to unify the
resource control through a generalized interface. In my understanding,
CKRM "classes" are (for the cpusets resource) your "sets". I was
trying to anticipate that CKRM might want to present the single entry
point for managing resources, including cpusets.

If I understand correctly, CKRM is fine for simple resources like
amount of memory or cputime and designed to control flexible sharing
of these resources and ensure some degree of fairness. Cpusets is a
complex NUMA specific compound resource which actually only allows for
a rather static distribution across processes (especially with the
exclusive bits set). Including cpusets control into CKRM will be
trivial, because you already provide all that's needed.

What I proposed was to include cpusets ASAP. As we learned from
Hubertus, CKRM is undergoing some redesign (after the kernel summit),
so let's now get used to cpusets and forget about the generic resource
controller until that is mature to enter the kernel. When that happens
people might love the generic way of controlling resources and the
cpusets user interface will be yet another filesystem for controlling
some hierarchical structures... The complaints about the huge size of
the patch should therefore have in mind that we might well get rid of
the user interface part of it. The core infrastructure of cpusets will
be needed anyway and the amount of code is the absolutely required
minimum, IMHO.


> The other reason that this suggestion worries me is a bit more
> philosophical.  I'm sure that for all the other, well known,
> resources that CKRM manages, no one is proposing replacing whatever
> existing names and mechanisms exist for those resources, such as
> bandwidth, compute cycles, memory, ...  Rather I presume that CKRM
> provides an additional resource management layer on top of the
> existing resources, which retain their classic names and apparatus.
> [...]

I hope cpusets will be an "existing resource" when CKRM comes into
play. It's a compound resource built of cpus and memories (and the
name cpuset is a bit misleading) but it fully makes sense on a NUMA
machine to have these two elementary resources glued together. If CKRM
was to build a resource controller for cpu masks and memories, or two
separate resource controllers, the really acceptable end result would
look like the current cpusets infrastructure. So why waste time?

Later cpusets could borrow the user interface of CKRM or, if the
cpusets user interface is better suited, maybe we can just have a
/rcfs/cpusets/ directory tree with the current cpusets look and feel?
Question to CKRM people: would it make sense to have a class with
another way of control than the shares/targets/members files?

> I'd hate to see cpusets hidden behind resource management terms from day
> one.

That's an argument. Less RTFM mails, happier admins and users... A
better world ;-)

Regards,
Erich


^ permalink raw reply	[flat|nested] 233+ messages in thread

* Re: [Lse-tech] [PATCH] cpusets - big numa cpu and memory placement
  2004-08-07 15:22             ` Erich Focht
@ 2004-08-07 18:59               ` Paul Jackson
  2004-08-08  3:17               ` Paul Jackson
                                 ` (2 subsequent siblings)
  3 siblings, 0 replies; 233+ messages in thread
From: Paul Jackson @ 2004-08-07 18:59 UTC (permalink / raw)
  To: Erich Focht
  Cc: mbligh, lse-tech, akpm, hch, steiner, jbarnes, sylvain.jeaugey,
	djh, linux-kernel, colpatch, Simon.Derr, ak, sivanich

Erich wrote:
> (and the name cpuset is a bit misleading)

I've just begun reading your reply.  A quick note on the name "cpusets".
Yes, it is a bit misleading.

On SGI's Irix, they are 'cpusets'.  On SGI's 2.4 Linux kernels, the
kernel portion is called 'cpumemsets', and the user support 'cpusets'. 
Independently, Simon of Bull chose 'cpusets'.  For a while, I was
lobbying Simon to change it to 'cpumemsets', then I decided to heck with
it, as the shorter, sweeter name seemed to be the more commonly used.

-- 
                          I won't rest till it's the best ...
                          Programmer, Linux Scalability
                          Paul Jackson <pj@sgi.com> 1.650.933.1373

^ permalink raw reply	[flat|nested] 233+ messages in thread

* Re: [Lse-tech] [PATCH] cpusets - big numa cpu and memory placement
  2004-08-07 15:22             ` Erich Focht
  2004-08-07 18:59               ` Paul Jackson
@ 2004-08-08  3:17               ` Paul Jackson
  2004-08-08 14:50               ` Martin J. Bligh
  2004-08-08 20:22               ` Shailabh Nagar
  3 siblings, 0 replies; 233+ messages in thread
From: Paul Jackson @ 2004-08-08  3:17 UTC (permalink / raw)
  To: Erich Focht
  Cc: mbligh, lse-tech, akpm, hch, steiner, jbarnes, sylvain.jeaugey,
	djh, linux-kernel, colpatch, Simon.Derr, ak, sivanich

Erich wrote:
> The complaints about the huge size of the patch should therefore have
> in mind that we might well get rid of the user interface part of it.

To put some numbers on things, building 2.6.8-rc2-mm2 for arch=ia64,
with gcc 3.3.2, using sn2_defconfig, I see the following kernel text
byte costs:

	Enabling CONFIG_CPUSETS:   22384   (22028 cpuset.o, 356 hooks)
	The  bitmap list UI:        1552
	                           -----
	Total:                     23936

The bitmap list user interface is a fairly small part of the total.

Of the 22384 for CONFIG_CPUSETS, 22028 bytes is in kernel/cpuset.o and
the remaining 356 for the cpuset kernel hooks (which are essentially
zero if CONFIG_CPUSETS is disabled).


> The core infrastructure of cpusets will be needed anyway and the
> amount of code is the absolutely required minimum, IMHO.

I agree.  If anyone can see further opportunities to trim, let me know.


> What I proposed was to include cpusets ASAP

I agree.


>  A better world ;-)

Yeah !!

-- 
                          I won't rest till it's the best ...
                          Programmer, Linux Scalability
                          Paul Jackson <pj@sgi.com> 1.650.933.1373

^ permalink raw reply	[flat|nested] 233+ messages in thread

* Re: [Lse-tech] [PATCH] cpusets - big numa cpu and memory placement
  2004-08-07 15:22             ` Erich Focht
  2004-08-07 18:59               ` Paul Jackson
  2004-08-08  3:17               ` Paul Jackson
@ 2004-08-08 14:50               ` Martin J. Bligh
  2004-08-11  0:43                 ` Paul Jackson
  2004-08-11  9:40                 ` Erich Focht
  2004-08-08 20:22               ` Shailabh Nagar
  3 siblings, 2 replies; 233+ messages in thread
From: Martin J. Bligh @ 2004-08-08 14:50 UTC (permalink / raw)
  To: Erich Focht, Paul Jackson
  Cc: lse-tech, akpm, hch, steiner, jbarnes, sylvain.jeaugey, djh,
	linux-kernel, colpatch, Simon.Derr, ak, sivanich

> If I understand correctly, CKRM is fine for simple resources like
> amount of memory or cputime and designed to control flexible sharing
> of these resources and ensure some degree of fairness. Cpusets is a
> complex NUMA specific compound resource which actually only allows for
> a rather static distribution across processes (especially with the
> exclusive bits set). Including cpusets control into CKRM will be
> trivial, because you already provide all that's needed.

I'd disagree with this - both are mechanisms for controlling the amount
of CPU time and memory that processes get to use. They have fundamentally
the same objective ... having 2 mechanisms to do the same thing with
different interfaces doesn't seem like a good plan. I don't think CKRM is 
anything like as far away from being ready as you seem to be implying -
we're talking about a month or two, I think.

M.


^ permalink raw reply	[flat|nested] 233+ messages in thread

* Re: [Lse-tech] [PATCH] cpusets - big numa cpu and memory placement
  2004-08-07  6:10           ` Paul Jackson
  2004-08-07 15:22             ` Erich Focht
@ 2004-08-08 19:58             ` Shailabh Nagar
  2004-10-01 23:41               ` Andrew Morton
  1 sibling, 1 reply; 233+ messages in thread
From: Shailabh Nagar @ 2004-08-08 19:58 UTC (permalink / raw)
  To: Paul Jackson
  Cc: Erich Focht, mbligh, lse-tech, akpm, hch, steiner, jbarnes,
	sylvain.jeaugey, djh, linux-kernel, colpatch, Simon.Derr, ak,
	sivanich

Paul Jackson wrote:
> Erich Focht wrote:
> 
>>we (NEC) are also a potential user of this patch
> 
> 
> Good - welcome.
> 
> 
> 
>>I think cpusets and CKRM should be
>>made to come together. One of CKRM's user interfaces is a filesystem
>>with the file-tree representing the class hierarchy. It's the same for
>>cpusets.
> 
> 
> Hmmm ... this suggestion worries me, for a couple of reasons.
> 
> Just because cpusets and CKRM both have a hierarchy represented in a
> file system doesn't mean it is, or can be, the same file system.  Not
> all trees are the same.
> 
> Perhaps someone more expert in CKRM can help here.  The cpuset hierarchy
> has some strict semantics:
>  1) Any cpuset's CPUs and Memory must be a subset of its parent's.
>  2) A cpuset may be exclusive for CPU or Memory only if its parent is.
>  3) A CPU or Memory exclusive cpuset may not overlap its siblings.
> 
> See the routine kernel/cpuset.c:validate_change() for the exact
> coding of these rules.
> 
> If we followed your suggestion, Erich, would these rules still hold?
> I can't imagine that the CKRM folks have any existing hierarchies with
> these particular rules.  They would need to if we went this way.

As CKRM stands today, we wouldn't be able to impose these constraints 
for exactly the reasons you point out. The other controllers would not 
forbid the move of a task violating the above rules to a CKRM class but 
this controller (CKRM's version of cpusets) would. Currently, on a task 
move, CKRM's core calls per-controller callbacks so the controller can 
make modifications to the controller-specific per-class objects. But 
controllers can't prevent such a move.

However, one of the CKRM changes suggested in the Kernel Summit was to 
split up the controllers and not have them bundled within a "core" class 
as we call it. In this model, each task would directly belong to some 
controller-specific class.

If CKRM were to adopt this change, one *potential* (but not necessary) 
consequence, is to have multiple hierarchies, one per-controller, 
exposed to the user e.g. instead of /rcfs/taskclass/<sameclasstree>, we 
would have /rcfs/cpu/<oneclasstree> and /rcfs/mem/<anotherclasstree> etc.

In such a scenario, it would be more logical for the controller to 
constrain memberships (i.e. task moves, class share setting while it is 
part of a hierarchy etc.) and it would be easy for cpusets to get its 
semantics.


> 
> On the flip side, what additional rules, if any, would CKRM impose
> on this hierarchy?

Currently, we impose rules on the shares that one can set (child cannot 
have more than its parent, sibling shares should add up etc.) and we'd
discussed, but not implemented yet, some limit on how deep the common 
hierarchy would go.

> 
> The other reason that this suggestion worries me is a bit more
> philosophical.  I'm sure that for all the other, well known,
> resources that CKRM manages, no one is proposing replacing whatever
> existing names and mechanisms exist for those resources, such as
> bandwidth, compute cycles, memory, ...  Rather I presume that CKRM
> provides an additional resource management layer on top of the
> existing resources, which retain their classic names and apparatus.
> 
> What you seem to be suggesting here, especially with this nice
> picture from your next post:
> 
>         The files in cpusets are:
>          - cpus: list of CPUs in that cpuset
>          - mems: list of Memory Nodes in that cpuset
>          - cpu_exclusive flag: is cpu placement exclusive?
>          - mem_exclusive flag: is memory placement exclusive?
>          - tasks: list of tasks (by pid) attached to that cpuset
>         The files in a CKRM class directory:
>          - stats   : statistics (not needed for cpusets)
>          - shares  : could contain cpus, mems, cpu_exclusive, mem_exclusive
>          - members : same as reading /dev/cpusets/.../tasks
>          - target  : same as writing /dev/cpusets/.../tasks
> 
>         Changing the "shares" would mean something like
>           echo "cpus +6-10" > .../shares
> 
> would remove the cpuset specific interface forever, leaving it only
> visible via a more generic "shares, members, target" interface suitable
> for abstract resource management.
> 
> I am afraid that this would make it harder for new users of cpusets to
> figure them out.  Just cpusets by themselves add a new and strange layer
> of abstraction, that will require a little bit of head scratching (as
> Martin Bligh can testify to, from recent experience ;) for those
> administering and managing the big iron where cpusets will be useful. 
> 
> To add yet another layer of abstractions on top of that, from the CKRM
> world, might send quite a few users into mental overload, doing the
> usual stupid things we all do when we have given up on understanding and
> are just thrashing about, trying to get something to work.
> 
> I think we are onto something useful here, the hierarchical organizing
> of compute resources of CPU and Memory, which will become increasingly
> relevant in the coming years, with bigger machines and more complex
> compute and memory architectures.
> 
> I'd hate to see cpusets hidden behind resource management terms from day
> one.

Yup, that's a valid concern. In this current round of CKRM redesign,
we're considering whether controllers should be allowed to export their
own interface (in a sense) by accepting different kinds of share
settings. That is already true today in the case of the "stats" and
"config" virtual files, which don't have any CKRM-imposed semantics.
Only "shares" has a CKRM-defined set of values, not all of which are
useful or will be implemented by a controller. We're debating whether to
make that one controller-dependent too. If that happens, it'll make it
somewhat better for cpusets. But I'm not sure if we'd want to go so far
as to allow controllers to define what virtual files they export... we
do that today for the classification engine because it is an entirely
different beast, but the controllers are similar...

> And, looking at it from the CKRM side (not sure I can, I'll try ...)
> would it not seem a bit odd to a CKRM user that just one of the resource
> types managed, these cpusets, had no apparent existence outside of the
> CKRM hierarchy, unlike all the other resources, which existed a priori,
> and, I presume, continue their independent existence?

From just the viewpoint of cpusets (not adding mem), it seems to be
quite similar to what CKRM's other controllers are doing - grouping a
per-task control (in your case, sched_setaffinity) using hierarchical
sets.

> 
> Obviously, I could use a little CKRM expertise here.
> 
> But my inclination is to continue to view these two projects as separate,
> with the potential that CKRM will someday add cpusets to the resource types
> that it can manage.

Umm... I'm quite sure you mean you'll contribute code to do that,
right? :-)

It looks like the interface issue is the main one from both projects' 
pov. Hopefully things will become clearer in the next week or so when 
ckrm-tech thrashes out the Kernel Summit suggestion (it has other 
ramifications besides interface).

-- Shailabh
> 
> Thank-you.
> 


^ permalink raw reply	[flat|nested] 233+ messages in thread

* Re: [Lse-tech] [PATCH] cpusets - big numa cpu and memory placement
  2004-08-07 15:22             ` Erich Focht
                                 ` (2 preceding siblings ...)
  2004-08-08 14:50               ` Martin J. Bligh
@ 2004-08-08 20:22               ` Shailabh Nagar
  2004-08-09 15:57                 ` Hubertus Franke
  3 siblings, 1 reply; 233+ messages in thread
From: Shailabh Nagar @ 2004-08-08 20:22 UTC (permalink / raw)
  To: Erich Focht
  Cc: Paul Jackson, mbligh, lse-tech, akpm, hch, steiner, jbarnes,
	sylvain.jeaugey, djh, linux-kernel, colpatch, Simon.Derr, ak,
	sivanich

Erich Focht wrote:

> On Saturday 07 August 2004 08:10, Paul Jackson wrote:
> 
> Cpusets are a complex resource which needs to be managed. You already
> provided an interface for management but on the horizon there is this
> CKRM thing... I really don't care too much about the interface as long
> as it is comfortable (advocating for your bitset manipulation routines
> here ;-). CKRM will some day come in and maybe try to unify the
> resource control through a generalized interface. In my understand
> CKRM "classes" are (for the cpusets resource) your "sets". I was
> trying to anticipate that CKRM might want to present the single entry
> point for managing resources, including cpusets.

That is the intended utility of the CKRM core+interface, at least for any 
resource for which it is useful to impose controls on a group of objects 
at once, as opposed to individually.

> 
> If I understand correctly, CKRM is fine for simple resources like
> amount of memory or cputime and designed to control flexible sharing
> of these resources and ensure some degree of fairness. Cpusets is a
> complex NUMA specific compound resource which actually only allows for
> a rather static distribution across processes (especially with the
> exclusive bits set). Including cpusets control into CKRM will be
> trivial, because you already provide all that's needed.

If we move to the new model where each controller has an independent 
hierarchy, this becomes a real possibility. We'd still need to negotiate 
on the interface. Implementationally it's pretty simple... the main 
question is - should there be some uniformity in the interfaces at the 
/rcfs/<?> level for each controller or not. If there isn't, the only 
thing that CKRM brings to the table (for cpusets) is the filesystem.

> 
> What I proposed was to include cpusets ASAP. As we learned from
> Hubertus, CKRM is undergoing some redesign (after the kernel summit),
> so let's now get used to cpusets and forget about the generic resource
> controller until that is mature to enter the kernel. 

> When that happens
> people might love the generic way of controlling resources 

Might? :-) We think it's a home run :-)

> and the
> cpusets user interface will be yet another filesystem for controlling
> some hierarchical structures... The complaints about the huge size of
> the patch should therefore have in mind that we might well get rid of
> the user interface part of it. The core infrastructure of cpusets will
> be needed anyway and the amount of code is the absolutely required
> minimum, IMHO.
> 
> 
> 
>>The other reason that this suggestion worries me is a bit more
>>philosophical.  I'm sure that for all the other, well known,
>>resources that CKRM manages, no one is proposing replacing whatever
>>existing names and mechanisms exist for those resources, such as
>>bandwidth, compute cycles, memory, ...  Rather I presume that CKRM
>>provides an additional resource management layer on top of the
>>existing resources, which retain their classic names and apparatus.
>>[...]
> 
> 
> I hope cpusets will be an "existing resource" when CKRM comes into
> play. It's a compound resource built of cpus and memories (and the
> name cpuset is a bit misleading) but it fully makes sense on a NUMA
> machine to have these two elementary resources glued together. If CKRM
> was to build a resource controller for cpu masks and memories, or two
> separate resource controllers, the really acceptable end result would
> look like the current cpusets infrastructure. So why waste time?
> 
> Later cpusets could borrow the user interface of CKRM or, if the
> cpusets user interface is better suited, maybe we can just have a
> /rcfs/cpusets/ directory tree with the current cpusets look and feel?
> Question to CKRM people: would it make sense to have a class with
> another way of control than the shares/targets/members files?

Need to mull this over in ckrm-tech, as mentioned earlier.
There are two issues:
- should controllers be allowed to create their own virtual files ?
- are all of the existing shares/targets/members files sufficiently 
useful to existing and future controllers to make them available by 
default (and offer the user some consistency)?

I feel the answer to the second one is a yes though I'm not convinced 
that the attributes within the shares file need to be the same.

But saying yes to the first one will mean controllers have to implement 
some filesystem-related code (as is done by CKRM's Classification Engine 
modules, which also sit under /rcfs but have a completely different 
interface in terms of virtual files). We could work something out where 
controllers could use common code where available and then roll their 
own extras.

If there's interest in this idea from the cpusets team and if we can 
come up with a way in which cpu/mem/io etc. could continue to share 
common rcfs code (as they do today) CKRM could consider this option.

-- Shailabh

^ permalink raw reply	[flat|nested] 233+ messages in thread

* Re: [Lse-tech] [PATCH] new bitmap list format (for cpusets)
  2004-08-05 20:47 ` [Lse-tech] [PATCH] new bitmap list format (for cpusets) Martin J. Bligh
  2004-08-05 21:45   ` Paul Jackson
@ 2004-08-09  8:01   ` Paul Jackson
  2004-08-09 14:49     ` Martin J. Bligh
  1 sibling, 1 reply; 233+ messages in thread
From: Paul Jackson @ 2004-08-09  8:01 UTC (permalink / raw)
  To: Martin J. Bligh
  Cc: akpm, hch, steiner, jbarnes, sylvain.jeaugey, djh, linux-kernel,
	colpatch, Simon.Derr, ak, lse-tech, sivanich

I was looking at this bitmap list format patch over the weekend, and
came to the conclusion that the basic list format, as in the example:

	0,3,5,8-15

was a valuable improvement over a fixed length hex mask, but that on the
other hand the support for:

	the prefix characters '=', '-', '+', or '!'

was fluff that few would learn to use, and fewer would find essential.
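For a concrete sense of the difference, here is the same set written
both ways (a tiny standalone illustration, not code from the patch):

	#include <stdio.h>

	int main(void)
	{
		/* CPUs 0, 3, 5 and 8 through 15 as a bitmask. */
		unsigned long mask = (1UL << 0) | (1UL << 3) |
				     (1UL << 5) | (0xffUL << 8);

		printf("hex mask: %lx\n", mask);	/* prints "ff29" */
		printf("list:     0,3,5,8-15\n");	/* same set, readable at a glance */
		return 0;
	}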

So I redid the bitmap list format patch, removing the prefix character
support, and making another pass at compacting the input 'write' side
code in bitmap_parselist().

The kernel text costs in bytes for these two patches, on an i386 build,
are now:

	bitmap lists:	 592
	cpusets:	7718
			----
	total:		8310

Here's the new bitmap list patch.  It applies to 2.6.8-rc2-mm2.
It replaces the earlier bitmap list patch, that began this thread
on August 5, 2004.

========

A bitmap print and parse format that provides lists of ranges of
numbers, to be first used for by cpusets (next patch).

Cpusets provide a way to manage subsets of CPUs and Memory Nodes
for scheduling and memory placement, via a new virtual file system,
usually mounted at /dev/cpuset.  Manipulation of cpusets can be done
directly via this file system, from the shell.

However, manipulating 512 bit cpumasks or 256 bit nodemasks (which
will get bigger) via hex mask strings is painful for humans.

The intention is to provide a format for the cpu and memory mask files
in /dev/cpusets that will stand the test of time.  This format is
supported by a couple of new lib/bitmap.c routines, for printing and
parsing these strings.  Wrappers for cpumask and nodemask are provided.

 include/linux/bitmap.h   |    8 +++
 include/linux/cpumask.h  |   22 +++++++++-
 include/linux/nodemask.h |   22 +++++++++-
 lib/bitmap.c             |  103 +++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 150 insertions(+), 5 deletions(-)

Signed-off-by: Paul Jackson <pj@sgi.com>

Index: 2.6.8-rc2-mm2/include/linux/bitmap.h
===================================================================
--- 2.6.8-rc2-mm2.orig/include/linux/bitmap.h	2004-08-08 23:17:35.000000000 -0700
+++ 2.6.8-rc2-mm2/include/linux/bitmap.h	2004-08-08 23:24:57.000000000 -0700
@@ -41,7 +41,9 @@
  * bitmap_shift_right(dst, src, n, nbits)	*dst = *src >> n
  * bitmap_shift_left(dst, src, n, nbits)	*dst = *src << n
  * bitmap_scnprintf(buf, len, src, nbits)	Print bitmap src to buf
- * bitmap_parse(ubuf, ulen, dst, nbits)		Parse bitmap dst from buf
+ * bitmap_parse(ubuf, ulen, dst, nbits)		Parse bitmap dst from user buf
+ * bitmap_scnlistprintf(buf, len, src, nbits)	Print bitmap src as list to buf
+ * bitmap_parselist(buf, dst, nbits)		Parse bitmap dst from list
  */
 
 /*
@@ -98,6 +100,10 @@ extern int bitmap_scnprintf(char *buf, u
 			const unsigned long *src, int nbits);
 extern int bitmap_parse(const char __user *ubuf, unsigned int ulen,
 			unsigned long *dst, int nbits);
+extern int bitmap_scnlistprintf(char *buf, unsigned int len,
+			const unsigned long *src, int nbits);
+extern int bitmap_parselist(const char *buf, unsigned long *maskp,
+			int nmaskbits);
 extern int bitmap_find_free_region(unsigned long *bitmap, int bits, int order);
 extern void bitmap_release_region(unsigned long *bitmap, int pos, int order);
 extern int bitmap_allocate_region(unsigned long *bitmap, int pos, int order);
Index: 2.6.8-rc2-mm2/include/linux/cpumask.h
===================================================================
--- 2.6.8-rc2-mm2.orig/include/linux/cpumask.h	2004-08-08 23:17:35.000000000 -0700
+++ 2.6.8-rc2-mm2/include/linux/cpumask.h	2004-08-08 23:24:57.000000000 -0700
@@ -10,6 +10,8 @@
  *
  * For details of cpumask_scnprintf() and cpumask_parse(),
  * see bitmap_scnprintf() and bitmap_parse() in lib/bitmap.c.
+ * For details of cpulist_scnprintf() and cpulist_parse(), see
+ * bitmap_scnlistprintf() and bitmap_parselist(), also in bitmap.c.
  *
  * The available cpumask operations are:
  *
@@ -46,6 +48,8 @@
  *
  * int cpumask_scnprintf(buf, len, mask) Format cpumask for printing
  * int cpumask_parse(ubuf, ulen, mask)	Parse ascii string as cpumask
+ * int cpulist_scnprintf(buf, len, mask) Format cpumask as list for printing
+ * int cpulist_parse(buf, map)		Parse ascii string as cpulist
  *
  * for_each_cpu_mask(cpu, mask)		for-loop cpu over mask
  *
@@ -268,14 +272,28 @@ static inline int __cpumask_scnprintf(ch
 	return bitmap_scnprintf(buf, len, srcp->bits, nbits);
 }
 
-#define cpumask_parse(ubuf, ulen, src) \
-			__cpumask_parse((ubuf), (ulen), &(src), NR_CPUS)
+#define cpumask_parse(ubuf, ulen, dst) \
+			__cpumask_parse((ubuf), (ulen), &(dst), NR_CPUS)
 static inline int __cpumask_parse(const char __user *buf, int len,
 					cpumask_t *dstp, int nbits)
 {
 	return bitmap_parse(buf, len, dstp->bits, nbits);
 }
 
+#define cpulist_scnprintf(buf, len, src) \
+			__cpulist_scnprintf((buf), (len), &(src), NR_CPUS)
+static inline int __cpulist_scnprintf(char *buf, int len,
+					const cpumask_t *srcp, int nbits)
+{
+	return bitmap_scnlistprintf(buf, len, srcp->bits, nbits);
+}
+
+#define cpulist_parse(buf, dst) __cpulist_parse((buf), &(dst), NR_CPUS)
+static inline int __cpulist_parse(const char *buf, cpumask_t *dstp, int nbits)
+{
+	return bitmap_parselist(buf, dstp->bits, nbits);
+}
+
 #if NR_CPUS > 1
 #define for_each_cpu_mask(cpu, mask)		\
 	for ((cpu) = first_cpu(mask);		\
Index: 2.6.8-rc2-mm2/include/linux/nodemask.h
===================================================================
--- 2.6.8-rc2-mm2.orig/include/linux/nodemask.h	2004-08-08 23:17:35.000000000 -0700
+++ 2.6.8-rc2-mm2/include/linux/nodemask.h	2004-08-08 23:24:57.000000000 -0700
@@ -10,6 +10,8 @@
  *
  * For details of nodemask_scnprintf() and nodemask_parse(),
  * see bitmap_scnprintf() and bitmap_parse() in lib/bitmap.c.
+ * For details of nodelist_scnprintf() and nodelist_parse(), see
+ * bitmap_scnlistprintf() and bitmap_parselist(), also in bitmap.c.
  *
  * The available nodemask operations are:
  *
@@ -46,6 +48,8 @@
  *
  * int nodemask_scnprintf(buf, len, mask) Format nodemask for printing
  * int nodemask_parse(ubuf, ulen, mask)	Parse ascii string as nodemask
+ * int nodelist_scnprintf(buf, len, mask) Format nodemask as list for printing
+ * int nodelist_parse(buf, map)		Parse ascii string as nodelist
  *
  * for_each_node_mask(node, mask)	for-loop node over mask
  *
@@ -271,14 +275,28 @@ static inline int __nodemask_scnprintf(c
 	return bitmap_scnprintf(buf, len, srcp->bits, nbits);
 }
 
-#define nodemask_parse(ubuf, ulen, src) \
-			__nodemask_parse((ubuf), (ulen), &(src), MAX_NUMNODES)
+#define nodemask_parse(ubuf, ulen, dst) \
+			__nodemask_parse((ubuf), (ulen), &(dst), MAX_NUMNODES)
 static inline int __nodemask_parse(const char __user *buf, int len,
 					nodemask_t *dstp, int nbits)
 {
 	return bitmap_parse(buf, len, dstp->bits, nbits);
 }
 
+#define nodelist_scnprintf(buf, len, src) \
+			__nodelist_scnprintf((buf), (len), &(src), MAX_NUMNODES)
+static inline int __nodelist_scnprintf(char *buf, int len,
+					const nodemask_t *srcp, int nbits)
+{
+	return bitmap_scnlistprintf(buf, len, srcp->bits, nbits);
+}
+
+#define nodelist_parse(buf, dst) __nodelist_parse((buf), &(dst), MAX_NUMNODES)
+static inline int __nodelist_parse(const char *buf, nodemask_t *dstp, int nbits)
+{
+	return bitmap_parselist(buf, dstp->bits, nbits);
+}
+
 #if MAX_NUMNODES > 1
 #define for_each_node_mask(node, mask)			\
 	for ((node) = first_node(mask);			\
Index: 2.6.8-rc2-mm2/lib/bitmap.c
===================================================================
--- 2.6.8-rc2-mm2.orig/lib/bitmap.c	2004-08-08 23:17:35.000000000 -0700
+++ 2.6.8-rc2-mm2/lib/bitmap.c	2004-08-09 00:11:57.000000000 -0700
@@ -291,6 +291,7 @@ EXPORT_SYMBOL(__bitmap_weight);
 #define nbits_to_hold_value(val)	fls(val)
 #define roundup_power2(val,modulus)	(((val) + (modulus) - 1) & ~((modulus) - 1))
 #define unhex(c)			(isdigit(c) ? (c - '0') : (toupper(c) - 'A' + 10))
+#define BASEDEC 10		/* fancier cpuset lists input in decimal */
 
 /**
  * bitmap_scnprintf - convert bitmap to an ASCII hex string.
@@ -409,6 +410,108 @@ int bitmap_parse(const char __user *ubuf
 }
 EXPORT_SYMBOL(bitmap_parse);
 
+/*
+ * bscnl_emit(buf, buflen, rbot, rtop, len)
+ *
+ * Helper routine for bitmap_scnlistprintf().  Write decimal number
+ * or range to buf, suppressing output past buf+buflen, with optional
+ * comma-prefix.  Return len of what would be written to buf, if it
+ * all fit.
+ */
+static inline int bscnl_emit(char *buf, int buflen, int rbot, int rtop, int len)
+{
+	if (len > 0)
+		len += scnprintf(buf + len, buflen - len, ",");
+	if (rbot == rtop)
+		len += scnprintf(buf + len, buflen - len, "%d", rbot);
+	else
+		len += scnprintf(buf + len, buflen - len, "%d-%d", rbot, rtop);
+	return len;
+}
+
+/**
+ * bitmap_scnlistprintf - convert bitmap to list format ASCII string
+ * @buf: byte buffer into which string is placed
+ * @buflen: reserved size of @buf, in bytes
+ * @maskp: pointer to bitmap to convert
+ * @nmaskbits: size of bitmap, in bits
+ *
+ * Output format is a comma-separated list of decimal numbers and
+ * ranges.  Consecutively set bits are shown as two hyphen-separated
+ * decimal numbers, the smallest and largest bit numbers set in
+ * the range.  Output format is compatible with the format
+ * accepted as input by bitmap_parselist().
+ *
+ * The return value is the number of characters which would be
+ * generated for the given input, excluding the trailing '\0', as
+ * per ISO C99.
+ */
+int bitmap_scnlistprintf(char *buf, unsigned int buflen,
+	const unsigned long *maskp, int nmaskbits)
+{
+	int len = 0;
+	/* current bit is 'cur', most recently seen range is [rbot, rtop] */
+	int cur, rbot, rtop;
+
+	rbot = cur = find_first_bit(maskp, nmaskbits);
+	while (cur < nmaskbits) {
+		rtop = cur;
+		cur = find_next_bit(maskp, nmaskbits, cur+1);
+		if (cur >= nmaskbits || cur > rtop + 1) {
+			len = bscnl_emit(buf, buflen, rbot, rtop, len);
+			rbot = cur;
+		}
+	}
+	return len;
+}
+EXPORT_SYMBOL(bitmap_scnlistprintf);
+
+/**
+ * bitmap_parselist - convert list format ASCII string to bitmap
+ * @bp: read nul-terminated user string from this buffer
+ * @maskp: write resulting mask here
+ * @nmaskbits: number of bits in mask to be written
+ *
+ * Input format is a comma-separated list of decimal numbers and
+ * ranges.  Consecutively set bits are shown as two hyphen-separated
+ * decimal numbers, the smallest and largest bit numbers set in
+ * the range.
+ *
+ * Returns 0 on success, -errno on invalid input strings:
+ *    -EINVAL:   second number in range smaller than first
+ *    -EINVAL:   invalid character in string
+ *    -ERANGE:   bit number specified too large for mask
+ */
+int bitmap_parselist(const char *bp, unsigned long *maskp, int nmaskbits)
+{
+	unsigned a, b;
+
+	bitmap_zero(maskp, nmaskbits);
+	do {
+		if (!isdigit(*bp))
+			return -EINVAL;
+		b = a = simple_strtoul(bp, (char **)&bp, BASEDEC);
+		if (*bp == '-') {
+			bp++;
+			if (!isdigit(*bp))
+				return -EINVAL;
+			b = simple_strtoul(bp, (char **)&bp, BASEDEC);
+		}
+		if (!(a <= b))
+			return -EINVAL;
+		if (b >= nmaskbits)
+			return -ERANGE;
+		while (a <= b) {
+			set_bit(a, maskp);
+			a++;
+		}
+		if (*bp == ',')
+			bp++;
+	} while (*bp != '\0' && *bp != '\n');
+	return 0;
+}
+EXPORT_SYMBOL(bitmap_parselist);
+
 /**
  *	bitmap_find_free_region - find a contiguous aligned mem region
  *	@bitmap: an array of unsigned longs corresponding to the bitmap


-- 
                          I won't rest till it's the best ...
                          Programmer, Linux Scalability
                          Paul Jackson <pj@sgi.com> 1.650.933.1373

^ permalink raw reply	[flat|nested] 233+ messages in thread

* Re: [Lse-tech] [PATCH] new bitmap list format (for cpusets)
  2004-08-09  8:01   ` Paul Jackson
@ 2004-08-09 14:49     ` Martin J. Bligh
  2004-08-10 23:43       ` Paul Jackson
  0 siblings, 1 reply; 233+ messages in thread
From: Martin J. Bligh @ 2004-08-09 14:49 UTC (permalink / raw)
  To: Paul Jackson
  Cc: akpm, hch, steiner, jbarnes, sylvain.jeaugey, djh, linux-kernel,
	colpatch, Simon.Derr, ak, lse-tech, sivanich

--Paul Jackson <pj@sgi.com> wrote (on Monday, August 09, 2004 01:01:06 -0700):

> I was looking at this bitmap list format patch over the weekend, and
> came to the conclusion that the basic list format, as in the example:
> 
> 	0,3,5,8-15
> 
> was a valuable improvement over a fixed length hex mask, but that on the
> other hand the support for:
> 
> 	the prefix characters '=', '-', '+', or '!'
> 
> was fluff, that few would learn to use, and fewer find essential.

OK, that looks a lot more palatable ;-)

Question:  it looks like you're only parsing on the read-side to me (which
is good, since it's highly unlikely to break anything existent), but the
function bitmap_scnlistprintf is still in there - is that needed? I can't
see any callers, but I might be missing one? I guess it might be for your
other patch, but it'd seem to make the parsing a whole lot more complicated
in userspace for the reader if we did use that ...

It looks like cpulist_scnprintf calls __cpulist_scnprintf, which just calls
bitmap_scnlistprintf, but nobody calls either of the former 2 ... ditto for
nodelist_scnprintf.

M.

PS. Similarly, do we really need both cpumask_parse and __cpumask_parse
in front of bitmap_parse? One seems to make sense for abstracting the generic
parse routine, but 2 seems like overkill ;-) (yeah, I know that was there
before this patch ... just seems odd).

^ permalink raw reply	[flat|nested] 233+ messages in thread

* Re: [Lse-tech] [PATCH] cpusets - big numa cpu and memory placement
  2004-08-08 20:22               ` Shailabh Nagar
@ 2004-08-09 15:57                 ` Hubertus Franke
  2004-08-10 11:31                   ` [ckrm-tech] " Paul Jackson
  0 siblings, 1 reply; 233+ messages in thread
From: Hubertus Franke @ 2004-08-09 15:57 UTC (permalink / raw)
  To: Shailabh Nagar
  Cc: Erich Focht, Paul Jackson, mbligh, lse-tech, akpm, hch, steiner,
	jbarnes, sylvain.jeaugey, djh, linux-kernel, colpatch,
	Simon.Derr, ak, sivanich, ckrm-tech

Please add ckrm-tech@lists.sourceforge.net if CKRM issues are raised.

See further comments to this thread below.

-- Hubertus

Shailabh Nagar wrote:

> Erich Focht wrote:
> 
>> On Saturday 07 August 2004 08:10, Paul Jackson wrote:
>>
>> Cpusets are a complex resource which needs to be managed. You already
>> provided an interface for management but on the horizon there is this
>> CKRM thing... I really don't care too much about the interface as long
>> as it is comfortable (advocating for your bitset manipulation routines
>> here ;-). CKRM will some day come in and maybe try to unify the
>> resource control through a generalized interface. In my understanding
>> CKRM "classes" are (for the cpusets resource) your "sets". I was
>> trying to anticipate that CKRM might want to present the single entry
>> point for managing resources, including cpusets.
> 
> 
> That is the intended utility of the CKRM core+interface, at least for any 
> resource for which it is useful to impose controls on a group of objects 
> at once, as opposed to individually.
> 
>>
>> If I understand correctly, CKRM is fine for simple resources like
>> amount of memory or cputime and designed to control flexible sharing
>> of these resources and ensure some degree of fairness. Cpusets is a
>> complex NUMA specific compound resource which actually only allows for
>> a rather static distribution across processes (especially with the
>> exclusive bits set). Including cpusets control into CKRM will be
>> trivial, because you already provide all that's needed.
> 
> 
> If we move to the new model where each controller has an independent 
> hierarchy, this becomes a real possibility. We'd still need to negotiate 
> on the interface. Implementationally it's pretty simple....the main 
> question is - should there be some uniformity in the interfaces at the 
> /rcfs/<?> level for each controller or not. If there isn't, the only 
> thing that CKRM brings to the table (for cpusets) is the filesystem.
> 
>>
>> What I proposed was to include cpusets ASAP. As we learned from
>> Hubertus, CKRM is undergoing some redesign (after the kernel summit),
>> so let's now get used to cpusets and forget about the generic resource
>> controller until that is mature to enter the kernel. 
> 

Let's look at where the restructuring is conceptually heading.
As indicated by Shailabh above (and requested at the kernel summit),
the resource controllers are becoming external entities in that they
will be addressed directly through the /rcfs/<rc>/<class-hierarchy>,
rather than indirectly through their association with the classtypes,
as they are right now.

In essence, the /rcfs interface can be used if a strict hierarchy can be
generated in the class hierarchy for a given resource.
Furthermore, each resource controller manipulates a set of attributes 
and constraints. Today we are talking about shares (min, max, guarantee).
There is no reason why these attributes/constraints cannot be resource 
controller specific. For instance, for cpusets, the attribute would 
be "cpus_allowed" and the controller would verify its own constraints,
such as cpus_allowed having to be a subset of its parent's cpus.
Whether at this point "shares" is still the right filename is debatable.


> Might ? :-) We think its a home run :-)
> 
>> and the
>> cpusets user interface will be yet another filesystem for controlling
>> some hierarchical structures... The complaints about the huge size of
>> the patch should therefore have in mind that we might well get rid of
>> the user interface part of it. The core infrastructure of cpusets will
>> be needed anyway and the amount of code is the absolutely required
>> minimum, IMHO.
>>
>>
>>
>>> The other reason that this suggestion worries me is a bit more
>>> philosophical.  I'm sure that for all the other, well known,
>>> resources that CKRM manages, no one is proposing replacing whatever
>>> existing names and mechanisms exist for those resources, such as
>>> bandwidth, compute cycles, memory, ...  Rather I presume that CKRM
>>> provides an additional resource management layer on top of the
>>> existing resources, which retain their classic names and apparatus.
>>> [...]
>>
>>
>>
>> I hope cpusets will be an "existing resource" when CKRM comes into
>> play. It's a compound resource built of cpus and memories (and the
>> name cpuset is a bit misleading) but it fully makes sense on a NUMA
>> machine to have these two elementary resources glued together. If CKRM
>> was to build a resource controller for cpu masks and memories, or two
>> separate resource controllers, the really acceptable end result would
>> look like the current cpusets infrastructure. So why waste time?
>>
>> Later cpusets could borrow the user interface of CKRM or, if the
>> cpusets user interface is better suited, maybe we can just have a
>> /rcfs/cpusets/ directory tree with the current cpusets look and feel?
>> Question to CKRM people: would it make sense to have a class with
>> another way of control than the shares/targets/members files?

See above. I think if we relax the fixed attributes that currently
exist for "shares" and "stats" into something where the attribute
names are verified and interpreted by the resource controller, then
that's effectively what you suggest here.

> 
> 
> Need to mull this over in ckrm-tech, as mentioned earlier.
> There are two issues:
> - should controllers be allowed to create their own virtual files ?
> - are all of the existing shares/targets/members files sufficiently 
> useful to existing and future controllers to make them available by 
> default (and offer the user some consistency) ?
> 
> I feel the answer to the second one is a yes though I'm not convinced 
> that the attributes within the shares file need to be the same.
> 
> But saying yes to the first one will mean controllers have to implement 
> some filesystem-related code (as is done by CKRM's Classification Engine 
> modules, which also sit under /rcfs but have a completely different 
> interface in terms of virtual files). We could work something out where 
> controllers could use common code where available and then roll their 
> own extras.

I don't think we need to worry about the file system here (yet).
rcfs takes care of the class object hierarchy and passes (as done today
in other cases) its attribute-setting strings down to the resource 
controllers. We won't, however, have to do the parsing at the /rcfs level.

> 
> If there's interest in this idea from the cpusets team and if we can 
> come up with a way in which cpu/mem/io etc. could continue to share 
> common rcfs code (as they do today) CKRM could consider this option.
> 
> -- Shailabh
> 


^ permalink raw reply	[flat|nested] 233+ messages in thread

* Re: [ckrm-tech] Re: [Lse-tech] [PATCH] cpusets - big numa cpu and memory placement
  2004-08-09 15:57                 ` Hubertus Franke
@ 2004-08-10 11:31                   ` Paul Jackson
  2004-08-10 22:38                     ` Shailabh Nagar
  0 siblings, 1 reply; 233+ messages in thread
From: Paul Jackson @ 2004-08-10 11:31 UTC (permalink / raw)
  To: Hubertus Franke
  Cc: nagar, efocht, mbligh, lse-tech, akpm, hch, steiner, jbarnes,
	sylvain.jeaugey, djh, linux-kernel, colpatch, Simon.Derr, ak,
	sivanich, ckrm-tech

I've been puzzling over the relationship of cpusets and CKRM the last
few days, unable to understand how they relate, or how either could
make much use of the other.

Others have noticed they both have a hierarchy, and are both concerned
with managing resources in some sense.  Hence more than one person has
suspected opportunities for closer integration of the two projects,
indeed, hoped for such opportunities, given that neither code base
has a reputation for being small.  Though, to be fair to CKRM, they
have substantially more code invested.  Outside of the cpusets.txt file
in Documentation, the cpuset patch is under 2000 lines involving 13
files, whereas a quick count of the June 2004 e13 ckrm and related
cpu patches shows over 15,000 lines involving 62 files.

Someone has suggested that we shouldn't accept the particular names
and directory structure of cpusets into the kernel until we understand
how this interacts with CKRM, because things like this are hard to
change once put in use, and CKRM might impose or at least recommend
different names or such.

The more I look, the more convinced I become that these two projects
are separate, in means and goals, with little interaction and less
opportunity for either to leverage the other.  Neither project should
be contingent on the other.

Warning:
	No one should take anything that follows as actually
	describing CKRM.  I can find statements on the CKRM web
	pages directly contradicting what I state, and I am certain
	that I'm somewhat to substantially confused.  I'll just go
	ahead and boldly describe CKRM as I currently understand it,
	in the hopes that someone knowledgeable in the project will
	thus more easily see my errors and offer corrections.

Here is my current understanding of cpusets and CKRM, and how they
differ.

Cpusets - Static Isolation:

    The essential purpose of cpusets is to support isolating large,
    long-running, multinode compute bound HPC (high performance
    computing) applications or relatively independent service jobs,
    on dedicated sets of processor and memory nodes.
    
    The (unobtainable) ideal of cpusets is to provide perfect
    isolation, for such jobs as:

     1) Massive compute jobs that might run hours or days, on dozens
	or hundreds of processors, consuming gigabytes or terabytes
	of main memory.  These jobs are often highly parallel, and
	carefully sized and placed to obtain maximum performance
	on NUMA hardware, where memory placement and bandwidth is
	critical.

     2) Independent services for which dedicated compute resources
        have been purchased or allocated, in units of one or more
	CPUs and Memory Nodes, such as a web server and a DBMS
	sharing a large system, but staying out of each others way.

    The essential new construct of cpusets is the set of dedicated
    compute resources - some processors and memory.  These sets have
    names, permissions, an exclusion property, and can be subdivided
    into subsets.

    The cpuset file system models a hierarchy of 'virtual computers',
    which hierarchy will be deeper on larger systems.

    The average lifespan of a cpuset used for (1) above is probably
    between hours and days, based on the job lifespan, though a couple
    of system cpusets will remain in place as long as the system is
    running.  The cpusets in (2) above might have a longer lifespan;
    you'd have to ask Simon Derr of Bull about that.

CKRM - Dynamic Sharing:

    My current, probably confused, understanding is that the purpose
    of CKRM is to enable managing different Qualities of Service, or
    "Classes" (*) on streams of transactions, queries, jobs, tasks that
    are sharing the same compute resources.  Even if there is some
    big honking service process such as an enterprise DBMS running,
    the point of CKRM is not focused on optimizing the overall
    performance of that job, but rather on distinguishing between
    various transactions flowing through the system, determining the
    quality of service (Class) allowed for each, measuring critical
    resource usage for each Class, and biasing resource allocation
    decisions, such as in the scheduler and allocator, to obtain the
    desired balance of resource usage between Classes, or the desired
    response time to particular favored Classes.

    This is certainly a more challenging objective than cpusets,
    in that it requires (1) tracking resource usage (cpu cycles,
    memory pages, i/o bandwidth) by Class, (2) assigning a Class to
    transactions moving through the system, and imputing that Class to
    the tasks handling each transaction, and (3) dynamically biasing
    scheduling and allocation decisions so as to affect the desired
    Quality of Service policies.
    
    The essential new construct of CKRM is the Class - a Quality
    of Service level.  Metrics, transactions, tasks, and resource
    decisions all have to be tracked or managed by Class.

    These Classes form a fairly shallow hierarchy of usage levels or
    service qualities, as perceived by the end users of the system.

    I'd guess that the average lifetime of a Class is months or years,
    as they can reflect the relative priority of relations with long
    standing, external customers.

Cpusets and CKRM have profoundly different purposes, economics and
motivations.

For one thing, the cpuset hierarchy and the class hierarchy are two
different things.  One provides semi-static collections of compute
resources, which I sometimes call virtual computers or soft partitions.
The other reflects the differing qualities of service which you find
it worth providing the originators of transactions into your system.
These have about as much to do with each other as the "Program Files"
on my son's game machine has to do with Linus' home directory.  Yup -
they're both representable in file system trees ;).

I see no value other than obfuscation to attempting to represent
either hierarchy in terms of the other.

One of the valuable parts of my cpuset proposal is that the cpuset
file system reflects the allocation of cpu and memory nodes to
cpusets in a visible and obvious fashion, and thanks to the Linux
vfs infrastructure, provides the customary file system hierarchy and
permission model with little additional cpuset code.  Cpusets have
user (administrator) provided pathnames, in a file system hierarchy,
with the usual and expected vfs support.  And the filenames (mems,
cpus, tasks, ...)  within each cpuset directory have a relevance that
should be preserved.  I don't see any value that the CKRM hierarchy
mechanisms, naming or semantics bring to that.

For another way to put the difference, CKRM is managing "commodity"
resources, such as cycles and bits.  One cycle is as good as the
next; it's just a question of who gets how many.  On the other hand,
cpusets manage precious named resources - such as an entire block
of 64 CPUs and associated memory on a 256 CPU system.  Each such
cpuset is a unique, named, first class, relatively long lasting
entity represented by its own directory in the cpuset file system,
and assigned a specific well known job to execute.

So what interaction or relationship if any do I see between cpusets
and CKRM?  Only one at the moment.  A major job running within a
long lasting cpuset might well want to make use of CKRM in order to
provide refined Qualities of Service to its clients.  This means that
the CKRM instance would need to understand that it's not managing
the entire physical system, but just some cpuset-defined subset.

A few days ago, one of the CKRM gurus encouraged me to look forward
to providing a CKRM controller for cpusets.  At the time, I nodded
knowingly at my screen, as if that all made sense.

Now, I've no clue what such a controller would be or do, or why anyone
would want one.

I look forward to having my likely serious confusions over CKRM
corrected.  Meanwhile, I remain convinced that cpusets and CKRM are
separate and distinct projects, and that neither should wait for
the other.

I continue to recommend that cpusets be accepted into the 2.6.9 mm
patches, and if that goes well, into Linus' tree.

Thank-you for reading.

    (*) The above description of a Class as a Quality of Service
        does _not_ match the phrase on http://ckrm.sourcefourge.net:
	    "A class is a group of Linux tasks (processes), ..."
	I'm speculating that this phrase is misleading.  More
	likely, it's just that I'm confused ;).


-- 
                          I won't rest till it's the best ...
                          Programmer, Linux Scalability
                          Paul Jackson <pj@sgi.com> 1.650.933.1373

^ permalink raw reply	[flat|nested] 233+ messages in thread

* Re: [ckrm-tech] Re: [Lse-tech] [PATCH] cpusets - big numa cpu and memory placement
  2004-08-10 11:31                   ` [ckrm-tech] " Paul Jackson
@ 2004-08-10 22:38                     ` Shailabh Nagar
  2004-08-11 10:42                       ` Erich Focht
  2004-08-14  8:51                       ` Paul Jackson
  0 siblings, 2 replies; 233+ messages in thread
From: Shailabh Nagar @ 2004-08-10 22:38 UTC (permalink / raw)
  To: Paul Jackson
  Cc: Hubertus Franke, efocht, mbligh, lse-tech, akpm, hch, steiner,
	jbarnes, sylvain.jeaugey, djh, linux-kernel, colpatch,
	Simon.Derr, ak, sivanich, ckrm-tech

Paul Jackson wrote:

> 
> The more I look, the more convinced I become that these two projects
> are separate, in means and goals, with little interaction and less
> opportunity for either to leverage the other.  Neither project should
> be contingent on the other.

> 
> Warning:
> 	No one should take anything that follows as actually
> 	describing CKRM.  I can find statements on the CKRM web
> 	pages directly contradicting what I state, and I am certain
> 	that I'm somewhat to substantially confused.  I'll just go
> 	ahead and boldly describe CKRM as I currently understand it,
> 	in the hopes that someone knowledgeable in the project will
> 	thus more easily see my errors and offer corrections.

> 
> Here is my current understanding of cpusets and CKRM, and how they
> differ.
> 
> Cpusets - Static Isolation:
> 
>     The essential purpose of cpusets is to support isolating large,
>     long-running, multinode compute bound HPC (high performance
>     computing) applications or relatively independent service jobs,
>     on dedicated sets of processor and memory nodes.

CKRM's overall objective is to isolate the performance of a group of 
kernel objects from other groups. The grouping can be static 
(applications, users, etc.) or dynamic (processes of the same app can 
change membership from one group to another).

The group of objects is what we call a class.

The apparent dichotomy between what you describe and what we manage is 
resolved when you consider that all applications/users etc. finally boil 
down to some set of tasks making resource demands of cpu, mem, io etc.

Basically we have a flexible way of defining a group of tasks - what 
that group maps to in user space doesn't matter inside the kernel when
resource allocations are being done.

>     
>     The (unobtainable) ideal of cpusets is to provide perfect
>     isolation, for such jobs as:
> 
>      1) Massive compute jobs that might run hours or days, on dozens
> 	or hundreds of processors, consuming gigabytes or terabytes
> 	of main memory.  These jobs are often highly parallel, and
> 	carefully sized and placed to obtain maximum performance
> 	on NUMA hardware, where memory placement and bandwidth is
> 	critical.
> 
>      2) Independent services for which dedicated compute resources
>         have been purchased or allocated, in units of one or more
> 	CPUs and Memory Nodes, such as a web server and a DBMS
> 	sharing a large system, but staying out of each others way.
> 
>     The essential new construct of cpusets is the set of dedicated
>     compute resources - some processors and memory.  These sets have
>     names, permissions, an exclusion property, and can be subdivided
>     into subsets.

The only difference between CKRM and cpusets in the paragraphs above is 
that cpusets tries to achieve the isolation by a static partitioning of 
physical cpus and mem nodes. CKRM does so in terms of cpu time and 
memory pages.

> 
>     The cpuset file system models a hierarchy of 'virtual computers',
>     which hierarchy will be deeper on larger systems.
> 
>     The average lifespan of a cpuset used for (1) above is probably
>     between hours and days, based on the job lifespan, though a couple
>     of system cpusets will remain in place as long as the system is
>     running.  The cpusets in (2) above might have a longer lifespan;
>     you'd have to ask Simon Derr of Bull about that.

CKRM class lifespans depend on how the classes are defined by the 
sysadmin or delegated users. Classes representing users will last as 
long as the system is up, those representing a particular application 
will last as long as the app (typically - CKRM doesn't autodelete 
classes - the user who created it needs to do it himself).

> 
> CKRM - Dynamic Sharing:
> 
>     My current, probably confused, understanding is that the purpose
>     of CKRM is to enable managing different Qualities of Service, or
>     "Classes" (*) on streams of transactions, queries, jobs, tasks that
>     are sharing the same compute resources. 

It would be easier to think of classes as a grouping of tasks and 
sockets which, as an aggregate, have some share of each resource managed 
by CKRM. A class is not characterized by the QoS level, but the objects 
it groups. In particular, two classes can have the same QoS level (e.g. 
20% of total cpu time) and the same class can have its QoS level changed 
(from 20% to say 40%).

> Even if there is some
>     big honking service process such as an enterprise DBMS running,
>     the point of CKRM is not focused on optimizing the overall
>     performance of that job, but rather on distinguishing between
>     various transactions flowing through the system, determining the
>     quality of service (Class) allowed for each, measuring critical
>     resource usage for each Class, and biasing resource allocation
>     decisions, such as in the scheduler and allocator, to obtain the
>     desired balance of resource usage between Classes, or the desired
>     response time to particular favored Classes.

Managing the QoS of transactions (which tend to cross task/application 
boundaries) is a complicated use of CKRM which tries to exploit its 
support for flexible and dynamic grouping. Doing this requires some 
degree of application cooperation (it is only the app which can tell 
what transaction it is processing).

However transaction QoS management is not what CKRM, the kernel project, 
is doing. Its most commonly expected usage is to isolate the performance 
of one application from another or one user from another. Doing this is 
far easier than transactions since apps and users map to tasks/sockets
in easily understood ways that do not require any cooperation from the 
app/user (indeed we don't want any "cooperation" or "interference" from 
them !)

> 
>     This is certainly a more challenging objective than cpusets,
>     in that it requires (1) tracking resource usage (cpu cycles,
>     memory pages, i/o bandwidth) by Class, (2) assigning a Class to
>     transactions moving through the system, and imputing that Class to
>     the tasks handling each transaction, and (3) dynamically biasing
>     scheduling and allocation decisions so as to affect the desired
>     Quality of Service policies.

Correct, CKRM does have more work to do than cpusets does, since it 
controls more fine-grained resources than cpusets (cpu time vs cpus, mem 
pages vs. nodes).

However, it does get a lot of help from the system and does not have to 
carry the burden of 1) and 3) all by itself. 1) only requires existing 
resource usage data (cpu time consumed by a process) to be aggregated, 
additionally, into class statistics. 3) too can be done as an increment 
over existing schedulers, not a replacement. In case of the CPU, it 
means picking the next class to run and then choosing the next task to 
run. In mem, it means preferentially picking pages from an "over share" 
class to swap out etc.

>     
>     The essential new construct of CKRM is the Class - a Quality
>     of Service level.

As said above, this is not the right way to think of a class. Think 
groupings ! The Quality of Service level is an attribute of a class,
not its defining characteristic.

>  Metrics, transactions, tasks, and resource
>     decisions all have to be tracked or managed by Class.
> 
>     These Classes form a fairly shallow hierarchy of usage levels or
>     service qualities, as perceived by the end users of the system.
> 
>     I'd guess that the average lifetime of a Class is months or years,
>     as they can reflect the relative priority of relations with long
>     standing, external customers.
> 
> Cpusets and CKRM have profoundly different purposes, economics and
> motivations.

I would say the methods differ, not the purpose. Both are trying to 
performance-isolate groups of tasks - one uses the spatial dimension of 
cpu bindings, the other uses  the temporal dimension of cpu time.

> 
> For one thing, the cpuset hierarchy and the class hierarchy are two
> different things.  One provides semi-static collections of compute
> resources, which I sometimes call virtual computers or soft partitions.
> The other reflects the differing qualities of service which you find
> it worth providing the originators of transactions into your system.
> These have about as much to do with each other as the "Program Files"
> on my son's game machine has to do with Linus' home directory.  Yup -
> they're both representable in file system trees ;).

Again, I would disagree. The filesystem hierarchies of cpusets and CKRM 
have quite a few things in common.
- directories representing the grouping of tasks
- hierarchical subdivision aka a child can only subdivide what its 
parent has. In CKRM, only the % share that a parent gets from the system 
is further divisible amongst child classes. In cpusets, that resource 
happens to be the set of cpus_allowed.
- delegation of control through file permissions : both allow non-root 
users to control their resource allocations.
- binding of tasks to a group by writing pids to a special virtual file


> 
> I see no value other than obfuscation to attempting to represent
> either hierarchy in terms of the other.

Notwithstanding the similarities between the hierarchies listed above, 
this danger of obfuscation is a possibility. The reason for that is that 
our interface within the filesystem, as defined by the virtual files and 
the attributes within them that one can read and write, do not map 
cleanly onto the ones exported by cpusets.

e.g. the notions of stats, lower and upper bounds for shares that CKRM 
needs, are not relevant to cpusets. On the other hand, we do allow 
attributes that are controller-specific to be represented within some 
virtual files and cpusets could use that.

The other point of difference is the one you'd brought up earlier - the 
restrictions on the hierarchy creation. CKRM has none (effectively), 
cpusets has many.

As CKRM's interface stands today, there are sufficient differences 
between the interfaces to keep them separate.

However, if CKRM moves to a model where
- each controller is allowed to define its own virtual files and attributes
- each controller has its own hierarchy (and hence more control over 
how it can be formed),
then the similarities will be too many to ignore merger possibilities
altogether.

The kicker is, we've not decided. The splitting of controllers into 
their own hierarchy is something we're considering independently (as a 
consequence of Linus' suggestion at KS04). But making the interface 
completely per-controller is something we can do, without too much 
effort, IF there is sufficient reason (we have other reasons for doing 
that as well - see recent postings on ckrm-tech).

Interest/recommendations from the community that cpusets  be part of 
CKRM's hierarchy would certainly be a factor in that decision.



> 
> For another way to put the difference, CKRM is managing "commodity"
> resources, such as cycles and bits.  One cycle is as good as the
> next; it's just a question of who gets how many.  On the other hand,
> cpusets manage precious named resources - such as an entire block
> of 64 CPUs and associated memory on a 256 CPU system.  

> Each such
> cpuset is a unique, named, first class, relatively long lasting
> entity represented by its own directory in the cpuset file system,
> and assigned a specific well known job to execute.

s/cpuset/class and s/cpuset file system/rcfs and this pretty much
describes CKRM.

> 
> So what interaction or relationship if any do I see between cpusets
> and CKRM?  Only one at the moment.  A major job running within a
> long lasting cpuset might well want to make use of CKRM in order to
> provide refined Qualities of Service to its clients.  This means that
> the CKRM instance would need to understand that it's not managing
> the entire physical system, but just some cpuset-defined subset.

This brings up a very important point. If CKRM's cpu controller is 
managing cpu time and cpusets are also operational, it might be hard for 
one or the other to achieve their objectives since they're both trying 
to constrain CPU usage along different dimensions.

But in a sense, CKRM already faces this problem since cpu, mem and io 
are not completely independent resources. We're pretty much relying on 
the sysadmin/user not to set wildly conflicting sets of shares for these 
resources and can have the same expectation from someone trying to use 
both CKRM cpu controller and cpusets at the same time.

> 
> A few days ago, one of the CKRM gurus encouraged me to look forward
> to providing a CKRM controller for cpusets.  At the time, I nodded
> knowingly at my screen, as if that all made sense.
> 
> Now, I've no clue what such a controller would be or do, or why anyone
> would want one.

Such a controller would be a different packaging of the cpusets patch 
with most of its internals remaining the same but using the CKRM 
interfaces, as Erich had pointed out.


> I look forward to having my likely serious confusions over CKRM
> corrected.  Meanwhile, I remain convinced that cpusets and CKRM are
> separate and distinct projects, and that neither should wait for
> the other.

On the non-technical front, this is desirable. Tying two projects 
together always runs the risk that one drags the other down. CKRM also 
faces this dilemma while considering a switch from using relayfs to 
netlink as the kernel-user communication channel. We think relayfs suits 
our needs better but given the problems the project has, can't afford to 
tie ourselves down to it.

Broadly, CKRM is not just a collection of controllers which operate on 
arbitrary groups of kernel objects, but also a framework for such 
controllers. In its latter role, it has a place for cpusets.

However, cpusets has little need for CKRM except for the commonalities 
in the interfaces and that too, if and when CKRM adopts the changes 
needed by cpusets.

So the bottom line, IMHO, is the interface - should there be one or two ? 
One can argue either way. There are already so many filesystems, what's 
one more ? CKRM doesn't encompass other "grouping" resource controllers 
such as outbound network (yet!) so why try to shoehorn cpusets into it ?

On the other hand, the user may appreciate one-stop-shops for similar 
kinds of resource management and would probably benefit from an 
integration of interfaces. And there is a merit to the argument that 
interfaces, once adopted in the mainline, will be hard to change.

Rusty's keynote at OLS2003 advised "work on the interfaces last". 
Evidently that advice isn't operative here ! Future incompatibility of 
interfaces is becoming a blocking factor for acceptance/testing/usage of 
the core functionality.

One suggestion is to go ahead with the  -mm acceptance of cpusets so its 
functionality has a chance to get feedback and address the CKRM 
interface integration a couple of months from now once CKRM's interface 
issues get resolved ? But do let us know if there is interest in merging 
(after this round of clarificatory emails is over) as it will affect 
which way we go.


-- Shailabh

> 
> I continue to recommend that cpusets be accepted into the 2.6.9 mm
> patches, and if that goes well, into Linus' tree.
> 
> Thank-you for reading.
> 
>     (*) The above description of a Class as a Quality of Service
>         does _not_ match the phrase on http://ckrm.sourcefourge.net:
> 	    "A class is a group of Linux tasks (processes), ..."
> 	I'm speculating that this phrase is misleading.  More
> 	likely, it's just that I'm confused ;).
> 
> 


^ permalink raw reply	[flat|nested] 233+ messages in thread

* Re: [Lse-tech] [PATCH] new bitmap list format (for cpusets)
  2004-08-09 14:49     ` Martin J. Bligh
@ 2004-08-10 23:43       ` Paul Jackson
  0 siblings, 0 replies; 233+ messages in thread
From: Paul Jackson @ 2004-08-10 23:43 UTC (permalink / raw)
  To: Martin J. Bligh
  Cc: akpm, hch, steiner, jbarnes, sylvain.jeaugey, djh, linux-kernel,
	colpatch, Simon.Derr, ak, lse-tech, sivanich

Martin wrote:
> OK, that [removal of prefix char fluff^Wstuff] looks a lot more palatable ;-)

Good.


> it looks like you're only parsing on the read-side to me

I couldn't parse this comment, but hopefully I'll answer this below ...


> I can't see any [bitmap_scnlistprintf] callers,

The first caller of the bitmap list format stuff is the next patch - the
cpuset patch.

The bitmap list printing call stack for cpumasks is:

	kernel/cpuset.c:		cpuset_common_file_read()
	kernel/cpuset.c:		cpuset_sprintf_cpulist()
	include/linux/cpumask.h:	__cpulist_scnprintf()
	lib/bitmap.c:			bitmap_scnlistprintf()

Similarly for nodemasks.

That's why the very first two lines of the bitmap list patch comment
state:

	A bitmap print and parse format that provides lists of ranges of
	numbers, to be first used for by cpusets (next patch).

;).

> do we really need both cpumask_parse and __cpumask_parse in front of
> bitmap_parse?

Yes.  Look through cpumask.h and nodemask.h.  You will see this use of
a #define macro wrapping a static inline function repeatedly.  The #define
macro is needed to implement the implied call-by-reference convention,
and the static inline is needed to add some type checking.

Note that while the C syntax for using these cpu and node mask operators
_looks_ like it is passing the mask by value at the point marked "<==",
which would copy a possibly multiple word mask on the stack, in the
code:

	static int cpuset_sprintf_cpulist(char *page, struct cpuset *cs)
	{
        	cpumask_t mask;
		...
        	return cpulist_scnprintf(page, PAGE_SIZE, mask);	/* <== */
	}

this code is _really_ passing a pointer (unsigned long *) on the stack
to the actual implementing code, in lib/bitmap.c:bitmap_scnlistprintf().

The key ingredient that the #define macro adds is the '&' char, turning
normal C call-by-value conventions into an implied call-by-reference.
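
For reference, here is the wrapper pair this patch adds to cpumask.h
(copied from the cpumask.h hunk in the bitmap list patch), showing where
that '&' is applied:

	#define cpulist_scnprintf(buf, len, src) \
				__cpulist_scnprintf((buf), (len), &(src), NR_CPUS)
	static inline int __cpulist_scnprintf(char *buf, int len,
						const cpumask_t *srcp, int nbits)
	{
		return bitmap_scnlistprintf(buf, len, srcp->bits, nbits);
	}

The macro takes the address of 'src', so the static inline, and
bitmap_scnlistprintf() beneath it, only ever see a cpumask_t pointer.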

This somewhat un-C-like calling convention (more like Pascal call by
reference) existed for cpumasks before I came on the scene, with my
various cleanups of the bitmap and cpumask code over the last 10 months.
I found the convention pleasing enough, if odd, so I preserved it, for
most of the cpumask calls, and now (thanks to Matthew Dobson) the
nodemask calls as well.

In summary, the #define macros are needed in cpumask.h and nodemask.h to
alter the usual C call-by-value conventions, and the static inline
functions are needed to provide some static type checking of the arguments.

... at least I hope that was your question.

-- 
                          I won't rest till it's the best ...
                          Programmer, Linux Scalability
                          Paul Jackson <pj@sgi.com> 1.650.933.1373

^ permalink raw reply	[flat|nested] 233+ messages in thread

* Re: [Lse-tech] [PATCH] cpusets - big numa cpu and memory placement
  2004-08-08 14:50               ` Martin J. Bligh
@ 2004-08-11  0:43                 ` Paul Jackson
  2004-08-11  9:40                 ` Erich Focht
  1 sibling, 0 replies; 233+ messages in thread
From: Paul Jackson @ 2004-08-11  0:43 UTC (permalink / raw)
  To: Martin J. Bligh
  Cc: efocht, pj, lse-tech, akpm, hch, steiner, jbarnes,
	sylvain.jeaugey, djh, linux-kernel, colpatch, Simon.Derr, ak,
	sivanich, ckrm-tech

[ Adding ckrm-tech to the cc list, as per Hubertus' request
  yesterday to include ckrm-tech on replies to this cpuset
  thread on lkml when the reply raises CKRM issues. - pj]

Martin wrote:
> both [CKRM and cpusets] are mechanisms for controlling the amount
> of CPU time and memory that processes get to use.  They have fundamentally
> the same objective ... having 2 mechanisms to do the same thing with
> different interfaces doesn't seem like a good plan.

No.

See further my long reply on this thread about 12 hours ago.

Cpusets and CKRM are profoundly different, in purpose, approach
and applicability.

  * The purpose of CKRM is to better manage sharing resources.
    The purpose of cpusets is to isolate resources.

  * The approach of CKRM is to classify, measure and meter the
    use of shared cycles, bits and bandwidth.  The approach of
    cpusets is to setup isolation areas so as to avoid sharing.

  * Their respective areas of useful application have no overlap
    whatsoever that I have yet found.


My understanding (such as it is) of CKRM agrees with what you suggest,
that it measures and meters the use of such shared commodity resources
as cycles, bits and bandwidth.  I understand that it does this in order
to provide for explicitly managed Quality of Service levels or
"classes", to various distinguished system uses or users.

The essential purpose of CKRM is to manage the ** sharing ** of such
resources in a more controlled fashion on a shared resource system.

Cpusets defines compute (cpu and memory) subsets of large SMP and NUMA
systems.  These subsets are first-class, named objects with vfs-style
access control.

The essential purpose of cpusets is to provide ** isolated ** compute
resources for dedicated jobs.  The existing sched_setaffinity (for CPUs)
and mbind/set_mempolicy (for Memory) calls provide some of the
mechanisms needed.  The cpuset patch completes the kernel support
required for this.
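
(As a minimal userspace sketch of the first of those mechanisms; the
function name and cpu choice here are made up purely for illustration,
and are not part of any patch:)

	#define _GNU_SOURCE
	#include <sched.h>

	/* Pin the calling task to CPUs 0-3.  Cpusets turns this kind of
	 * placement into named, hierarchical, permission-checked sets
	 * instead of per-task masks. */
	static int pin_to_first_four_cpus(void)
	{
		cpu_set_t set;
		int cpu;

		CPU_ZERO(&set);
		for (cpu = 0; cpu < 4; cpu++)
			CPU_SET(cpu, &set);
		return sched_setaffinity(0, sizeof(set), &set);
	}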

One could make good use of CKRM on a uni-processor system, to better
manage the prioritization of transactions flowing through a complex
service application.  Cpusets are utterly useless on uni-processor
systems.

On the other hand, one could imagine (_easily_ so, if you had my
customer base ;) running a couple of big computational jobs, each on a
dedicated cpuset of dozens or hundreds of CPUs and Nodes, where CKRM
would provide no value (less than zero value - a waste of critical
cycles ;).

Please do not confuse CKRM with cpusets.

They are polar opposite approaches to some of the problems of shared
resource systems - one refines the sharing, the other avoids it.

By now, I trust you know which is which.

Thank-you.

-- 
                          I won't rest till it's the best ...
                          Programmer, Linux Scalability
                          Paul Jackson <pj@sgi.com> 1.650.933.1373

^ permalink raw reply	[flat|nested] 233+ messages in thread

* Re: [Lse-tech] [PATCH] cpusets - big numa cpu and memory placement
  2004-08-08 14:50               ` Martin J. Bligh
  2004-08-11  0:43                 ` Paul Jackson
@ 2004-08-11  9:40                 ` Erich Focht
  2004-08-11 14:49                   ` Martin J. Bligh
  2004-08-11 15:12                   ` Shailabh Nagar
  1 sibling, 2 replies; 233+ messages in thread
From: Erich Focht @ 2004-08-11  9:40 UTC (permalink / raw)
  To: Martin J. Bligh
  Cc: Paul Jackson, lse-tech, akpm, hch, steiner, jbarnes,
	sylvain.jeaugey, djh, linux-kernel, colpatch, Simon.Derr, ak,
	sivanich

On Sunday 08 August 2004 16:50, Martin J. Bligh wrote:
> > If I understand correctly, CKRM is fine for simple resources like
> > amount of memory or cputime and designed to control flexible sharing
> > of these resources and ensure some degree of fairness. Cpusets is a
> > complex NUMA specific compound resource which actually only allows for
> > a rather static distribution across processes (especially with the
> > exclusive bits set). Including cpusets control into CKRM will be
> > trivial, because you already provide all that's needed.
> 
> I'd disagree with this - both are mechanisms for controlling the amount
> of CPU time and memory that processes get to use. They have fundamentally
> the same objective ... having 2 mechanisms to do the same thing with
> different interfaces doesn't seem like a good plan.

My turn to disagree ;-) CKRM's CPU and memory controllers are not
NUMA-specific; they are useful on non-NUMA machines as well. Their
aim is to share cpu cycles and memory pages among processes in a fair
way. The amount of cycles and memory pages you get is flexible. If
no one else is on the machine, you get the full machine. If someone
else comes with another job, your stuff gets pushed away. Cpusets
guarantee that you get exclusive use of exactly the piece of machine
which you want. This way your run times will be reproducible and other
users just won't disturb you. With the current CKRM cpu/mem
controllers you can say: this set of processes should get 25% of the
cycles and memory. This is a soft limit (can be violated) and doesn't
imply where the CPUs are and which memory blocks (cells/nodes) in the
machine you use. It's of no use for a customer who wants reproducible
compute times (and I don't mean minimal, or guaranteed. I mean same
time for each run, within minimal error margins) and no interference
between users. I'm sure many might question these objectives. I assure
you that they are taken from real life and are very important.

As Paul explained in a previous email: the scope of cpusets is
orthogonal to that of the current CKRM CPU/mem controllers. I see
benefit in combining the two: within one cpuset one can run several
processes and protect them from starving.

The implementation of CKRM cpu/mem and cpusets is as different as
their scope. I doubt CKRM can be just easily extended to replicate
cpusets functionality. Just adding cpus_allowed will not be enough. In
the end CKRM will need to rebuild all code in the cpusets patch.

> I don't think CKRM is anything like as far away from being ready as
> you seem to be implying - we're talking about a month or two, I
> think.

Shailabh's email shows that we're talking about several months. He also
agreed with pushing cpusets towards the -mm tree.

Best regards,
Erich


^ permalink raw reply	[flat|nested] 233+ messages in thread

* Re: [ckrm-tech] Re: [Lse-tech] [PATCH] cpusets - big numa cpu and memory placement
  2004-08-10 22:38                     ` Shailabh Nagar
@ 2004-08-11 10:42                       ` Erich Focht
  2004-08-11 14:56                         ` Shailabh Nagar
  2004-08-14  8:51                       ` Paul Jackson
  1 sibling, 1 reply; 233+ messages in thread
From: Erich Focht @ 2004-08-11 10:42 UTC (permalink / raw)
  To: Shailabh Nagar
  Cc: Paul Jackson, Hubertus Franke, mbligh, lse-tech, akpm, hch,
	steiner, jbarnes, sylvain.jeaugey, djh, linux-kernel, colpatch,
	Simon.Derr, ak, sivanich, ckrm-tech

On Wednesday 11 August 2004 00:38, Shailabh Nagar wrote:
> >  Metrics, transactions, tasks, and resource
> >     decisions all have to be tracked or managed by Class.
> > 
> >     These Classes form a fairly shallow hierarchy of usage levels or
> >     service qualities, as perceived by the end users of the system.
> > 
> >     I'd guess that the average lifetime of a Class is months or years,
> >     as they can reflect the relative priority of relations with long
> >     standing, external customers.
> > 
> > Cpusets and CKRM have profoundly different purposes, economics and
> > motivations.
> 
> I would say the methods differ, not the purpose. Both are trying to 
> performance-isolate groups of tasks - one uses the spatial dimension of 
> cpu bindings, the other uses  the temporal dimension of cpu time.

So the purpose is different, too. With your words: spatial versus
temporal separation. They are orthogonal. In physics terms: you need
both to describe the universe and you cannot transform the one into
the other. Both make sense, they can be combined to give more benefit
(ahem, control).


> The other point of difference is the one you'd brought up earlier - the 
> restrictions on the hierarchy creation. CKRM has none (effectively), 
> cpusets has many.

I don't know exactly how it's implemented, but the restrictions should
not be at hierarchy creation time (i.e. when creating the class
(cpusets) subdirectory). They should be imposed when setting/changing
the attributes. Writing illegal values to the virtual attribute files
must simply fail. And each resource controller knows best what it
allows for and what not; this shouldn't be a task of the
infrastructure (CKRM).


> As CKRM's interface stands today, there are sufficient differences 
> between the interfaces to keep them separate.
> 
> However, if CKRM moves to a model where
> - each controller is allowed to define its own virtual files and attributes
> - each controllers has its own hierarchy (and hence more control over 
> how it can be formed),
> then the similarities will be too many to ignore merger possibilities
> altogether.
> 
> The kicker is, we've not decided. The splitting of controllers into 
> their own hierarchy is something we're considering independently (as a 
> consequence of Linus' suggestion at KS04). But making the interface 
> completely per-controller is something we can do, without too much 
> effort, IF there is sufficient reason (we have other reasons for doing 
> that as well - see recent postings on ckrm-tech).

Having controller specifics less hidden is good because usage becomes
more intuitive and you don't have to RTFM (controller specific manuals
would have to be written, too). One file per attribute is also nicer
than several attributes hidden in a shares file. Adding an attribute
means adding a file; it doesn't break the old interface, so this is
easier to maintain. And, as you mentioned, some files in the current
CKRM interface just don't make sense for some resources. But a sane
ruleset provided by CKRM for external controllers should be
there. For example something like:
   - Class members are added by writing to the virtual file "target".
   - Class members are listed by reading the virtual file "target" and
     the format is ...
   - Each class attribute should be controlled by one file named
     appropriately. Etc...
   - Members of a class can register a callback which will be invoked
     when the following events occur:
        - the class is destroyed
	- ... ?
   - etc ...

> Interest/recommendations from the community that cpusets  be part of 
> CKRM's hierarchy would certainly be a factor in that decision.

I'd prefer a single entry point for resource management with
consistent (not necessarilly same) and easy to use user interfaces for
all resources.

Regards,
Erich


^ permalink raw reply	[flat|nested] 233+ messages in thread

* Re: [Lse-tech] [PATCH] new bitmap list format (for cpusets)
  2004-08-05 10:08 [PATCH] new bitmap list format (for cpusets) Paul Jackson
  2004-08-05 10:10 ` [PATCH] cpusets - big numa cpu and memory placement Paul Jackson
  2004-08-05 20:47 ` [Lse-tech] [PATCH] new bitmap list format (for cpusets) Martin J. Bligh
@ 2004-08-11 13:11 ` Dinakar Guniguntala
  2004-08-11 16:17   ` Paul Jackson
  2 siblings, 1 reply; 233+ messages in thread
From: Dinakar Guniguntala @ 2004-08-11 13:11 UTC (permalink / raw)
  To: Paul Jackson; +Cc: linux-kernel, lse-tech

[-- Attachment #1: Type: text/plain, Size: 341 bytes --]

Paul,

Considering that cpu_possible_map does not get fully initialized
until smp_prepare_cpus gets called by init(), I thought it right
to move cpuset_init() to after smp initialization. I tested it on
2.6.8-rc2-mm2 and it seemed to work ok.

Patch attached below

Regards,

Dinakar 

Signed-off-by: Dinakar Guniguntala <dino@in.ibm.com>



[-- Attachment #2: cpuset-init.patch --]
[-- Type: text/plain, Size: 5597 bytes --]

diff -Naurp linux-2.6.8-rc2-mm2-cs3/include/linux/cpuset.h linux-2.6.8-rc2-mm2-cs3.new/include/linux/cpuset.h
--- linux-2.6.8-rc2-mm2-cs3/include/linux/cpuset.h	2004-08-05 17:22:31.000000000 +0530
+++ linux-2.6.8-rc2-mm2-cs3.new/include/linux/cpuset.h	2004-08-10 22:58:23.000000000 +0530
@@ -14,6 +14,43 @@
 
 #ifdef CONFIG_CPUSETS
 
+struct cpuset {
+	unsigned long flags;		/* "unsigned long" so bitops work */
+	cpumask_t cpus_allowed;		/* CPUs allowed to tasks in cpuset */
+	nodemask_t mems_allowed;	/* Memory Nodes allowed to tasks */
+
+	atomic_t count;			/* count tasks using this cpuset */
+
+	/*
+	 * We link our 'sibling' struct into our parents 'children'.
+	 * Our children link their 'sibling' into our 'children'.
+	 */
+	struct list_head sibling;	/* my parents children */
+	struct list_head children;	/* my children */
+
+	struct cpuset *parent;		/* my parent */
+	struct dentry *dentry;		/* cpuset fs entry */
+};
+
+/* bits in struct cpuset flags field */
+typedef enum {
+	CS_CPU_EXCLUSIVE,
+	CS_MEM_EXCLUSIVE,
+	CS_REMOVED,
+	CS_NOTIFY_ON_RELEASE
+} cpuset_flagbits_t;
+
+static struct cpuset top_cpuset = {
+	.flags = ((1 << CS_CPU_EXCLUSIVE) | (1 << CS_MEM_EXCLUSIVE)),
+	.cpus_allowed = CPU_MASK_ALL,
+	.mems_allowed = NODE_MASK_ALL,
+	.count = ATOMIC_INIT(0),
+	.sibling = LIST_HEAD_INIT(top_cpuset.sibling),
+	.children = LIST_HEAD_INIT(top_cpuset.children),
+	.parent = NULL,
+	.dentry = NULL,
+};
+
 extern int cpuset_init(void);
 extern void cpuset_fork(struct task_struct *p);
 extern void cpuset_exit(struct task_struct *p);
@@ -26,8 +63,14 @@ int cpuset_zonelist_valid_mems_allowed(s
 int cpuset_zone_allowed(struct zone *z);
 extern struct file_operations proc_cpuset_operations;
 
+#define INIT_TASK_CPUSET(tsk)	\
+	.cpuset = &top_cpuset,		\
+	.mems_allowed = NODE_MASK_ALL,
+
 #else /* !CONFIG_CPUSETS */
 
+#define INIT_TASK_CPUSET(tsk)
+
 static inline int cpuset_init(void) { return 0; }
 static inline void cpuset_fork(struct task_struct *p) {}
 static inline void cpuset_exit(struct task_struct *p) {}
diff -Naurp linux-2.6.8-rc2-mm2-cs3/include/linux/init_task.h linux-2.6.8-rc2-mm2-cs3.new/include/linux/init_task.h
--- linux-2.6.8-rc2-mm2-cs3/include/linux/init_task.h	2004-08-05 17:20:52.000000000 +0530
+++ linux-2.6.8-rc2-mm2-cs3.new/include/linux/init_task.h	2004-08-10 22:47:14.000000000 +0530
@@ -3,6 +3,7 @@
 
 #include <linux/file.h>
 #include <linux/pagg.h>
+#include <linux/cpuset.h>
 
 #define INIT_FILES \
 { 							\
@@ -114,6 +115,7 @@ extern struct group_info init_groups;
 	.switch_lock	= SPIN_LOCK_UNLOCKED,				\
 	.journal_info	= NULL,						\
 	INIT_TASK_PAGG(tsk)						\
+	INIT_TASK_CPUSET(tsk)						\
 }
 
 
diff -Naurp linux-2.6.8-rc2-mm2-cs3/include/linux/pagg.h linux-2.6.8-rc2-mm2-cs3.new/include/linux/pagg.h
--- linux-2.6.8-rc2-mm2-cs3/include/linux/pagg.h	2004-08-05 17:20:52.000000000 +0530
+++ linux-2.6.8-rc2-mm2-cs3.new/include/linux/pagg.h	2004-08-10 20:12:45.000000000 +0530
@@ -183,7 +183,7 @@ static inline void pagg_exec(struct task
  */
 #define INIT_TASK_PAGG(tsk) \
 	.pagg_list = LIST_HEAD_INIT(tsk.pagg_list),     \
-	.pagg_sem  = __RWSEM_INITIALIZER(tsk.pagg_sem)
+	.pagg_sem  = __RWSEM_INITIALIZER(tsk.pagg_sem),
 
 #else  /* CONFIG_PAGG */
 
diff -Naurp linux-2.6.8-rc2-mm2-cs3/init/main.c linux-2.6.8-rc2-mm2-cs3.new/init/main.c
--- linux-2.6.8-rc2-mm2-cs3/init/main.c	2004-08-05 17:22:31.000000000 +0530
+++ linux-2.6.8-rc2-mm2-cs3.new/init/main.c	2004-08-11 21:50:24.970179272 +0530
@@ -569,8 +569,6 @@ asmlinkage void __init start_kernel(void
 #ifdef CONFIG_PROC_FS
 	proc_root_init();
 #endif
-	cpuset_init();
-
 	check_bugs();
 
 	/* Do the rest non-__init'ed, we're now alive */
@@ -708,6 +706,8 @@ static int init(void * unused)
 	smp_init();
 	sched_init_smp();
 
+	cpuset_init();
+	
 	/*
 	 * Do this before initcalls, because some drivers want to access
 	 * firmware files.
diff -Naurp linux-2.6.8-rc2-mm2-cs3/kernel/cpuset.c linux-2.6.8-rc2-mm2-cs3.new/kernel/cpuset.c
--- linux-2.6.8-rc2-mm2-cs3/kernel/cpuset.c	2004-08-11 22:02:47.077361832 +0530
+++ linux-2.6.8-rc2-mm2-cs3.new/kernel/cpuset.c	2004-08-11 22:01:07.416512584 +0530
@@ -54,32 +54,6 @@
 
 #define CPUSET_SUPER_MAGIC 		0x27e0eb
 
-struct cpuset {
-	unsigned long flags;		/* "unsigned long" so bitops work */
-	cpumask_t cpus_allowed;		/* CPUs allowed to tasks in cpuset */
-	nodemask_t mems_allowed;	/* Memory Nodes allowed to tasks */
-
-	atomic_t count;			/* count tasks using this cpuset */
-
-	/*
-	 * We link our 'sibling' struct into our parents 'children'.
-	 * Our children link their 'sibling' into our 'children'.
-	 */
-	struct list_head sibling;	/* my parents children */
-	struct list_head children;	/* my children */
-
-	struct cpuset *parent;		/* my parent */
-	struct dentry *dentry;		/* cpuset fs entry */
-};
-
-/* bits in struct cpuset flags field */
-typedef enum {
-	CS_CPU_EXCLUSIVE,
-	CS_MEM_EXCLUSIVE,
-	CS_REMOVED,
-	CS_NOTIFY_ON_RELEASE
-} cpuset_flagbits_t;
-
 /* convenient tests for these bits */
 static inline int is_cpu_exclusive(const struct cpuset *cs)
 {
@@ -101,17 +75,6 @@ static inline int notify_on_release(cons
 	return !!test_bit(CS_NOTIFY_ON_RELEASE, &cs->flags);
 }
 
-static struct cpuset top_cpuset = {
-	.flags = ((1 << CS_CPU_EXCLUSIVE) | (1 << CS_MEM_EXCLUSIVE)),
-	.cpus_allowed = CPU_MASK_ALL,
-	.mems_allowed = NODE_MASK_ALL,
-	.count = ATOMIC_INIT(0),
-	.sibling = LIST_HEAD_INIT(top_cpuset.sibling),
-	.children = LIST_HEAD_INIT(top_cpuset.children),
-	.parent = NULL,
-	.dentry = NULL,
-};
-
 static struct vfsmount *cpuset_mount;
 static struct super_block *cpuset_sb = NULL;
 

^ permalink raw reply	[flat|nested] 233+ messages in thread

* Re: [Lse-tech] [PATCH] cpusets - big numa cpu and memory placement
  2004-08-11  9:40                 ` Erich Focht
@ 2004-08-11 14:49                   ` Martin J. Bligh
  2004-08-11 17:50                     ` Paul Jackson
  2004-08-11 15:12                   ` Shailabh Nagar
  1 sibling, 1 reply; 233+ messages in thread
From: Martin J. Bligh @ 2004-08-11 14:49 UTC (permalink / raw)
  To: Erich Focht
  Cc: Paul Jackson, lse-tech, akpm, hch, steiner, jbarnes,
	sylvain.jeaugey, djh, linux-kernel, colpatch, Simon.Derr, ak,
	sivanich

> My turn to disagree ;-) CKRM's CPU and memory controllers are not
> NUMA-specific, they are useful on non-NUMA machines as well. Their

Neither is cpusets - doing resource control is pretty much the same
problem on SMP as NUMA. So I don't really see your point.

> aim is to share cpu cycles and memory pages among processes in a fair
> way. The amount of cycles and memory pages you get is flexible. If
> noone else is on the machine, you get the full machine. If someone
> else comes with another job, your stuff gets pushed away. Cpusets
> guarantee that you get exclusive use of exactly the piece of machine
> which you want. This way your run times will be reproducible and other
> users just won't disturb you. With the current CKRM cpu/mem
> controllers you can say: this set of processes should get 25% of the
> cycles and memory. This is a soft limit (can be violated) and doesn't
> imply where the CPUs are and which memory blocks (cells/nodes) in the
> machine you use. It's of no use for a customer who wants reproducible
> compute times (and I don't mean minimal, or guaranteed. I mean same
> time for each run, within minimal error margins) and no interference
> between users. I'm sure many might question these objectives. I assure
> you that they are taken from real life and are very important.
> 
> As Paul explained in a previous email: the scope of cpusets is
> orthogonal to that of the current CKRM CPU/mem controllers. I see
> benefit in combining the two, within one cpuset one can run several
> processes and protect them from starving.

Right ... the problems you're attacking aren't *exactly* the same - but
they're still close enough, that especially when programming them in
combination, it seems silly to have 2 separate interfaces. 

> The implementation of CKRM cpu/mem and cpusets is as different as
> their scope. I doubt CKRM can be just easily extended to replicate
> cpusets functionality. Just adding cpus_allowed will not be enough. In
> the end CKRM will need to rebuild all code in the cpusets patch.

Perhaps the main thing that ends up shared would be the interface - and
I agree that adding cpus_allowed is wholly insufficient. However, I think
it's foolish to go ahead and press one resource control method interface
into the kernel without carefully considering the possibilities for a
unified interface first - this is important to get right ... interfaces
are too hard to change afterwards (see current discussion re libnuma
for a perfect example).
 
>> I don't think CKRM is anything like as far away from being ready as
>> you seem to be implying - we're talking about a month or two, I
>> think.
> 
> Shailabh's email shows that we're talking about several months. He also
> agreed with pushing cpusets towards the -mm tree.

That's up to Andrew ... but personally I'd rather see the interface issues
thrashed out first. My real concern is that it doesn't go into mainline or
a distro yet, but quite frankly, another couple of weeks isn't going to 
kill anyone, so I don't see the point.

M.

^ permalink raw reply	[flat|nested] 233+ messages in thread

* Re: [ckrm-tech] Re: [Lse-tech] [PATCH] cpusets - big numa cpu and memory placement
  2004-08-11 10:42                       ` Erich Focht
@ 2004-08-11 14:56                         ` Shailabh Nagar
  0 siblings, 0 replies; 233+ messages in thread
From: Shailabh Nagar @ 2004-08-11 14:56 UTC (permalink / raw)
  Cc: lse-tech, hch, steiner, jbarnes, sylvain.jeaugey, djh,
	linux-kernel, colpatch, Simon.Derr, ak, sivanich, ckrm-tech

Erich Focht wrote:
> On Wednesday 11 August 2004 00:38, Shailabh Nagar wrote:
> 
>>> Metrics, transactions, tasks, and resource
>>>    decisions all have to be tracked or managed by Class.
>>>
>>>    These Classes form a fairly shallow hierarchy of usage levels or
>>>    service qualities, as perceived by the end users of the system.
>>>
>>>    I'd guess that the average lifetime of a Class is months or years,
>>>    as they can reflect the relative priority of relations with long
>>>    standing, external customers.
>>>
>>>Cpusets and CKRM have profoundly different purposes, economics and
>>>motivations.
>>
>>I would say the methods differ, not the purpose. Both are trying to 
>>performance-isolate groups of tasks - one uses the spatial dimension of 
>>cpu bindings, the other uses  the temporal dimension of cpu time.
> 
> 
> So the purpose is different, too. With your words: spatial versus
> temporal separation. They are orthogonal. 

By purpose, I meant "performance isolation". The method used is spatial
vs. temporal. But I guess that's just quibbling over words. The
approaches are certainly orthogonal.

Also, cpusets have a purpose beyond isolation and that is 
optimization. One might want to restrict tasks/apps to a NUMA node for 
reducing avg mem latency - this is completely beyond CKRM's scope.



> In physics terms: you need
> both to describe the universe and you cannot transform the one into
> the other. Both make sense, they can be combined to give more benefit
> (aehm, control).

On machines with a fairly large number of cpus, this is true. cpusets 
would partition a machine and CKRM would operate within each partition.

But it's less clear whether both the CKRM and cpuset approaches can be
used simultaneously, and profitably, on a smaller SMP if one is
primarily interested in isolation.

Partitioning the cpus with cpusets does offer harder guarantees,
replicable isolation etc., but also runs the risk of underutilization.
If the user primarily wants to give 20% to one app and 40% to another,
he does have to make that call: go with cpusets, which offer better
guarantees but could waste cpus, or create CKRM classes, which also
offer this functionality but run the risk of weaker control depending
on the load from other applications?

To further complicate that choice, CKRM's design does provide for 
implementation of hard vs. soft limits where hard limits would provide 
the stronger guarantees that a user might want.

The CKRM CPU controller, in particular, is close (~ two weeks to
availability) to providing an implementation of hard limits, which
would offer stronger guarantees along the temporal dimension.



> 
> 
> 
>>The other point of difference is the one you'd brought up earlier - the 
>>restrictions on the hierarchy creation. CKRM has none (effectively), 
>>cpusets has many.
> 
> 
> Don't know how it's exactly implemented, but the restrictions should
> not be at hierarchy creation time (i.e. when creating the class
> (cpusets) subdirectory). They should be imposed when setting/changing
> the attributes. 

True - I was lumping the "create cpuset + set its cpu ownership
values" step into the hierarchy creation. But the point made still
holds good: CKRM has no controller-defined restrictions on changing
attributes; cpusets does.

> Writing illegal values to the virtual attribute files
> must simply fail. And each resource controller knows best what it
> allows for and what not, this shouldn't be a task of the
> infrastructure (CKRM).

Yes, this makes sense.



>>As CKRM's interface stands today, there are sufficient differences 
>>between the interfaces to keep them separate.
>>
>>However, if CKRM moves to a model where
>>- each controller is allowed to define its own virtual files and attributes
>>- each controllers has its own hierarchy (and hence more control over 
>>how it can be formed),
>>then the similarities will be too many to ignore merger possibilities
>>altogether.
>>
>>The kicker is, we've not decided. The splitting of controllers into 
>>their own hierarchy is something we're considering independently (as a 
>>consequence of Linus' suggestion at KS04). But making the interface 
>>completely per-controller is something we can do, without too much 
>>effort, IF there is sufficient reason (we have other reasons for doing 
>>that as well - see recent postings on ckrm-tech).
> 
> 
> Having controller specifics less hidden is good because usage becomes
> more intuitive and you don't have to RTFM (controller specific manuals
> would have to be written, too). One file per attribute is also nicer
> than several attributes hidden in a shares files. Adding an attribute
> means adding a file, it doesn't break the old interface, so this is
> easier to maintain. And, as you mentioned, some files in the current
> CKRM interface just don't make sense for some resources. But a sane
> ruleset provided by CKRM for external controllers should be
> there. For example something like:
>    - Class members are added by writing to the virtual file "target".
>    - Class members are listed by reading the virtual file "target" and
>      the format is ...
>    - Each class attribute should be controlled by one file named
>      appropriately. Etc...
>    - Members of a class can register a callback which will be invoked
>      when following events occur:
>         - the class is destroyed
> 	- ... ?
>    - etc ...

One file per attribute is an excellent idea and the slight additional 
overhead won't matter since attribute changes are rarely in the 
critical path. Will follow up on this on ckrm-tech (which is cc'ed).

We'll still need to keep statistics grouped as far as possible because
the overhead of reading several files vs. one will matter.


> 
> 
>>Interest/recommendations from the community that cpusets  be part of 
>>CKRM's hierarchy would certainly be a factor in that decision.
> 
> 
> I'd prefer a single entry point for resource management with
> consistent (not necessarily same) and easy to use user interfaces for
> all resources.
> 
> Regards,
> Erich
> 


P.S. I've pruned some of the names on the cc: list who are obviously 
subscribed to one or the other lists (mailman on sf keeps complaining 
if the cc list is too long). I can be dropped from the cc: too if this 
thread continues...

^ permalink raw reply	[flat|nested] 233+ messages in thread

* Re: [Lse-tech] [PATCH] cpusets - big numa cpu and memory placement
  2004-08-11  9:40                 ` Erich Focht
  2004-08-11 14:49                   ` Martin J. Bligh
@ 2004-08-11 15:12                   ` Shailabh Nagar
  1 sibling, 0 replies; 233+ messages in thread
From: Shailabh Nagar @ 2004-08-11 15:12 UTC (permalink / raw)
  To: lse-tech
  Cc: hch, steiner, jbarnes, sylvain.jeaugey, djh, linux-kernel,
	colpatch, Simon.Derr, ak, sivanich, ckrm-tech

Erich Focht wrote:
> On Sunday 08 August 2004 16:50, Martin J. Bligh wrote:
> 
>>I don't think CKRM is anything like as far away from being ready as
>>you seem to be implying - we're talking about a month or two, I
>>think.
> 
> 
> Shailabh's email shows that we're talking about several months. He also
> agreed with pushing cpusets towards the -mm tree.

CKRM with its current interfaces is ready for a spin in the -mm tree 
today. But if we go with the split controllers idea, we'll be delayed 
by two or three months (the changes to the codebase are not very large 
since we internally do have quite a bit of separation between the 
controllers...it's mostly an interface issue).

I would estimate that the following will be available in two-three months:
   - CKRM framework with per-controller modifications suggested
   - 1 version each of the controllers
   - classification engines


Please note that acceptance of CKRM the framework is not tied to
acceptance of any particular CKRM controller or the classification
engines (that was one of our objectives!). So it's quite possible that
only the framework and the least intrusive controllers will be found
acceptable for -mm inclusion initially, and we will be asked to keep
iterating on the others based on suggestions.

-- Shailabh

^ permalink raw reply	[flat|nested] 233+ messages in thread

* Re: [Lse-tech] [PATCH] new bitmap list format (for cpusets)
  2004-08-11 13:11 ` Dinakar Guniguntala
@ 2004-08-11 16:17   ` Paul Jackson
  2004-08-11 18:05     ` Dinakar Guniguntala
  0 siblings, 1 reply; 233+ messages in thread
From: Paul Jackson @ 2004-08-11 16:17 UTC (permalink / raw)
  To: dino; +Cc: linux-kernel, lse-tech

Dinakar wrote:
> Considering that cpu_possible_map does not get fully initialized
> until smp_prepare_cpus gets called by init(), I thought it right
> to move cpuset_init() to after smp initialization.

Thank-you.  I suspect you're right.

Could you also provide some motivation for the other changes in your
patch, moving struct cpuset, enum cpuset_flagbits_t, and struct cpuset
top_cpuset definitions from kernel/cpuset.c to include/linux/cpuset.h?
I had found it rather pleasing that these structures did not need to
be known outside of kernel/cpuset.c.

Another approach that might work, in order to ensure that the top_cpuset
has its cpus_allowed set to the proper value of cpu_possible_map, would
be to add a routine, say cpuset_init_smp(), called from init/main.c
init() just after smp_init() returns, to update the cpus_allowed in
top_cpuset from the fully initialized value of cpu_possible_map.  This
seems to resemble the call sched_init_smp(), also made in init/main.c
init() just after smp_init() returns, to finish initializing some sched
stuff.

If you take your approach, should we remove the __init qualifier from
kernel/cpuset.c cpuset_init()?

-- 
                          I won't rest till it's the best ...
                          Programmer, Linux Scalability
                          Paul Jackson <pj@sgi.com> 1.650.933.1373

^ permalink raw reply	[flat|nested] 233+ messages in thread

* Re: [Lse-tech] [PATCH] cpusets - big numa cpu and memory placement
  2004-08-11 14:49                   ` Martin J. Bligh
@ 2004-08-11 17:50                     ` Paul Jackson
  2004-08-11 21:12                       ` Shailabh Nagar
  0 siblings, 1 reply; 233+ messages in thread
From: Paul Jackson @ 2004-08-11 17:50 UTC (permalink / raw)
  To: Martin J. Bligh
  Cc: efocht, lse-tech, akpm, hch, steiner, jbarnes, sylvain.jeaugey,
	djh, linux-kernel, colpatch, Simon.Derr, ak, sivanich

Martin wrote:
> but they're still close enough, that especially when programming
> them in combination, it seems silly to have 2 separate interfaces. 

The specific attributes needed of CKRM classes are not the same as those
needed for cpusets.

The semantics of the two are distinct -- each has different rules that
have little relevance to the other.

The typical uses of the two have little overlap.  More often than not,
the applications that customers want to run in isolation in cpusets are
not the same as those which customers want to run while sharing compute
resources with a managed balance.

Any merger of two separate mechanisms has its costs - the result risks
being less focused, larger.  These costs must be balanced with what's
called (in Corporate mergers) "synergy".

Programming these two "in combination" simply means using CKRM to manage
resources for some tasks running in a cpuset.  Neither capability nor
interface gains in such a use by attempting to merge it with the other.

I can imagine running multiple cpusets, say SetA, SetB, and SetC, using
Nodes 0..31, 32..63, and 64..127, respectively.  Within each, running
the same suite of applications, say a DBMS, web server and credit card
payment handler.  Serving within each three classes of customers: Gold
(corporate), Silver (logged in individuals) and Bronze (anonymous web
surfers).  This is naturally a 3x3 two-dimensional space, not a flat
space that's nine units wide.  The two dimensions are orthogonal here.

No, it is not silly to have 2 separate interfaces.  What's silly is to
presume that everything that seems similar at the 10,000 foot level
should be combined.

The details matter.  Show me the synergy.

Do we have trains that float and ships that roll?

Is there much of a market for a hammer-saw?

It is fitting and proper for kernels to provide independent mechanisms,
and let user space connect them as it will.  Look at the actual hooks
in the kernel code to implement these two facilities.  One hooks the
scheduler and allocator to prohibit running on CPUs outside the cpuset,
and to prohibit allocating memory on Nodes outside the cpuset.  The other
hooks these places, and others, to bias their decisions in order to obtain
the requested balance of resource usage.

These are two distinct sets of hooks, perhaps on the same page of code,
but distinct in placement, logic, means and intention.

Ideally, the kernel provides a single, separate, orthogonal interface to
each mechanism it supports.

If this were a case of proposing two interfaces to the same mechanism,
or what should be the same mechanism, then you'd be 100% right.  We
should merge them.

Perhaps the proper place to resolve this discussion is a detailed
examination of the kernel hooks required for CKRM and cpusets, the
hooks in the scheduler, allocator and such.

You have both patches available to you.  Examine them.  Especially
examine the hooks in the scheduler and allocator code.  These are not
the same hooks.  I defy you to make them the same and propose such with
a straight face.  If you do so successfully, I will sit up and take
notice.

-- 
                          I won't rest till it's the best ...
                          Programmer, Linux Scalability
                          Paul Jackson <pj@sgi.com> 1.650.933.1373

^ permalink raw reply	[flat|nested] 233+ messages in thread

* Re: [Lse-tech] [PATCH] new bitmap list format (for cpusets)
  2004-08-11 16:17   ` Paul Jackson
@ 2004-08-11 18:05     ` Dinakar Guniguntala
  2004-08-11 20:40       ` Paul Jackson
  0 siblings, 1 reply; 233+ messages in thread
From: Dinakar Guniguntala @ 2004-08-11 18:05 UTC (permalink / raw)
  To: Paul Jackson; +Cc: linux-kernel, lse-tech

On Wed, Aug 11, 2004 at 09:17:32AM -0700, Paul Jackson wrote:
> Dinakar wrote:
> > Considering that cpu_possible_map does not get fully initialized
> > until smp_prepare_cpus gets called by init(), I thought it right
> > to move cpuset_init() to after smp initialization.
> 
> Thank-you.  I suspect you're right.
> 
> Could you also provide some motivation for the other changes in your
> patch, moving struct cpuset, enum cpuset_flagbits_t, and struct cpuset
> top_cpuset definitions from kernel/cpuset.c to include/linux/cpuset.h?
> I had found it rather pleasing that these structures did not need to
> be known outside of kernel/cpuset.c.

Since init() is executed by a kernel_thread that does a do_fork(),
it already expects the top_cpuset to be initialized. Since this can
be achieved by initializing the task structure (INIT_TASK), I had
to move the structure definitions to the header file.

A related question: I was wondering why the nodemask_t needed to be part
of the task_struct, since the cpuset would anyway have a reference to it.
Sorry if this is something very obvious; I didn't really look to see
why it was there.

> 
> Another approach that might work, in order to ensure that the top_cpuset
> has its cpus_allowed set to the proper value of cpu_possible_map, would
> be to add a routine, say cpuset_init_smp(), called from init/main.c
> init() just after smp_init() returns, to update the cpus_allowed in
> top_cpuset from the fully initialized value of cpu_possible_map.  This
> seems to resemble the call sched_init_smp(), also made in init/main.c
> init() just after smp_init() returns, to finish initializing some sched
> stuff.

Yes that would be fine too.

> 
> If you take your approach, should we remove the __init qualifier from
> kernel/cpuset.c cpuset_init()?
> 
The qualifier would still be valid, I think, no?

Regards,

Dinakar

^ permalink raw reply	[flat|nested] 233+ messages in thread

* Re: [Lse-tech] [PATCH] new bitmap list format (for cpusets)
  2004-08-11 18:05     ` Dinakar Guniguntala
@ 2004-08-11 20:40       ` Paul Jackson
  2004-08-12  9:48         ` Dinakar Guniguntala
  0 siblings, 1 reply; 233+ messages in thread
From: Paul Jackson @ 2004-08-11 20:40 UTC (permalink / raw)
  To: dino; +Cc: linux-kernel, lse-tech

Paul wrote:
> Another approach that might work, in order to ensure that the top_cpuset
> has its cpus_allowed set to the proper value of cpu_possible_map, would
> be to add a routine, say cpuset_init_smp(),

Dinakar replied:
> Yes that would be fine too.

Since I've gotten this far without having the definition of 'struct cpuset'
exposed in a header file, I'd like to see if I can continue that.  I'll
give this other approach a try - though it will be a day or so before I
can get to it - prior commitments.  Unless of course, someone sends me such
a patch first ;).

Paul wrote:
> If you take your approach, should we remove the __init qualifier from
> kernel/cpuset.c cpuset_init()?

Dinakar replied:
> The qualifier would still be valid I think, no ?

What led me to ask that question was the following bit of code at the
bottom of start_kernel(), in init/main.c:

=========================================================
        /* rootfs populating might need page-writeback */
        page_writeback_init();
#ifdef CONFIG_PROC_FS
        proc_root_init();
#endif
        cpuset_init();

        check_bugs();

        /* Do the rest non-__init'ed, we're now alive */
        rest_init();
}
=========================================================

Since this is where your patch was moving the cpuset_init() call _away_
from, in order to put the call later, I took the comment about
"non-__init'ed" to mean that your patch was moving the cpuset_init()
call past the place where an __init qualifier was valid.

But I haven't studied the code to know this for sure, and if my other
scheme pans out to address the problem you found (that cpu_possible_map,
upon which cpuset initialization depends, does not get fully initialized
until smp_prepare_cpus gets called by init()), then this detail won't
matter anyway.

However an equivalent detail would matter.  Can I mark cpuset_init_smp()
as "__init" ?  Hmmm ... likely I can, since two routines called at the
same time, sched_init_smp() and smp_init(), are marked __init.  This
suggests that my interpretation of that comment was wrong, and that
you're entirely right -- calls made in either place can be marked
__init.  Is that comment above misleading?


> A related Q, I was wondering why the nodemask_t needed to be part 
> of the task_struct, since cpuset would anyway have a reference to it.

Good question.

The nodemasks current->mems_allowed and current->cpuset->mems_allowed
can be out of sync, by design.  Changes made via the cpuset file system
affect the second of these.  But not until the affected task goes
through the numa code in an mbind or set_mempolicy system call does that
task pick up the new value of mems_allowed and put it in its task struct
as current->mems_allowed to control memory placement.

This seems necessary, because there is no way for one task to affect
another's mm, vma and zonelist structures.  So all of these structures
must be managed directly by a task on itself.  So a task's mems_allowed,
which must be consistent with its zonelists, must also be managed
directly by itself.
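
A minimal sketch of that flow (the helper name is invented here, and
this is not code from the cpuset patch): a fragment that would live in
kernel/cpuset.c next to the struct cpuset definition, called by the
task on itself, for example near the top of the mbind/set_mempolicy
paths, so that only the task ever updates its own mems_allowed.

/* Hypothetical helper, for illustration only. */
static inline void cpuset_update_current_mems_allowed(void)
{
	struct cpuset *cs = current->cpuset;

	/* pick up any change made via the cpuset file system */
	if (!nodes_equal(current->mems_allowed, cs->mems_allowed))
		current->mems_allowed = cs->mems_allowed;
}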

If one task directly manipulated another's current->mems_allowed, we
could end up in the situation that a task had done, say, an MPOL_BIND
setting up some zonelists for one set of nodes, then had its
current->mems_allowed changed to some non-overlapping set, leaving the
task completely unable to allocate memory on its own behalf, which would
likely send us into portions of the allocator code we should only arrive
in if very short on memory and desperate, which risks causing further
grief to the rest of the system ... not good (tm).

Hmmm ... as I write this, I am suspecting that there is a bit of code
that is missing in this solution.  Any given task may have multiple
memory policies, a default one (set_mempolicy) and zero or more ones
specific to some range of memory (mbind).  We must deal with the case
that any change in a task's current->mems_allowed could break any of the
memory policies affecting it (leave the policy non-overlapping with the
mems_allowed).  My crystal ball sees some more nodemasks, perhaps one
per numa struct mempolicy, and a little bit more code, in my future ;).
I'll have to think about this.  Suggestions welcome.

As I said -- good question.

-- 
                          I won't rest till it's the best ...
                          Programmer, Linux Scalability
                          Paul Jackson <pj@sgi.com> 1.650.933.1373

^ permalink raw reply	[flat|nested] 233+ messages in thread

* Re: [Lse-tech] [PATCH] cpusets - big numa cpu and memory placement
  2004-08-11 17:50                     ` Paul Jackson
@ 2004-08-11 21:12                       ` Shailabh Nagar
  2004-08-12  7:15                         ` Paul Jackson
  0 siblings, 1 reply; 233+ messages in thread
From: Shailabh Nagar @ 2004-08-11 21:12 UTC (permalink / raw)
  To: Paul Jackson
  Cc: Martin J. Bligh, efocht, lse-tech, akpm, hch, steiner, jbarnes,
	sylvain.jeaugey, djh, linux-kernel, colpatch, Simon.Derr, ak,
	sivanich

Paul Jackson wrote:

> Martin wrote:
> 
>>but they're still close enough, that especially when programming
>>them in combination, it seems silly to have 2 separate interfaces. 
> 
> 
> The specific attributes needed of CKRM classes are not the same as those
> needed for cpusets.
> 
> The semantics of the two are distinct -- each has different rules that
> have little relevance to the other.


> 
> The typical uses of the two have little overlap.  More often than not,
> the applications that customers want to run in isolation in cpusets are
> not the same as those which customers want to run while sharing compute
> resources with a managed balance.

If you want to emphasize the differences, this might help: cpusets
allows apps to be confined to a set for gaining benefits like cache
affinity and reduced memory latency. CKRM doesn't and cannot, and in
this use case the two are orthogonal.

But when apps are being confined to a set of cpus *only* for purposes of
getting a certain fraction of the total compute power, cpusets are not
orthogonal in intent, if not implementation, to a CKRM CPU class
implementing hard limits. More capable of achieving those limits, yes,
but orthogonal, no.

Note that this does not suggest the joint use of the two mechanisms -
merely that there exists a usage scenario where both are relevant, and
for whose users a common interface might be handy.


> No, it is not silly to have 2 separate interfaces.  What's silly is to
> presume that everything that seems similar at the 10,000 foot level
> should be combined.
> 
> The details matter.  Show me the synergy.

What's your opinion on the commonalities between the two interfaces 
pointed out in my previous mail ?

Also, if CKRM were to move to the "each controller exports its own 
interface" model, how would this affect the discussion ?


> It is fitting and proper for kernels to provide independent mechanisms,
> and let user space connect them as it will.  


> Look at the actual hooks
> in the kernel code to implement these two facilities....  
> Perhaps the proper place to resolve this discussion in is a detailed
> examination of the kernel hooks required for CKRM and cpusets, the
> hooks in the scheduler, allocator and such.

No one is questioning that the internals differ. There is very little in 
common between a CKRM I/O controller and its CPU controller too. But 
that doesn't prevent them from sharing the same interface.

I repeat - the question isn't one of the internals - it's about the
interface. Do you think there's *any* merit to cpusets sharing the rcfs
interface *if* the latter were to make the changes mentioned in my
earlier mail?

If not (and others agree), let's end this discussion and move on - both
projects have enough to do. If there is some commonality, let's see what
we can do to enhance the eventual user's experience.

-- Shailabh

> You have both patches available to you.  Examine them.  Especially
> examine the hooks in the scheduler and allocator code.  These are not
> the same hooks.  I defy you to make them the same and propose such with
> a straight face.  If you do so successfully, I will sit up and take
> notice.




^ permalink raw reply	[flat|nested] 233+ messages in thread

* Re: [Lse-tech] [PATCH] cpusets - big numa cpu and memory placement
  2004-08-11 21:12                       ` Shailabh Nagar
@ 2004-08-12  7:15                         ` Paul Jackson
  2004-08-12 12:58                           ` Jack Steiner
  2004-08-12 14:50                           ` Martin J. Bligh
  0 siblings, 2 replies; 233+ messages in thread
From: Paul Jackson @ 2004-08-12  7:15 UTC (permalink / raw)
  To: Shailabh Nagar
  Cc: mbligh, efocht, lse-tech, akpm, hch, steiner, jbarnes,
	sylvain.jeaugey, djh, linux-kernel, colpatch, Simon.Derr, ak,
	sivanich

Shailabh wrote:
> But when apps are being confined to a set of cpus *only* for purposes of 
> getting a certain fraction of the total compute power, cpusets are not 
> orthogonal in intent, not implementation, from a CKRM CPU class 
> implementing hard limits.

So if someone wanted to constrain a group of tasks to using 50% of all
the available CPU ticks on a 32 CPU system, they could either use a CKRM
CPU class with a hard limit of 50%, or a cpuset that contained 16 CPUs.

Yes, for that purpose, except for NUMA placement (the cache affinity and
memory latency you mention), these two approaches are similar in effect.

So, yes, my absolute insistence that CKRM and cpusets are orthogonal is
overstated.  Well, I could quibble that orthogonal doesn't imply disjoint.
Whatever.


> What's your opinion on the commonalities between the two interfaces 
> pointed out in my previous mail ?

My apologies for not yet replying to your mail of a couple of days ago.
It was valuable to me, and I've taken a bit of time to digest it.
Meanwhile, newer stuff keeps overrunning my reply.  Soon, hopefully.


> Also, if CKRM were to move to the "each controller exports its own 
> interface" model, how would this affect the discussion ?

I cannot speak for the discussion, only for myself.  I am clearly
sensitive to the downsides of trying to integrate these interfaces.

Hopefully I can find the time tonight to study your earlier replies more
closely, and better understand the potential benefits of such an
integration.  So far, I don't see them.  I will do my best to keep my
eyes and mind open.  Thanks especially to your posts, I have learned
quite a bit about CKRM this week.

I will confess to a strong bias toward a minimum of abstraction at the
kernel-user boundary, and towards providing a one-to-one map between the
mechanisms and the interfaces a kernel provides.  Let the user level
assemble the pieces as it will.  If combining interfaces (CKRM and
cpuset) caused any unwarranted change in or obfuscation of the semantics
provided by either, that would be unfortunate, in my view.


> Do you think there's *any* merit to cpusets sharing the rcfs 
> interface *if* the latter were to make the changes mentioned in earlier 
> mail ?

Not yet, but I need to go back over your replies, and others, with this
question more clearly in focus.


> If not (and others agree), lets end this discussion and move on - both 
> projects have enough to do ...

At least Martin does not yet agree, if I understand his posts.  But,
yes, either way, we are close to where it is best to table this
discussion, for the moment at least.

Thank-you for your constructive and enlightening comments so far.

-- 
                          I won't rest till it's the best ...
                          Programmer, Linux Scalability
                          Paul Jackson <pj@sgi.com> 1.650.933.1373

^ permalink raw reply	[flat|nested] 233+ messages in thread

* Re: [Lse-tech] [PATCH] new bitmap list format (for cpusets)
  2004-08-11 20:40       ` Paul Jackson
@ 2004-08-12  9:48         ` Dinakar Guniguntala
  2004-08-12 10:11           ` Paul Jackson
  0 siblings, 1 reply; 233+ messages in thread
From: Dinakar Guniguntala @ 2004-08-12  9:48 UTC (permalink / raw)
  To: Paul Jackson; +Cc: linux-kernel, lse-tech

[-- Attachment #1: Type: text/plain, Size: 936 bytes --]

On Wed, Aug 11, 2004 at 01:40:18PM -0700, Paul Jackson wrote:
> 
> Since I've gotten this far without having the definition of 'struct cpuset'
> exposed in a header file, I'd like to see if I can continue that.  I'll
> give this other approach a try - though it will be a day or so before I
> can get to it - prior commitments.  Unless of course, someone sends me such
> a patch first ;).

Ok revised patch attached

> However an equivalent detail would matter.  Can I mark cpuset_init_smp()
> as "__init" ?  Hmmm ... likely I can, since two routines called at the
> same time, sched_init_smp() and smp_init(), are marked __init.  This
> suggests that my interpretation of that comment was wrong, and that
> you're entirely right -- calls made in either place can be marked
> __init.  Is that comment above misleading?

That, I believe, applies only to the rest_init function, which does not
have the __init qualifier.

Regards,

Dinakar



[-- Attachment #2: cpuset-init-2.patch --]
[-- Type: text/plain, Size: 1891 bytes --]

diff -Naurp linux-2.6.8-rc2-mm2-cs3/include/linux/cpuset.h linux-2.6.8-rc2-mm2-cs3.new/include/linux/cpuset.h
--- linux-2.6.8-rc2-mm2-cs3/include/linux/cpuset.h	2004-08-05 17:22:31.000000000 +0530
+++ linux-2.6.8-rc2-mm2-cs3.new/include/linux/cpuset.h	2004-08-12 18:58:51.000000000 +0530
@@ -15,6 +15,7 @@
 #ifdef CONFIG_CPUSETS
 
 extern int cpuset_init(void);
+extern void cpuset_init_smp(void);
 extern void cpuset_fork(struct task_struct *p);
 extern void cpuset_exit(struct task_struct *p);
 extern const cpumask_t cpuset_cpus_allowed(const struct task_struct *p);
diff -Naurp linux-2.6.8-rc2-mm2-cs3/init/main.c linux-2.6.8-rc2-mm2-cs3.new/init/main.c
--- linux-2.6.8-rc2-mm2-cs3/init/main.c	2004-08-05 17:22:31.000000000 +0530
+++ linux-2.6.8-rc2-mm2-cs3.new/init/main.c	2004-08-12 18:06:54.000000000 +0530
@@ -708,6 +708,8 @@ static int init(void * unused)
 	smp_init();
 	sched_init_smp();
 
+	cpuset_init_smp();
+
 	/*
 	 * Do this before initcalls, because some drivers want to access
 	 * firmware files.
diff -Naurp linux-2.6.8-rc2-mm2-cs3/kernel/cpuset.c linux-2.6.8-rc2-mm2-cs3.new/kernel/cpuset.c
--- linux-2.6.8-rc2-mm2-cs3/kernel/cpuset.c	2004-08-11 22:02:47.000000000 +0530
+++ linux-2.6.8-rc2-mm2-cs3.new/kernel/cpuset.c	2004-08-12 18:55:34.000000000 +0530
@@ -1270,7 +1270,6 @@ int __init cpuset_init(void)
 	struct dentry *root;
 	int err;
 
-	top_cpuset.cpus_allowed = cpu_possible_map;
 	top_cpuset.mems_allowed = node_possible_map;
 
 	init_task.cpuset = &top_cpuset;
@@ -1296,6 +1295,17 @@ out:
 }
 
 /**
+ * cpuset_init_smp - initialize cpus_allowed
+ *
+ * Description: Initialize cpus_allowed after cpu_possible_map is initialized 
+ **/
+
+void __init cpuset_init_smp(void)
+{
+	top_cpuset.cpus_allowed = cpu_possible_map;
+}
+
+/**
  * cpuset_fork - attach newly forked task to its parents cpuset.
  * @p: pointer to task_struct of forking parent process.
  *

^ permalink raw reply	[flat|nested] 233+ messages in thread

* Re: [Lse-tech] [PATCH] new bitmap list format (for cpusets)
  2004-08-12  9:48         ` Dinakar Guniguntala
@ 2004-08-12 10:11           ` Paul Jackson
  2004-08-12 12:34             ` Dinakar Guniguntala
  0 siblings, 1 reply; 233+ messages in thread
From: Paul Jackson @ 2004-08-12 10:11 UTC (permalink / raw)
  To: dino; +Cc: linux-kernel, lse-tech

> Ok revised patch attached

Sweet - thanks.  I have one other small patch against the
cpuset patch I posted on lkml about a week ago now.

Next week, I expect to repost, against a current *-mm,
and I will include your revised patch, after I build and
test it along with my stuff.  Thanks.

The rest of this week is taken up with unrelated duties
for me.


> applies only to the rest_init function which does not have
> the __init qualifier

Ok.

If you have any thoughts on the issue I raised at the end of
my previous message in this subthread, concerning numa policies
that get out of sync with their task's cpuset, I'd be interested
to hear them.

-- 
                          I won't rest till it's the best ...
                          Programmer, Linux Scalability
                          Paul Jackson <pj@sgi.com> 1.650.933.1373

^ permalink raw reply	[flat|nested] 233+ messages in thread

* Re: [Lse-tech] [PATCH] new bitmap list format (for cpusets)
  2004-08-12 10:11           ` Paul Jackson
@ 2004-08-12 12:34             ` Dinakar Guniguntala
  0 siblings, 0 replies; 233+ messages in thread
From: Dinakar Guniguntala @ 2004-08-12 12:34 UTC (permalink / raw)
  To: Paul Jackson; +Cc: linux-kernel, lse-tech

On Thu, Aug 12, 2004 at 03:11:13AM -0700, Paul Jackson wrote:
> Next week, I expect to repost, against a current *-mm,
> and I will include your revised patch, after I build and
> test it along with my stuff.  Thanks.

Thanks

> If you have any thoughts on the issue I raised at the end of
> my previous message in this subthread, concerning numa policies
> that get out of sync with their tasks cpuset, I'd be interested
> to hear them.

Sure, I still need to read up on cpuset/numa though.

Regards,

Dinakar

^ permalink raw reply	[flat|nested] 233+ messages in thread

* Re: [Lse-tech] [PATCH] cpusets - big numa cpu and memory placement
  2004-08-12  7:15                         ` Paul Jackson
@ 2004-08-12 12:58                           ` Jack Steiner
  2004-08-12 14:50                           ` Martin J. Bligh
  1 sibling, 0 replies; 233+ messages in thread
From: Jack Steiner @ 2004-08-12 12:58 UTC (permalink / raw)
  To: Paul Jackson
  Cc: Shailabh Nagar, mbligh, efocht, lse-tech, akpm, hch, jbarnes,
	sylvain.jeaugey, djh, linux-kernel, colpatch, Simon.Derr, ak,
	sivanich

On Thu, Aug 12, 2004 at 12:15:22AM -0700, Paul Jackson wrote:
> Shailabh wrote:
> > But when apps are being confined to a set of cpus *only* for purposes of 
> > getting a certain fraction of the total compute power, cpusets are not 
> > orthogonal in intent, not implementation, from a CKRM CPU class 
> > implementing hard limits.
> 
> So if someone wanted to constrain a group of tasks to using 50% of all
> the available CPU ticks on a 32 CPU system, they could either use a CKRM
> CPU class with a hard limit of 50%, or a cpuset that contained 16 CPUs.
> 
> Yes, for that purpose, except for NUMA placement (the cache affinity and
> memory latency you mention), these two approaches are similar in affect.

One other important attribute of a cpuset is that, used properly, cpusets
will guarantee exclusive use of a set of cpus for an application.

MPI jobs frequently consist of a number of threads that communicate
via message passing interfaces. All threads need to be executing
at the same time. If a single thread loses a cpu, all threads stop making
forward progress and spin at a barrier.

Cpusets can eliminate the need for a gang scheduler.
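
To illustrate the effect, here is a toy spin barrier (not taken from
any MPI implementation): if one of the job's threads is descheduled
before it arrives, every other thread burns its cpu in the while loop
until the straggler runs again.

#include <pthread.h>

static volatile int arrived;	/* threads that have reached the barrier */
static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;

/* One-shot spin barrier for nthreads threads, for illustration only. */
void spin_barrier(int nthreads)
{
	pthread_mutex_lock(&lock);
	arrived++;
	pthread_mutex_unlock(&lock);

	/* busy-wait: cycles spent here are pure waste whenever one
	 * thread of the job has lost its cpu to an unrelated task */
	while (arrived < nthreads)
		;
}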


> 
> So, yes, my absolute insistence that CKRM and cpusets are orthogonal is
> overstated.  Well, I could quibble that orthogonal doesn't imply disjoint.
> Whatever.
> 
> 
> > What's your opinion on the commonalities between the two interfaces 
> > pointed out in my previous mail ?
> 
> My apologies for not yet replying to your mail of a couple of days ago.
> It was valuable to me, and I've taken a bit of time to digest it.
> Meanwhile, newer stuff keeps overruning my reply.  Soon, hopefully.
> 
> 
> > Also, if CKRM were to move to the "each controller exports its own 
> > interface" model, how would this affect the discussion ?
> 
> I cannot speak for the discussion, only for myself.  I am clearly
> sensitive to the downsides of trying to integrate these interfaces.
> 
> Hopefully I can find the time tonight to study your earlier replies more
> closely, and better understand the potential benefits of such an
> integration.  So far, I don't see them.  I will do my best to keep my
> eyes and mind open.  Thanks especially to your posts, I have learned
> quite a bit about CKRM this week.
> 
> I will confess to a strong bias toward a minimum of abstraction at the
> kernel-user boundary, and towards providing a one-to-one map between the
> mechanisms and the interfaces a kernel provides.  Let the user level
> assemble the pieces as it will.  If combining interfaces (CKRM and
> cpuset) caused any unwarranted change in or obfuscation of the semantics
> provided by either, that would be unfortunate, in my view.
> 
> 
> > Do you think there's *any* merit to cpusets sharing the rcfs 
> > interface *if* the latter were to make the changes mentioned in earlier 
> > mail ?
> 
> Not yet, but I need to go back over your replies, and others, with this
> question more clearly in focus.
> 
> 
> > If not (and others agree), lets end this discussion and move on - both 
> > projects have enough to do ...
> 
> At least Martin does not yet agree, if I understand his posts.  But,
> yes, either way, we are close to where it is best to table this
> discussion, for the moment at least.
> 
> Thank-you for your constructive and enlightening comments so far.
> 
> -- 
>                           I won't rest till it's the best ...
>                           Programmer, Linux Scalability
>                           Paul Jackson <pj@sgi.com> 1.650.933.1373

-- 
Thanks

Jack Steiner (steiner@sgi.com)          651-683-5302
Principal Engineer                      SGI - Silicon Graphics, Inc.



^ permalink raw reply	[flat|nested] 233+ messages in thread

* Re: [Lse-tech] [PATCH] cpusets - big numa cpu and memory placement
  2004-08-12  7:15                         ` Paul Jackson
  2004-08-12 12:58                           ` Jack Steiner
@ 2004-08-12 14:50                           ` Martin J. Bligh
  1 sibling, 0 replies; 233+ messages in thread
From: Martin J. Bligh @ 2004-08-12 14:50 UTC (permalink / raw)
  To: Paul Jackson, Shailabh Nagar
  Cc: efocht, lse-tech, akpm, hch, steiner, jbarnes, sylvain.jeaugey,
	djh, linux-kernel, colpatch, Simon.Derr, ak, sivanich

>> If not (and others agree), lets end this discussion and move on - both 
>> projects have enough to do ...
> 
> At least Martin does not yet agree, if I understand his posts.  But,
> yes, either way, we are close to where it is best to table this
> discussion, for the moment at least.

I'm not in violent disagreement, no ... I just want people to carefully
thrash through the rationale behind what we're doing before we do it ;-)
If the cpusets and CKRM people agree, I'm happy ... just don't want to
end up with multiple interfaces if it's not needed. If it is, then so
be it.

M.


^ permalink raw reply	[flat|nested] 233+ messages in thread

* Re: [ckrm-tech] Re: [Lse-tech] [PATCH] cpusets - big numa cpu and memory placement
  2004-08-10 22:38                     ` Shailabh Nagar
  2004-08-11 10:42                       ` Erich Focht
@ 2004-08-14  8:51                       ` Paul Jackson
  1 sibling, 0 replies; 233+ messages in thread
From: Paul Jackson @ 2004-08-14  8:51 UTC (permalink / raw)
  To: Shailabh Nagar
  Cc: frankeh, efocht, mbligh, lse-tech, akpm, hch, steiner, jbarnes,
	sylvain.jeaugey, djh, linux-kernel, colpatch, Simon.Derr, ak,
	sivanich, ckrm-tech

Shailabh writes:
> But do let us know if there is interest in merging 
> (after this round of clarificatory emails is over) as it will affect 
> which way we go.

I remain convinced that such a merging would be wrong-headed.

When I examine the experience that other operating systems such as
Solaris, Unicos and Irix have had with resource share groups and
cpusets, they have considered these to be two distinct facilities,
correctly so in my view.

So I recommend that you not try to bend CKRM to include cpusets.

Unless others have more to say, I too am content to close this thread
for now.  I've email-bombed enough mail boxes for one week ;).

I'll have an updated cpuset patch, hopefully next week, hopefully
with a shorter Cc list this time, with a couple of modest fixes.

Thank-you.

-- 
                          I won't rest till it's the best ...
                          Programmer, Linux Scalability
                          Paul Jackson <pj@sgi.com> 1.650.933.1373

^ permalink raw reply	[flat|nested] 233+ messages in thread

* Re: [Lse-tech] [PATCH] cpusets - big numa cpu and memory placement
  2004-08-08 19:58             ` Shailabh Nagar
@ 2004-10-01 23:41               ` Andrew Morton
  2004-10-02  6:06                 ` Paul Jackson
  2004-10-02 15:46                 ` [ckrm-tech] " Marc E. Fiuczynski
  0 siblings, 2 replies; 233+ messages in thread
From: Andrew Morton @ 2004-10-01 23:41 UTC (permalink / raw)
  To: Shailabh Nagar, ckrm-tech
  Cc: pj, efocht, mbligh, lse-tech, hch, steiner, jbarnes,
	sylvain.jeaugey, djh, linux-kernel, colpatch, Simon.Derr, ak,
	sivanich


Paul, I'm having second thoughts regarding a cpusets merge.  Having gone
back and re-read the cpusets-vs-CKRM thread from mid-August, I am quite
unconvinced that we should proceed with two orthogonal resource
management/partitioning schemes.

And CKRM is much more general than the cpu/memsets code, and hence it
should be possible to realize your end-users' requirements using an
appropriately modified CKRM, and a suitable controller.

I'd view the difficulty of implementing this as a test of the wisdom of
CKRM's design, actually.

The clearest statement of the end-user cpu and memory partitioning
requirement is this, from Paul:

> Cpusets - Static Isolation:
> 
>     The essential purpose of cpusets is to support isolating large,
>     long-running, multinode compute bound HPC (high performance
>     computing) applications or relatively independent service jobs,
>     on dedicated sets of processor and memory nodes.
>     
>     The (unobtainable) ideal of cpusets is to provide perfect
>     isolation, for such jobs as:
> 
>      1) Massive compute jobs that might run hours or days, on dozens
> 	or hundreds of processors, consuming gigabytes or terabytes
> 	of main memory.  These jobs are often highly parallel, and
> 	carefully sized and placed to obtain maximum performance
> 	on NUMA hardware, where memory placement and bandwidth is
> 	critical.
> 
>      2) Independent services for which dedicated compute resources
>         have been purchased or allocated, in units of one or more
> 	CPUs and Memory Nodes, such as a web server and a DBMS
> 	sharing a large system, but staying out of each others way.
> 
>     The essential new construct of cpusets is the set of dedicated
>     compute resources - some processors and memory.  These sets have
>     names, permissions, an exclusion property, and can be subdivided
>     into subsets.
> 
>     The cpuset file system models a hierarchy of 'virtual computers',
>     which hierarchy will be deeper on larger systems.
> 
>     The average lifespan of a cpuset used for (1) above is probably
>     between hours and days, based on the job lifespan, though a couple
>     of system cpusets will remain in place as long as the system is
>     running.  The cpusets in (2) above might have a longer lifespan;
>     you'd have to ask Simon Derr of Bull about that.
> 

Now, even that is not a very good end-user requirement because it does
prejudge the way in which the requirement's solution should be implemented.
 Users don't require that their NUMA machines "model a hierarchy of
'virtual computers'".  Users require that their NUMA machines implement
some particular behaviour for their work mix.  What is that behaviour?

For example, I am unable to determine from the above whether the users
would be 90% satisfied with some close-enough ruleset which was implemented
with even the existing CKRM cpu and memory governors.

So anyway, I want to reopen this discussion, and throw a huge spanner in
your works, sorry.

I would ask the CKRM team to tell us whether there has been any progress in
this area, whether they feel that they have a good understanding of the end
user requirement, and to sketch out a design with which CKRM could satisfy
that requirement.

Thanks.

^ permalink raw reply	[flat|nested] 233+ messages in thread

* Re: [Lse-tech] [PATCH] cpusets - big numa cpu and memory placement
  2004-10-01 23:41               ` Andrew Morton
@ 2004-10-02  6:06                 ` Paul Jackson
  2004-10-02 14:55                   ` Dipankar Sarma
  2004-10-03 20:21                   ` Erich Focht
  2004-10-02 15:46                 ` [ckrm-tech] " Marc E. Fiuczynski
  1 sibling, 2 replies; 233+ messages in thread
From: Paul Jackson @ 2004-10-02  6:06 UTC (permalink / raw)
  To: Andrew Morton
  Cc: nagar, ckrm-tech, efocht, mbligh, lse-tech, hch, steiner,
	jbarnes, sylvain.jeaugey, djh, linux-kernel, colpatch,
	Simon.Derr, ak, sivanich, efocht

[Adding Erich Focht <efocht@hpce.nec.com>]

Are cpusets a special case of CKRM?

Andrew raises (again) the question - can CKRM meet the needs
which cpusets is trying to meet, enabling CKRM to subsume
cpusets?

  Step 1 - Why cpusets?
  Step 2 - Can CKRM do that?

Basically - cpusets implements dynamic soft partitioning to 
provide jobs with sets of isolated CPUs and Memory Nodes.

The following begins Step 1, describing who has or is expected
to use cpusets, and what I understand of their requirements
for cpusets.

Cpuset Users
============

The users of cpusets want to run jobs in relative isolation, by
dividing the system into dynamically adjustable (w/o rebooting)
subsets of compute resources (dedicated CPUs and Memory Nodes),
and run one or sometimes several jobs within a given subset.

Many such users, if they push this model far enough, tend toward
using a batch manager, aka workload manager, such as OpenPBS
or LSF.

So the actual people who scream (gently) at me the most if I
miss something in cpusets for SGI are (or have been, on 2.4
kernels and/or Irix):

  1) The PBS and LSF folks porting their workload
     managers on top of cpusets, and

  2) the SGI support engineers supporting customers
     of our biggest configurations running high value
     HPC applications.
     
  3) the folks on various graphics, storage and soft-realtime
     projects who use cpusets to obtain dedicated or precisely
     placed compute resources.

The other declared potential users of cpusets, Bull and NEC at
least, seem from what I can tell to have a somewhat different
focus, toward providing a mix of compute services with minimum
interference, from what I'd guess are more departmental size
systems.

Bull (Simon) and NEC (Erich) should also look closely at CKRM,
and then try to describe their requirements, so we can understand
whether CKRM, cpusets or both or neither can meet their needs.

If I've forgotten any other likely users of cpusets who are
lurking out there, I hope they will speak up and describe how
they expect to use cpusets, what they require, and whether
they find that CKRM would also meet their needs, or why not.

I will try to work with the folks in PBS and LSF a bit, to see
if I can get a simple statement of their essential needs that
would be useful to the CKRM folks.  I'll begin taking a stab
at it, below.

CKRM folks - what would be the best presentation of CKRM that
I could point the PBS/LSF folks at?

  It's usually easier for users to determine if something will
  meet their needs if they can see and understand it.  Trying to
  do requirements analysis to drive design choices with no
  feedback loop is crazy.
  
    They'll know it when they see it, not a day sooner ;)
 
  If some essential capability is missing, they might not
  articulate that capability at all, until someone tries to
  push a "solution" on them that is missing that capability.

Cpuset Requirements
===================

The three primary requirements that the SGI support engineers
on our biggest configurations keep telling me are most important
are:
  1) isolation,
  2) isolation, and
  3) isolation.  
A big HPC job running on a dedicated set of CPUs and Memory Nodes
should not lose any CPU cycles or Memory pages to outsiders.

Both the batch managers and the HPC shops need to be able to
guarantee exclusive use of some set of CPUS and Memory to a job.

The batch managers need to be able to efficiently list
the process IDs of all tasks currently attached to a set.
By default, set membership should be inherited across fork and
exec, but batch managers need to be able to move tasks between
sets without regard to the process creation hierarchy.
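
For illustration, here is a minimal user level sketch of both operations -
attaching a pid to a set and listing the pids attached to it - assuming a
per-cpuset 'tasks' file along the lines of the implementation in Andrew's
tree (the file name and layout are assumptions of this sketch, not a
statement of the actual interface):

#include <stdio.h>
#include <stdlib.h>
#include <sys/types.h>

/* Attach 'pid' to the cpuset at 'path' by writing it to the tasks file. */
static int cpuset_attach(const char *path, pid_t pid)
{
	char file[512];
	FILE *f;

	snprintf(file, sizeof(file), "%s/tasks", path);
	f = fopen(file, "w");
	if (!f)
		return -1;
	fprintf(f, "%d\n", (int)pid);
	return fclose(f);
}

/* Print every pid currently attached to the cpuset at 'path'. */
static int cpuset_list_tasks(const char *path)
{
	char file[512], line[64];
	FILE *f;

	snprintf(file, sizeof(file), "%s/tasks", path);
	f = fopen(file, "r");
	if (!f)
		return -1;
	while (fgets(line, sizeof(line), f))
		fputs(line, stdout);
	return fclose(f);
}

int main(int argc, char **argv)
{
	/* e.g.:  ./attach /dev/cpuset/batch 1234 */
	if (argc != 3)
		return 1;
	if (cpuset_attach(argv[1], (pid_t)atoi(argv[2])) != 0)
		return 1;
	return cpuset_list_tasks(argv[1]) ? 1 : 0;
}

Note that moving a task this way does not depend on the process creation
hierarchy - any pid the caller has permission for can be written.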

A job running in a cpuset should be able to use various configuration,
resource management (CKRM for example), cpu and memory (numa) affinity
tools, performance analysis and thread management facilities within a
set, including pthreads and MPI, independently from what is happening
on the rest of the system.

One should be able to run a stock 3rd party app (Oracle is
the canonical example) on a system side-by-side with a special
customer app, each in their own set, neither interfering with
the other, and the Oracle folks happy that their app is running
in a supported environment.

And of course, a cpuset needs to be able to be setup and torn
down without impacting the rest of the system, and then its
CPU and Memory resources put back in the free pool, to be
reallocated in different configurations for other cpusets.
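
As an illustration of that lifecycle, a rough user level sketch, assuming
the cpuset file system is mounted at /dev/cpuset, the per-set control files
are named 'cpus' and 'mems', and they accept range lists such as "4-7"
(all of which are assumptions of this sketch, not a statement of the final
interface):

#include <stdio.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <unistd.h>

/* Write one value into a cpuset control file. */
static int write_file(const char *file, const char *val)
{
	FILE *f = fopen(file, "w");

	if (!f)
		return -1;
	fprintf(f, "%s\n", val);
	return fclose(f);
}

int main(void)
{
	/* Carve CPUs 4-7 and Memory Node 1 out of the free pool. */
	if (mkdir("/dev/cpuset/hpcjob", 0755) != 0)
		return 1;
	write_file("/dev/cpuset/hpcjob/cpus", "4-7");
	write_file("/dev/cpuset/hpcjob/mems", "1");

	/* ... attach the job's tasks and let it run ... */

	/* Tear down: once empty, the CPUs and Memory return to the pool. */
	return rmdir("/dev/cpuset/hpcjob") ? 1 : 0;
}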

The batch or workload manager folks want to be able to hibernate
and migrate jobs, so that they can move long running jobs around
to get higher priority jobs through, and so that they can sensibly
overcommit without thrashing.  And they want to be able to
add and remove CPU and Memory resources to an existing cpuset,
which might appear to jobs currently executing within that
cpuset as resources going on and offline.

The HPC apps folks need to control some kernel memory
allocations, swapping, classic Unix daemons and kernel threads
along cpuset lines as well.  When the kernel page cache is
many times larger than the memory on a single node, leaving
placement up to willy-nilly kernel decisions can totally blow
out a node's memory, which is deadly to the performance of
the job using that node.  Similarly, one job can interfere
with another if it abuses the swapper.  Kernel threads that
don't require specific placement, as well as the classic Unix
daemons, need to be kept off the CPUs and Memory Nodes
used for the main applications, typically by confining them to
their own small cpuset.
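
A sketch of that "small cpuset for the daemons" idea, again at user level
and with the same assumed file names as above; a task that cannot be moved
(e.g. a pinned kernel thread) simply fails the write, which is fine for
this purpose:

#include <ctype.h>
#include <dirent.h>
#include <stdio.h>
#include <sys/stat.h>
#include <sys/types.h>

/* Write one value into a cpuset control file. */
static int write_file(const char *file, const char *val)
{
	FILE *f = fopen(file, "w");

	if (!f)
		return -1;
	fprintf(f, "%s\n", val);
	return fclose(f);
}

int main(void)
{
	DIR *proc;
	struct dirent *de;

	/* Confine the boot-time daemons to CPU 0 / Memory Node 0. */
	mkdir("/dev/cpuset/boot", 0755);
	write_file("/dev/cpuset/boot/cpus", "0");
	write_file("/dev/cpuset/boot/mems", "0");

	proc = opendir("/proc");
	if (!proc)
		return 1;
	while ((de = readdir(proc)) != NULL) {
		if (!isdigit((unsigned char)de->d_name[0]))
			continue;
		/* One write per pid; unmovable tasks just fail the write. */
		write_file("/dev/cpuset/boot/tasks", de->d_name);
	}
	closedir(proc);
	return 0;
}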

The graphics, realtime and storage folks in particular need
to place their cpusets on very specific CPUs and Memory Nodes
near some piece of hardware of interest to them.  The pool
of CPUs and Memory Nodes is not homogeneous to these folks.
If not all CPUs are the same speed, or not all Memory Nodes
the same size, then CPUs and Memory Nodes are not homogeneous
to the HPC folks either.  And in any case, big numa machines
have complex bus topologies, which the system admins or batch
managers have to take into account when deciding which CPUs
and Memory Nodes to put together into a cpuset.

There must not be any presumption that composition of cpusets
is done on a per-node basis, with all the CPUs and Memory on
a node being the unit of allocation.  While this is often the case,
sometimes other combinations of CPUs and Memory Nodes are needed,
not along node boundaries.

For the larger configurations, I am beginning to see requests
for hierarchical "soft partitions" reflecting typically the
complex corporate or government organization that purchased
the big system, and needs to share it amongst different,
semi-uncooperative groups and subgroups.  I anticipate that
SGI will see more of this over the next few years, but I will
(reluctantly) admit that a hierarchy of some fixed depth of
two or three could meet the current needs as I hear them.

Even the flat-model (no hierarchy) uses require some way to
name and control access to cpusets, with distinct permissions
for examining, attaching to, and changing them, that can be
used and managed on a system-wide basis.

At least Bull has a requirement to automatically remove a
cpuset when the last user of it exits - which the current
implementation in Andrew's tree provides by calling out to a
user level program on the last release.  User level code can
handle the actual removal.
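
For illustration, that user level helper can be as small as the following;
the convention assumed here (the released cpuset's path, relative to the
mount point, passed as argv[1]) is just an assumption of the sketch, not a
description of the actual callout interface:

#include <stdio.h>
#include <unistd.h>

int main(int argc, char **argv)
{
	char path[512];

	if (argc != 2)
		return 1;
	/* e.g. invoked with "/hpcjob" when that cpuset's last user exits. */
	snprintf(path, sizeof(path), "/dev/cpuset%s", argv[1]);
	if (rmdir(path) != 0) {
		perror(path);
		return 1;
	}
	return 0;
}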

Bull also has a requirement for the kernel to provide
cpuset-relative numbering of CPUs and Memory Nodes to some
applications, so that they can be run oblivious to the fact
that they don't own the entire machine.  This requirement is
not satisfied by the current implementation in Andrew's tree -
Simon has a separate patch for that.

Cpusets needs to be able to interoperate with hotplug, which
can be a bit of a challenge, given the tendency of cpuset code
to stash its own view of the current system CPU/Memory
configuration.

The essential implementation hooks required by cpusets follow from
their essential purpose.  Cpusets control on which CPUs a task may
be scheduled, and on which Memory Nodes it may allocate memory. 
Therefore hooks are required in the scheduler and allocator, which
constrain scheduling and allocation to only use the allowed CPUs
and Memory Nodes.
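
A toy user space model of those two checks (the names and types below are
invented for illustration; the kernel side works with cpumask_t/nodemask_t
and real task state, not plain words):

#include <stdio.h>

/* Stand-in for the relevant task state: one bit per CPU / Memory Node. */
struct task_model {
	unsigned long cpus_allowed;	/* where the scheduler may run it    */
	unsigned long mems_allowed;	/* where the allocator may get pages */
};

/* Scheduler-side check: may this task run on 'cpu'? */
static int cpu_allowed(const struct task_model *t, int cpu)
{
	return (t->cpus_allowed >> cpu) & 1;
}

/* Allocator-side check: may this task take pages from 'node'? */
static int node_allowed(const struct task_model *t, int node)
{
	return (t->mems_allowed >> node) & 1;
}

int main(void)
{
	/* A job confined to CPUs 4-7 and Memory Node 1. */
	struct task_model t = { .cpus_allowed = 0xf0UL, .mems_allowed = 0x2UL };

	printf("cpu 2: %d  cpu 5: %d\n", cpu_allowed(&t, 2), cpu_allowed(&t, 5));
	printf("node 0: %d  node 1: %d\n", node_allowed(&t, 0), node_allowed(&t, 1));
	return 0;
}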

-- 
                          I won't rest till it's the best ...
                          Programmer, Linux Scalability
                          Paul Jackson <pj@sgi.com> 1.650.933.1373

^ permalink raw reply	[flat|nested] 233+ messages in thread

* Re: [Lse-tech] [PATCH] cpusets - big numa cpu and memory placement
  2004-10-02  6:06                 ` Paul Jackson
@ 2004-10-02 14:55                   ` Dipankar Sarma
  2004-10-02 16:14                     ` Hubertus Franke
  2004-10-03  3:35                     ` Paul Jackson
  2004-10-03 20:21                   ` Erich Focht
  1 sibling, 2 replies; 233+ messages in thread
From: Dipankar Sarma @ 2004-10-02 14:55 UTC (permalink / raw)
  To: Paul Jackson
  Cc: Andrew Morton, nagar, ckrm-tech, efocht, mbligh, lse-tech, hch,
	steiner, jbarnes, sylvain.jeaugey, djh, linux-kernel, colpatch,
	Simon.Derr, ak, sivanich

On Fri, Oct 01, 2004 at 11:06:44PM -0700, Paul Jackson wrote:
> Cpuset Requirements
> ===================
> 
> The three primary requirements that the SGI support engineers
> on our biggest configurations keep telling me are most important
> are:
>   1) isolation,
>   2) isolation, and
>   3) isolation.  
> A big HPC job running on a dedicated set of CPUs and Memory Nodes
> should not lose any CPU cycles or Memory pages to outsiders.
> 
....

> 
> A job running in a cpuset should be able to use various configuration,
> resource management (CKRM for example), cpu and memory (numa) affinity
> tools, performance analysis and thread management facilities within a
> set, including pthreads and MPI, independently from what is happening
> on the rest of the system.
> 
> One should be able to run a stock 3rd party app (Oracle is
> the canonical example) on a system side-by-side with a special
> customer app, each in their own set, neither interfering with
> the other, and the Oracle folks happy that their app is running
> in a supported environment.

One of the things we are working on is to provide exactly something
like this. Not just that, within the isolated partitions, we want
to be able to provide a completely different environment. For example,
we need to be able to run one or more realtime processes of an application
in one partition while the other partition runs the database portion
of the application. For this to succeed, they need to be completely
isolated.

It would be nice if someone explains a potential CKRM implementation for 
this kind of complete isolation.

Thanks
Dipankar

^ permalink raw reply	[flat|nested] 233+ messages in thread

* RE: [ckrm-tech] Re: [Lse-tech] [PATCH] cpusets - big numa cpu and memory placement
  2004-10-01 23:41               ` Andrew Morton
  2004-10-02  6:06                 ` Paul Jackson
@ 2004-10-02 15:46                 ` Marc E. Fiuczynski
  2004-10-02 16:17                   ` Hubertus Franke
  2004-10-02 17:47                   ` Paul Jackson
  1 sibling, 2 replies; 233+ messages in thread
From: Marc E. Fiuczynski @ 2004-10-02 15:46 UTC (permalink / raw)
  To: Andrew Morton, Shailabh Nagar, ckrm-tech
  Cc: pj, efocht, mbligh, lse-tech, hch, steiner, jbarnes,
	sylvain.jeaugey, djh, linux-kernel, colpatch, Simon.Derr, ak,
	sivanich, Larry Peterson

Paul & Andrew,

For PlanetLab (www.planet-lab.org) we also care very much about isolation
between different users.  Maybe not to the same degree as your users.
Nonetheless, penning in resource hogs is very important to us.  We are
giving CKRM a shot.  Over the past two weeks I have worked with Hubertus,
Chandra, and Shailabh to iron out various bugs.  The controllers appear to be
working to a first approximation.  From our perspective, it is not so much the
specific resource controllers but the CKRM framework that is of importance.
I.e., we certainly plan to test and implement other resource controllers for
CPU, disk I/O and memory isolation.

For cpu isolation, would it suffice to use an HTB-based cpu scheduler?  This
is essentially what the XEN folks are using to ensure strong isolation
between separate Xen domains.  An implementation of such a scheduler exists
as part of the linux-vserver project and the port of that to CKRM should be
straightforward.  In fact, I am thinking of doing such a port for PlanetLab
just to have an alternative to the existing CKRM cpu controller. Seems like
an implementation of that scheduler (or a modification to the existing CKRM
controller) + some support for CPU affinity + hotplug CPU support might
approach your cpuset solution. Correct me if I completely missed it.

For memory isolation, I am not sufficiently familiar with NUMA style
machines to comment on this topic.  The CKRM memory controller is
interesting, but we have not used it sufficiently to comment.

Finally, in terms of isolation, we have mixed together CKRM with VSERVERs,
using CKRM for performance isolation and Vserver (for lack of a better
name) for "view" isolation.  Maybe your users care about the vserver style of
isolation.  We have an anon cvs server with our kernel (which is based on
Fedora Core 2 1.521 + vserver 1.9.2 + the latest ckrm e16 framework and
resource controllers that are not even available yet at ckrm.sf.net), which
you are welcome to play with.

Best regards,
Marc

-----------
Marc E. Fiuczynski
PlanetLab Consortium --- OS Taskforce PM
Princeton University --- Research Scholar
http://www.cs.princeton.edu/~mef

> -----Original Message-----
> From: ckrm-tech-admin@lists.sourceforge.net
> [mailto:ckrm-tech-admin@lists.sourceforge.net]On Behalf Of Andrew Morton
> Sent: Friday, October 01, 2004 7:41 PM
> To: Shailabh Nagar; ckrm-tech@lists.sourceforge.net
> Cc: pj@sgi.com; efocht@hpce.nec.com; mbligh@aracnet.com;
> lse-tech@lists.sourceforge.net; hch@infradead.org; steiner@sgi.com;
> jbarnes@sgi.com; sylvain.jeaugey@bull.net; djh@sgi.com;
> linux-kernel@vger.kernel.org; colpatch@us.ibm.com; Simon.Derr@bull.net;
> ak@suse.de; sivanich@sgi.com
> Subject: [ckrm-tech] Re: [Lse-tech] [PATCH] cpusets - big numa cpu and
> memory placement
>
>
>
> Paul, I'm having second thoughts regarding a cpusets merge.  Having gone
> back and re-read the cpusets-vs-CKRM thread from mid-August, I am quite
> unconvinced that we should proceed with two orthogonal resource
> management/partitioning schemes.
>
> And CKRM is much more general than the cpu/memsets code, and hence it
> should be possible to realize your end-users requirements using an
> appropriately modified CKRM, and a suitable controller.
>
> I'd view the difficulty of implementing this as a test of the wisdom of
> CKRM's design, actually.
>
> The clearest statement of the end-user cpu and memory partitioning
> requirement is this, from Paul:
>
> > Cpusets - Static Isolation:
> >
> >     The essential purpose of cpusets is to support isolating large,
> >     long-running, multinode compute bound HPC (high performance
> >     computing) applications or relatively independent service jobs,
> >     on dedicated sets of processor and memory nodes.
> >
> >     The (unobtainable) ideal of cpusets is to provide perfect
> >     isolation, for such jobs as:
> >
> >      1) Massive compute jobs that might run hours or days, on dozens
> > 	or hundreds of processors, consuming gigabytes or terabytes
> > 	of main memory.  These jobs are often highly parallel, and
> > 	carefully sized and placed to obtain maximum performance
> > 	on NUMA hardware, where memory placement and bandwidth is
> > 	critical.
> >
> >      2) Independent services for which dedicated compute resources
> >         have been purchased or allocated, in units of one or more
> > 	CPUs and Memory Nodes, such as a web server and a DBMS
> > 	sharing a large system, but staying out of each others way.
> >
> >     The essential new construct of cpusets is the set of dedicated
> >     compute resources - some processors and memory.  These sets have
> >     names, permissions, an exclusion property, and can be subdivided
> >     into subsets.
> >
> >     The cpuset file system models a hierarchy of 'virtual computers',
> >     which hierarchy will be deeper on larger systems.
> >
> >     The average lifespan of a cpuset used for (1) above is probably
> >     between hours and days, based on the job lifespan, though a couple
> >     of system cpusets will remain in place as long as the system is
> >     running.  The cpusets in (2) above might have a longer lifespan;
> >     you'd have to ask Simon Derr of Bull about that.
> >
>
> Now, even that is not a very good end-user requirement because it does
> prejudge the way in which the requirement's solution should be
> implemented.
>  Users don't require that their NUMA machines "model a hierarchy of
> 'virtual computers'".  Users require that their NUMA machines implement
> some particular behaviour for their work mix.  What is that behaviour?
>
> For example, I am unable to determine from the above whether the users
> would be 90% satisfied with some close-enough ruleset which was
> implemented
> with even the existing CKRM cpu and memory governors.
>
> So anyway, I want to reopen this discussion, and throw a huge spanner in
> your works, sorry.
>
> I would ask the CKRM team to tell us whether there has been any
> progress in
> this area, whether they feel that they have a good understanding
> of the end
> user requirement, and to sketch out a design with which CKRM could satisfy
> that requirement.
>
> Thanks.


^ permalink raw reply	[flat|nested] 233+ messages in thread

* Re: [Lse-tech] [PATCH] cpusets - big numa cpu and memory placement
  2004-10-02 14:55                   ` Dipankar Sarma
@ 2004-10-02 16:14                     ` Hubertus Franke
  2004-10-02 18:04                       ` Paul Jackson
  2004-10-02 23:21                       ` Peter Williams
  2004-10-03  3:35                     ` Paul Jackson
  1 sibling, 2 replies; 233+ messages in thread
From: Hubertus Franke @ 2004-10-02 16:14 UTC (permalink / raw)
  To: dipankar
  Cc: Paul Jackson, Andrew Morton, ckrm-tech, efocht, mbligh, lse-tech,
	hch, steiner, jbarnes, sylvain.jeaugey, djh, linux-kernel,
	colpatch, Simon.Derr, ak, sivanich


OK, let me respond to this (again...) from the perspective of cpus.
This should to some extent also cover Andrew's request as well as
Paul's earlier message.

I see cpumem sets to be orthogonal to CKRM cpu share allocations.
AGAIN.
I see cpumem sets to be orthogonal to CKRM cpu share allocations.

In its essence, "cpumem sets" is a hierarchical mechanism of successively
tighter constraints on the affinity mask of tasks.

The O(1) scheduler today does not know about cpumem sets.  It operates
on the level of affinity masks, adhering to the constraints specified
by the cpu masks.
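
To illustrate "successively tighter constraints" (names invented, and plain
unsigned longs standing in for cpumask_t): the effective mask a task ends up
with is the intersection of its set's CPUs with those of every ancestor, and
that ordinary affinity mask is all the O(1) scheduler ever sees:

#include <stdio.h>

struct set_model {
	struct set_model *parent;
	unsigned long cpus;		/* CPUs granted to this set */
};

/* Effective mask: this set's CPUs intersected with every ancestor's. */
static unsigned long effective_cpus(const struct set_model *s)
{
	unsigned long mask = s->cpus;

	for (s = s->parent; s; s = s->parent)
		mask &= s->cpus;
	return mask;
}

int main(void)
{
	struct set_model root  = { NULL,   0xffffUL };	/* 16 CPUs  */
	struct set_model batch = { &root,  0x00f0UL };	/* CPUs 4-7 */
	struct set_model job   = { &batch, 0x0030UL };	/* CPUs 4-5 */

	/* This is what would land in the member tasks' affinity masks. */
	printf("job effective mask: %#lx\n", effective_cpus(&job));
	return 0;
}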

The CKRM cpu scheduler also adheres to affinity mask constraints and 
frankly does not care how they are set.

So I do not see what the problem will be at the scheduler level.
If you want system isolation you deploy cpumem sets.  If you want
overall share enforcement you choose ckrm classes.
In addition you can use both, with the understanding that cpumem sets
cannot and will not be violated, even if that means that shares are
not maintained.

Since you want orthogonality, cpumem sets could be implemented as a
different "classtype". They would not belong to the taskclass and thus 
are independent from what we consider the task class.



The tricky stuff comes in from the fact that CKRM assumes a system wide 
definition of a class and a system wide "calculation" of shares.






Dipankar Sarma wrote:
> On Fri, Oct 01, 2004 at 11:06:44PM -0700, Paul Jackson wrote:
> 
>>Cpuset Requirements
>>===================
>>
>>The three primary requirements that the SGI support engineers
>>on our biggest configurations keep telling me are most important
>>are:
>>  1) isolation,
>>  2) isolation, and
>>  3) isolation.  
>>A big HPC job running on a dedicated set of CPUs and Memory Nodes
>>should not lose any CPU cycles or Memory pages to outsiders.
>>
> 
> ....
> 
> 
>>A job running in a cpuset should be able to use various configuration,
>>resource management (CKRM for example), cpu and memory (numa) affinity
>>tools, performance analysis and thread management facilities within a
>>set, including pthreads and MPI, independently from what is happening
>>on the rest of the system.
>>
>>One should be able to run a stock 3rd party app (Oracle is
>>the canonical example) on a system side-by-side with a special
>>customer app, each in their own set, neither interfering with
>>the other, and the Oracle folks happy that their app is running
>>in a supported environment.
> 
> 
> One of the things we are working on is to provide exactly something
> like this. Not just that, within the isolated partitions, we want
> to be able to provide a completely different environment. For example,
> we need to be able to run one or more realtime processes of an application
> in one partition while the other partition runs the database portion
> of the application. For this to succeed, they need to be completely
> isolated.
> 
> It would be nice if someone explains a potential CKRM implementation for 
> this kind of complete isolation.

Alternatively to what is described above, if you want to do cpumemsets 
purely through the current implementation, I'd approach it as follows:

- Start with the current cpumemset implementation.
- Write the CKRM controller that simply replaces the API of the
   cpumemset.
- Now you have the object hierarchy through /rcfs/taskclass
- Change the memsets through the generic attributes (discussed in
   earlier emails to extend the static fixed shares notation)
- DO NOT USE CPU shares (always specify DONTCARE).

I am not saying that this is the most elegant solution, but neither
is trying to achieve proportional shares through cpumemsets.


> 
> Thanks
> Dipankar
> 

Hope this helps.



^ permalink raw reply	[flat|nested] 233+ messages in thread

* Re: [ckrm-tech] Re: [Lse-tech] [PATCH] cpusets - big numa cpu and memory placement
  2004-10-02 15:46                 ` [ckrm-tech] " Marc E. Fiuczynski
@ 2004-10-02 16:17                   ` Hubertus Franke
  2004-10-02 17:53                     ` Paul Jackson
  2004-10-02 20:40                     ` Andrew Morton
  2004-10-02 17:47                   ` Paul Jackson
  1 sibling, 2 replies; 233+ messages in thread
From: Hubertus Franke @ 2004-10-02 16:17 UTC (permalink / raw)
  To: Marc E. Fiuczynski
  Cc: Andrew Morton, Shailabh Nagar, ckrm-tech, pj, efocht, mbligh,
	lse-tech, hch, steiner, jbarnes, sylvain.jeaugey, djh,
	linux-kernel, colpatch, Simon.Derr, ak, sivanich, Larry Peterson



Marc E. Fiuczynski wrote:

> Paul & Andrew,
> 
> For PlanetLab (www.planet-lab.org) we also care very much about isolation
> between different users.  Maybe not to the same degree as your users.
> Nonetheless, penning in resource hogs is very important to us.  We are
> giving CKRM a shot.  Over the past two weeks I have worked with Hubertus,
> Chandra, and Shailabh to iron out various bugs.  The controllers appear to be
> working to a first approximation.  From our perspective, it is not so much the
> specific resource controllers but the CKRM framework that is of importance.
> I.e., we certainly plan to test and implement other resource controllers for
> CPU, disk I/O and memory isolation.
> 
> For cpu isolation, would it suffice to use an HTB-based cpu scheduler?  This
> is essentially what the XEN folks are using to ensure strong isolation
> between separate Xen domains.  An implementation of such a scheduler exists
> as part of the linux-vserver project and the port of that to CKRM should be
> straightforward.  In fact, I am thinking of doing such a port for PlanetLab
> just to have an alternative to the existing CKRM cpu controller. Seems like
> an implementation of that scheduler (or a modification to the existing CKRM
> controller) + some support for CPU affinity + hotplug CPU support might
> approach your cpuset solution. Correct me if I completely missed it.

Marc, cpusets lead to physical isolation.

> 
> For memory isolation, I am not sufficiently familiar with NUMA style
> machines to comment on this topic.  The CKRM memory controller is
> interesting, but we have not used it sufficiently to comment.
> 
> Finally, in terms of isolation, we have mixed together CKRM with VSERVERs,
> using CKRM for performance isolation and Vserver (for lack of a better
> name) for "view" isolation.  Maybe your users care about the vserver style of
> isolation.  We have an anon cvs server with our kernel (which is based on
> Fedora Core 2 1.521 + vserver 1.9.2 + the latest ckrm e16 framework and
> resource controllers that are not even available yet at ckrm.sf.net), which
> you are welcome to play with.
> 
> Best regards,
> Marc
> 
> -----------
> Marc E. Fiuczynski
> PlanetLab Consortium --- OS Taskforce PM
> Princeton University --- Research Scholar
> http://www.cs.princeton.edu/~mef
> 
> 
>>-----Original Message-----
>>From: ckrm-tech-admin@lists.sourceforge.net
>>[mailto:ckrm-tech-admin@lists.sourceforge.net]On Behalf Of Andrew Morton
>>Sent: Friday, October 01, 2004 7:41 PM
>>To: Shailabh Nagar; ckrm-tech@lists.sourceforge.net
>>Cc: pj@sgi.com; efocht@hpce.nec.com; mbligh@aracnet.com;
>>lse-tech@lists.sourceforge.net; hch@infradead.org; steiner@sgi.com;
>>jbarnes@sgi.com; sylvain.jeaugey@bull.net; djh@sgi.com;
>>linux-kernel@vger.kernel.org; colpatch@us.ibm.com; Simon.Derr@bull.net;
>>ak@suse.de; sivanich@sgi.com
>>Subject: [ckrm-tech] Re: [Lse-tech] [PATCH] cpusets - big numa cpu and
>>memory placement
>>
>>
>>
>>Paul, I'm having second thoughts regarding a cpusets merge.  Having gone
>>back and re-read the cpusets-vs-CKRM thread from mid-August, I am quite
>>unconvinced that we should proceed with two orthogonal resource
>>management/partitioning schemes.
>>
>>And CKRM is much more general than the cpu/memsets code, and hence it
>>should be possible to realize your end-users requirements using an
>>appropriately modified CKRM, and a suitable controller.
>>
>>I'd view the difficulty of implementing this as a test of the wisdom of
>>CKRM's design, actually.
>>
>>The clearest statement of the end-user cpu and memory partitioning
>>requirement is this, from Paul:
>>
>>
>>>Cpusets - Static Isolation:
>>>
>>>    The essential purpose of cpusets is to support isolating large,
>>>    long-running, multinode compute bound HPC (high performance
>>>    computing) applications or relatively independent service jobs,
>>>    on dedicated sets of processor and memory nodes.
>>>
>>>    The (unobtainable) ideal of cpusets is to provide perfect
>>>    isolation, for such jobs as:
>>>
>>>     1) Massive compute jobs that might run hours or days, on dozens
>>>	or hundreds of processors, consuming gigabytes or terabytes
>>>	of main memory.  These jobs are often highly parallel, and
>>>	carefully sized and placed to obtain maximum performance
>>>	on NUMA hardware, where memory placement and bandwidth is
>>>	critical.
>>>
>>>     2) Independent services for which dedicated compute resources
>>>        have been purchased or allocated, in units of one or more
>>>	CPUs and Memory Nodes, such as a web server and a DBMS
>>>	sharing a large system, but staying out of each others way.
>>>
>>>    The essential new construct of cpusets is the set of dedicated
>>>    compute resources - some processors and memory.  These sets have
>>>    names, permissions, an exclusion property, and can be subdivided
>>>    into subsets.
>>>
>>>    The cpuset file system models a hierarchy of 'virtual computers',
>>>    which hierarchy will be deeper on larger systems.
>>>
>>>    The average lifespan of a cpuset used for (1) above is probably
>>>    between hours and days, based on the job lifespan, though a couple
>>>    of system cpusets will remain in place as long as the system is
>>>    running.  The cpusets in (2) above might have a longer lifespan;
>>>    you'd have to ask Simon Derr of Bull about that.
>>>
>>
>>Now, even that is not a very good end-user requirement because it does
>>prejudge the way in which the requirement's solution should be
>>implemented.
>> Users don't require that their NUMA machines "model a hierarchy of
>>'virtual computers'".  Users require that their NUMA machines implement
>>some particular behaviour for their work mix.  What is that behaviour?
>>
>>For example, I am unable to determine from the above whether the users
>>would be 90% satisfied with some close-enough ruleset which was
>>implemented
>>with even the existing CKRM cpu and memory governors.
>>
>>So anyway, I want to reopen this discussion, and throw a huge spanner in
>>your works, sorry.
>>
>>I would ask the CKRM team to tell us whether there has been any
>>progress in
>>this area, whether they feel that they have a good understanding
>>of the end
>>user requirement, and to sketch out a design with which CKRM could satisfy
>>that requirement.
>>
>>Thanks.


^ permalink raw reply	[flat|nested] 233+ messages in thread

* Re: [ckrm-tech] Re: [Lse-tech] [PATCH] cpusets - big numa cpu and memory placement
  2004-10-02 15:46                 ` [ckrm-tech] " Marc E. Fiuczynski
  2004-10-02 16:17                   ` Hubertus Franke
@ 2004-10-02 17:47                   ` Paul Jackson
  1 sibling, 0 replies; 233+ messages in thread
From: Paul Jackson @ 2004-10-02 17:47 UTC (permalink / raw)
  To: Marc E. Fiuczynski
  Cc: akpm, nagar, ckrm-tech, efocht, mbligh, lse-tech, hch, steiner,
	jbarnes, sylvain.jeaugey, djh, linux-kernel, colpatch,
	Simon.Derr, ak, sivanich, llp

Marc writes:
>
> For PlanetLab (www.planet-lab.org) we also care very much about isolation
> between different users.  Maybe not to the same degree as your users.
> Nonetheless, penning in resource hogs is very important to us. 

Thank-you for your report, Marc.

Before I look at code, I think we could do with a little more
discussion of usage patterns and requirements.

Despite my joke about "1) isolation, 2) isolation, and 3) isolation"
being the most important requirements on cpusets, there are further
requirements presented by typical cpuset users, which I tried to spell
out in my previous post.

Could you do a couple more things to further help this discussion:

 1) I know nothing at this moment of what PlanetLab is or what
    they do.  Could you describe this a bit - your business, your
    customers' usage patterns and how these make use of CKRM?  Perhaps
    a couple of web links will help here.  I will also do a Google
    search now, in an effort to become more educated on PlanetLab.

    I might come away from this thinking one of:

	a. Dang - that sounds a lot like what my cpuset users are
	   doing.  If CKRM meets PlanetLab's needs, it might meet
	   my users needs too.  I should put aside my skepticism
	   and approach Andrew's proposal to have CKRM supplant
	   cpusets with a more open mind than (I will confess)
	   I have now.

	b. No, no - that's something different.  PlanetLab doesn't
	   have the particular requirements x, y and z that my cpuset
	   users do.  Rather they have other requirements, a, b and
	   c, that seem to fit my understanding of CKRM well, but
	   not cpusets.

 2) I made some effort to present the usage patterns and
    requirements of cpuset users in my post.  Could you read
    it and comment on the requirements I presented?

    I'd be interested to know, for each cpuset requirement I
    presented, which of the following multiple choices applies
    in your case:

	a. Huh - I (Marc) don't understand what you (pj) are
           saying here well enough to comment further.

	b. Yes - this sounds just like something PlanetLab needs,
	   perhaps rephrasing the requirement in terms more familiar
	   to you.  And CKRM meets this requirement this way ...

	c. No - this is not a big need PlanetLab has of its resource
	   management technology (perhaps noting in this case,
	   whether, in your understanding of CKRM, CKRM addresses
	   this requirement anyway, even though you don't need it).

I encourage you to stay "down to earth" in this, at least initially.
Speak in terms familiar to you, and present the actual, practical
experience you've gained at PlanetLab.

I want to avoid the trap of premature abstraction:

    Gee - both CKRM and cpusets deal with resource management, both
    have kernel hooks in the allocators and schedulers, both have
    hierarchies and both provide isolation of some sort.  They must
    be two solutions to the same problem (or at least, since CKRM
    is obviously bigger, it must be a solution to a superset of
    the problems that cpusets addresses), and so we should pick one
    (the superset, no doubt) and drop the other to avoid duplication.

Let us begin this discussion with a solid grounding in the actual
experiences we bring to this thread.

Thank-you.

	"I'm thinking of a 4 legged, long tailed, warm blooded
	creature, commonly associated with milk, that makes a
	sound written in my language starting with the letter 'M'.
	The name of the animal is a three letter word starting
	with the letter 'C'.  We had many of them in the barn on
	my Dad's dairy farm."

	Mooo ?		[cow]

	No - meow.	[cat]

	And no, we shouldn't try to catch mice with cows, even
	if they are bigger than cats.

-- 
                          I won't rest till it's the best ...
                          Programmer, Linux Scalability
                          Paul Jackson <pj@sgi.com> 1.650.933.1373

^ permalink raw reply	[flat|nested] 233+ messages in thread

* Re: [ckrm-tech] Re: [Lse-tech] [PATCH] cpusets - big numa cpu and memory placement
  2004-10-02 16:17                   ` Hubertus Franke
@ 2004-10-02 17:53                     ` Paul Jackson
  2004-10-02 18:16                       ` Hubertus Franke
  2004-10-02 20:40                     ` Andrew Morton
  1 sibling, 1 reply; 233+ messages in thread
From: Paul Jackson @ 2004-10-02 17:53 UTC (permalink / raw)
  To: Hubertus Franke
  Cc: mef, akpm, nagar, ckrm-tech, efocht, mbligh, lse-tech, hch,
	steiner, jbarnes, sylvain.jeaugey, djh, linux-kernel, colpatch,
	Simon.Derr, ak, sivanich, llp

Hubertus wrote:
>
> Marc, cpusets lead to physical isolation.

This is slightly too terse for my dense brain to grok.
Could you elaborate just a little, Hubertus?  Thanks.

(Try to quote less - I almost missed your reply in
 the middle of all the quoted stuff.)

-- 
                          I won't rest till it's the best ...
                          Programmer, Linux Scalability
                          Paul Jackson <pj@sgi.com> 1.650.933.1373

^ permalink raw reply	[flat|nested] 233+ messages in thread

* Re: [Lse-tech] [PATCH] cpusets - big numa cpu and memory placement
  2004-10-02 16:14                     ` Hubertus Franke
@ 2004-10-02 18:04                       ` Paul Jackson
  2004-10-02 23:21                       ` Peter Williams
  1 sibling, 0 replies; 233+ messages in thread
From: Paul Jackson @ 2004-10-02 18:04 UTC (permalink / raw)
  To: Hubertus Franke
  Cc: dipankar, akpm, ckrm-tech, efocht, mbligh, lse-tech, hch,
	steiner, jbarnes, sylvain.jeaugey, djh, linux-kernel, colpatch,
	Simon.Derr, ak, sivanich

> I see cpumem sets to be orthogonal to CKRM cpu share allocations.

I agree.  Thank-you for stating that, Hubertus.

-- 
                          I won't rest till it's the best ...
                          Programmer, Linux Scalability
                          Paul Jackson <pj@sgi.com> 1.650.933.1373

^ permalink raw reply	[flat|nested] 233+ messages in thread

* Re: [ckrm-tech] Re: [Lse-tech] [PATCH] cpusets - big numa cpu and memory placement
  2004-10-02 17:53                     ` Paul Jackson
@ 2004-10-02 18:16                       ` Hubertus Franke
  2004-10-02 19:14                         ` Paul Jackson
  2004-10-02 23:29                         ` Peter Williams
  0 siblings, 2 replies; 233+ messages in thread
From: Hubertus Franke @ 2004-10-02 18:16 UTC (permalink / raw)
  To: Paul Jackson
  Cc: akpm, ckrm-tech, efocht, lse-tech, hch, steiner, jbarnes,
	sylvain.jeaugey, djh, linux-kernel, colpatch, Simon.Derr, ak,
	sivanich, llp



Paul Jackson wrote:
> Hubertus wrote:
> 
>>Marc, cpusets lead to physical isolation.
> 
> 
> This is slightly too terse for my dense brain to grok.
> Could you elaborate just a little, Hubertus?  Thanks.
> 

A minimal quote from your website :-)

"CpuMemSets provides a new Linux kernel facility that enables system 
services and applications to specify on which CPUs they may be 
scheduled, and from which nodes they may allocate memory."

Since I have addressed the cpu section, it seems obvious that
in order to ISOLATE different workloads, you associate them with
non-overlapping cpusets; thus they are technically physically isolated
from each other on the chosen CPUs.

Given that cpuset hierarchies translate into cpu-affinity masks,
this desired isolation can result in lost cycles globally.

I believe this to be orthogonal to share settings. To me both
are extremely desirable features.

I also pointed out that if you separate mechanism from API, it
is possible to move the CPU set API under the CKRM framework.
I have not thought about the memory aspect.

-- Hubertus



^ permalink raw reply	[flat|nested] 233+ messages in thread

* Re: [ckrm-tech] Re: [Lse-tech] [PATCH] cpusets - big numa cpu and memory placement
  2004-10-02 18:16                       ` Hubertus Franke
@ 2004-10-02 19:14                         ` Paul Jackson
  2004-10-02 23:29                         ` Peter Williams
  1 sibling, 0 replies; 233+ messages in thread
From: Paul Jackson @ 2004-10-02 19:14 UTC (permalink / raw)
  To: Hubertus Franke
  Cc: akpm, ckrm-tech, efocht, lse-tech, hch, steiner, jbarnes,
	sylvain.jeaugey, djh, linux-kernel, colpatch, Simon.Derr, ak,
	sivanich, llp

Hubertus wrote:
> 
> A minimal quote from your website :-)

Ok - now I see what you're saying.

Let me expound a bit on this line, from a different perspective.

While big NUMA boxes provide the largest single system image
machines currently available, they have their complications.  The bus and
cache structures and geometry are complex and multilayered.

For more modest, more homogenous systems, one can benefit from putting
CKRM controllers (I hope I'm using this term correctly here) on things
like memory pages, cpu cycles, disk i/o, and network i/o in order to
provide a fairly rich degree of control over what share of resources
each application class receives, and obtain both efficient and
controlled balance of resource usage.

But for the big NUMA configuration, running some of our customers' most
performance critical applications, one cannot achieve the desired
performance by trying to control all the layers of cache and bus, in
complex geometries, with their various interactions.

So instead one ends up using an orthogonal (thanks, Hubertus) and
simpler mechanism - physical isolation(*).  These nodes, and all their
associated hardware, are dedicated to the sole use of this critical
application.  There is still sometimes non-trivial work done, for a
given application, to tune its performance, but by removing (well, at
least radically reducing) the interactions of other unknown applications
on the same hardware resources, the tuning of the critical application
now becomes a practical, solvable task.

In corporate organizations, this resembles the difference between having
separate divisions with their own P&L statements, kept at arm's length
for all but a few common corporate services [cpusets], versus the more
dynamic trade-offs made within a single division, moving limited
resources back and forth in order to meet changing and sometimes
conflicting objectives in accordance with the priorities dictated by
upper management [CKRM].

 (*) Well, not physical isolation in the sense of unplugging the
     interconnect cables.  Rather logical isolation of big chunks
     of the physical hardware.  And not pure 100% isolation, as
     would come from running separate kernel images, but minimal
     controlled isolation, with the ability to keep out anything
     that causes interference if it doesn't need to be there, on
     those particular CPUs and Memory Nodes.

     And our customers _do_ want to manage these logically isolated
     chunks as named "virtual computers" with system managed permissions
     and integrity (such as the system-wide attribute of "Exclusive"
     ownership of a CPU or Memory by one cpuset, and a robust ability
     to list all tasks currently in a cpuset).  This is a genuine user
     requirement to my understanding, apparently contrary to Andrew's.

The above is not the only use of cpusets - there's also providing
a base for ports of PBS and LSF workload managers (which if I recall
correctly arose from earlier HPC environments similar to the one
I described above), and there's the work being done by Bull and NEC,
which can better be spoken to by representatives of those companies.

-- 
                          I won't rest till it's the best ...
                          Programmer, Linux Scalability
                          Paul Jackson <pj@sgi.com> 1.650.933.1373

^ permalink raw reply	[flat|nested] 233+ messages in thread

* Re: [ckrm-tech] Re: [Lse-tech] [PATCH] cpusets - big numa cpu and memory placement
  2004-10-02 16:17                   ` Hubertus Franke
  2004-10-02 17:53                     ` Paul Jackson
@ 2004-10-02 20:40                     ` Andrew Morton
  2004-10-02 23:08                       ` Hubertus Franke
  2004-10-03  2:26                       ` Paul Jackson
  1 sibling, 2 replies; 233+ messages in thread
From: Andrew Morton @ 2004-10-02 20:40 UTC (permalink / raw)
  To: Hubertus Franke
  Cc: mef, nagar, ckrm-tech, pj, efocht, mbligh, lse-tech, hch,
	steiner, jbarnes, sylvain.jeaugey, djh, linux-kernel, colpatch,
	Simon.Derr, ak, sivanich, llp

Hubertus Franke <frankeh@watson.ibm.com> wrote:
>
> Marc, cpusets lead to physical isolation.

Despite what Paul says, his customers *do not* "require" physical isolation
[*].  That's like an accountant requiring that his spreadsheet be written
in Pascal.  He needs slapping.

Isolation is merely the means by which cpusets implements some higher-level
customer requirement.

I want to see a clearer description of what that higher-level requirement is.

Then I'd like to see some thought put into whether CKRM (with probably a new
controller) can provide a good-enough implementation of that requirement.

Coming at this from the other direction: CKRM is being positioned as a
general purpose resource management framework, yes?  Isolation is a simple
form of resource management.  If the CKRM framework simply cannot provide
this form of isolation then it just failed its first test, did it not?

[*] Except for the case where there is graphics (or other) hardware close
to a particular node.  In that case it is obvious that CPU-group pinning is
the only way in which to satisfy the top-level requirement of "make access
to the graphics hardware be efficient".

^ permalink raw reply	[flat|nested] 233+ messages in thread

* Re: [ckrm-tech] Re: [Lse-tech] [PATCH] cpusets - big numa cpu and memory placement
  2004-10-02 23:08                       ` Hubertus Franke
@ 2004-10-02 22:26                         ` Alan Cox
  2004-10-03  2:49                         ` Paul Jackson
  2004-10-03  3:25                         ` Paul Jackson
  2 siblings, 0 replies; 233+ messages in thread
From: Alan Cox @ 2004-10-02 22:26 UTC (permalink / raw)
  To: Hubertus Franke
  Cc: Andrew Morton, mef, nagar, ckrm-tech, pj, efocht, mbligh,
	lse-tech, hch, steiner, jbarnes, sylvain.jeaugey, djh,
	Linux Kernel Mailing List, colpatch, Simon.Derr, ak, sivanich,
	llp

On Sul, 2004-10-03 at 00:08, Hubertus Franke wrote:
> Andrew Morton wrote:
> > Hubertus Franke <frankeh@watson.ibm.com> wrote:
> > 
> >>Marc, cpusets lead to physical isolation.

Not realistically on x86 unless you start billing memory accesses IMHO


^ permalink raw reply	[flat|nested] 233+ messages in thread

* Re: [ckrm-tech] Re: [Lse-tech] [PATCH] cpusets - big numa cpu and memory placement
  2004-10-02 20:40                     ` Andrew Morton
@ 2004-10-02 23:08                       ` Hubertus Franke
  2004-10-02 22:26                         ` Alan Cox
                                           ` (2 more replies)
  2004-10-03  2:26                       ` Paul Jackson
  1 sibling, 3 replies; 233+ messages in thread
From: Hubertus Franke @ 2004-10-02 23:08 UTC (permalink / raw)
  To: Andrew Morton
  Cc: mef, nagar, ckrm-tech, pj, efocht, mbligh, lse-tech, hch,
	steiner, jbarnes, sylvain.jeaugey, djh, linux-kernel, colpatch,
	Simon.Derr, ak, sivanich, llp



Andrew Morton wrote:
> Hubertus Franke <frankeh@watson.ibm.com> wrote:
> 
>>Marc, cpusets lead to physical isolation.
> 
> 
> Despite what Paul says, his customers *do not* "require" physical isolation
> [*].  That's like an accountant requiring that his spreadsheet be written
> in Pascal.  He needs slapping.
> 
> Isolation is merely the means by which cpusets implements some higher-level
> customer requirement.
> 
> I want to see a clearer description of what that higher-level requirement is.
> 
> Then I'd like to see some thought put into whether CKRM (with probably a new
> controller) can provide a good-enough implementation of that requirement.
> 

CKRM could do so.  We already provide the name space and the class
hierarchy.  If a cpuset is associated with a class, then the class
controller can set the appropriate masks in the system.

The issue that Paul correctly pointed out is that if you associate these
with the current task classes, i.e. set cpu and i/o shares, then one MIGHT
have conflicting directives to the system.
This can be avoided by not utilizing cpu shares at that point, or by living
with the potential share imbalance that will arise from being forced
into the various affinity constraints of the tasks.
But we already have to live with that anyway when resources create
dependencies, such as when too little memory potentially impacts the
obtained cpu share.

Alternatively, cpumem sets could be introduced as a whole new classtype
that, similar to the socket classtype, will have this one controller
associated with it.

So to me cpumem sets as a concept are useful, so I won't be doing that
whopping, but they can be integrated into CKRM as a classtype/controller
concept.  Particularly for NUMA machines it makes sense in the absence of
more sophisticated and possibly (sub)optimal placement by the OS.

> Coming at this from the other direction: CKRM is being positioned as a
> general purpose resource management framework, yes?  Isolation is a simple
> form of resource management.  If the CKRM framework simply cannot provide
> this form of isolation then it just failed its first test, did it not?
> 

That's fair to say.  I think it is feasible, by utilizing the guts of the
cpumem set and wrapping the CKRM RCFS and class objects around it.

> [*] Except for the case where there is graphics (or other) hardware close
> to a particular node.  In that case it is obvious that CPU-group pinning is
> the only way in which to satisfy the top-level requirement of "make access
> to the graphics hardware be efficient".

Yipp ... but it is also useful if one has limited faith in the system
to always do the right thing.  If I have no control over where tasks go, I
can potentially end up introducing heavy bus traffic (over the NUMA link).
There's a good reason why, in many HPC deployments, applications try to
bypass the OS ...

Hope this helps.



^ permalink raw reply	[flat|nested] 233+ messages in thread

* Re: [Lse-tech] [PATCH] cpusets - big numa cpu and memory placement
  2004-10-02 16:14                     ` Hubertus Franke
  2004-10-02 18:04                       ` Paul Jackson
@ 2004-10-02 23:21                       ` Peter Williams
  2004-10-02 23:44                         ` Hubertus Franke
                                           ` (4 more replies)
  1 sibling, 5 replies; 233+ messages in thread
From: Peter Williams @ 2004-10-02 23:21 UTC (permalink / raw)
  To: Hubertus Franke
  Cc: dipankar, Paul Jackson, Andrew Morton, ckrm-tech, efocht, mbligh,
	lse-tech, hch, steiner, jbarnes, sylvain.jeaugey, djh,
	linux-kernel, colpatch, Simon.Derr, ak, sivanich

Hubertus Franke wrote:
> 
> OK, let me respond to this (again...) from the perspective of cpus.
> This should to some extend also cover Andrew's request as well as
> Paul's earlier message.
> 
> I see cpumem sets to be orthogonal to CKRM cpu share allocations.
> AGAIN.
> I see cpumem sets to be orthogonal to CKRM cpu share allocations.
> 
> In its essense, "cpumem sets" is a hierarchical mechanism of sucessively 
> tighter constraints on the affinity mask of tasks.
> 
> The O(1) scheduler today does not know about cpumem sets. It operates
> on the level of affinity masks to adhere to the constraints specified 
> based on cpu masks.

This is where I see the need for "CPU sets".  I.e. as a 
replacement/modification to the CPU affinity mechanism basically adding 
an extra level of abstraction to make it easier to use for implementing 
the type of isolation that people seem to want.  I say this because, 
strictly speaking and as you imply, the current affinity mechanism is 
sufficient to provide that isolation BUT it would be a huge pain to 
implement.

The way I see it you just replace the task's affinity mask with a 
pointer to its "CPU set" which contains the affinity mask shared by 
tasks belonging to that set (and this is used by try_to_wake_up() and 
the load balancing mechanism to do their stuff instead of the per task 
affinity mask).  Then when you want to do something like take a CPU away 
from one group of tasks and give it to another group of tasks it's just 
a matter of changing the affinity masks in the sets instead of visiting 
every one of the tasks individually and changing their masks.  There 
should be no need to explicitly move tasks off the "lost" CPU after such 
a change as it should/could be done next time that they go through 
try_to_wake_up() and/or finish a time slice.  Moving a task from one CPU 
set to another would be a similar process to the current change of 
affinity mask.
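
A rough sketch of that data structure change (names invented here; this is
just the shape of the proposal, not working kernel code): the per-task mask
becomes a pointer to a shared set, so retargeting a whole group of tasks is
one store into the set instead of a walk over every task:

#include <stdio.h>

struct cpu_set_model {
	unsigned long allowed;		/* mask shared by all member tasks */
};

struct task_model {
	struct cpu_set_model *cpuset;	/* replaces the per-task mask */
};

/* What try_to_wake_up()/load balancing would consult for a task. */
static unsigned long task_allowed(const struct task_model *t)
{
	return t->cpuset->allowed;
}

int main(void)
{
	struct cpu_set_model dbset = { .allowed = 0x0fUL };	/* CPUs 0-3 */
	struct task_model t1 = { &dbset }, t2 = { &dbset };

	/* Move the whole group to CPUs 4-7 with a single update. */
	dbset.allowed = 0xf0UL;
	printf("%#lx %#lx\n", task_allowed(&t1), task_allowed(&t2));
	return 0;
}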

There would, of course, need to be some restriction on the movement of 
CPUs from one set to another so that you don't end up with an empty set 
with live tasks, etc.

A possible problem is that there may be users whose use of the current 
affinity mechanism would be broken by such a change.  A compile time 
choice between the current mechanism and a set based mechanism would be 
a possible solution.  Of course, this proposed modification wouldn't 
make any sense with less than 3 CPUs.

PS Once CPU sets were implemented like this, configurable CPU schedulers 
(such as (blatant plug :-)) ZAPHOD) could have "per CPU set" 
configurations, CKRM could do its (CPU management stuff) stuff within a 
CPU set, etc.

> 
> The CKRM cpu scheduler also adheres to affinity mask constraints and 
> frankly does not care how they are set.
> 
> So I do not see what at the scheduler level the problem will be.
> If you want system isolation you deploy cpumem sets. If you want overall 
>  share enforcement you choose ckrm classes.
> In addition you can use both with the understanding that cpumem sets can 
> and will not be violated even if that means that shares are not maintained.
> 
> Since you want orthogonality, cpumem sets could be implemented as a
> different "classtype". They would not belong to the taskclass and thus 
> are independent from what we consider the task class.
> 
> 
> 
> The tricky stuff comes in from the fact that CKRM assumes a system wide 
> definition of a class and a system wide "calculation" of shares.

Doesn't sound insurmountable or particularly tricky :-).

Peter
-- 
Peter Williams                                   pwil3058@bigpond.net.au

"Learning, n. The kind of ignorance distinguishing the studious."
  -- Ambrose Bierce

^ permalink raw reply	[flat|nested] 233+ messages in thread

* Re: [ckrm-tech] Re: [Lse-tech] [PATCH] cpusets - big numa cpu and memory placement
  2004-10-02 18:16                       ` Hubertus Franke
  2004-10-02 19:14                         ` Paul Jackson
@ 2004-10-02 23:29                         ` Peter Williams
  2004-10-02 23:51                           ` Hubertus Franke
  1 sibling, 1 reply; 233+ messages in thread
From: Peter Williams @ 2004-10-02 23:29 UTC (permalink / raw)
  To: Hubertus Franke
  Cc: Paul Jackson, akpm, ckrm-tech, efocht, lse-tech, hch, steiner,
	jbarnes, sylvain.jeaugey, djh, linux-kernel, colpatch,
	Simon.Derr, ak, sivanich, llp

Hubertus Franke wrote:
> 
> 
> Paul Jackson wrote:
> 
>> Hubertus wrote:
>>
>>> Marc, cpusets lead to physical isolation.
>>
>>
>>
>> This is slightly too terse for my dense brain to grok.
>> Could you elaborate just a little, Hubertus?  Thanks.
>>
> 
> A minimal quote from your website :-)
> 
> "CpuMemSets provides a new Linux kernel facility that enables system 
> services and applications to specify on which CPUs they may be 
> scheduled, and from which nodes they may allocate memory."
> 
> Since I have addressed the cpu section it seems obvious that
> in order to ISOLATE different workloads, you associate them onto
> non-overlapping cpusets, thus technically they are physically isolated
> from each other on said chosen CPUs.
> 
> Given that cpuset hierarchies translate into cpu-affinity masks,
> this desired isolation can result in lost cycles globally.

This argument if followed to its logical conclusion would advocate the 
abolition of CPU affinity masks completely.

> 
> I believe this to be orthogonal to share settings. To me both
> are extremely desirable features.
> 
> I also pointed out that if you separate mechanism from API, it
> is possible to move the CPU set API under the CKRM framework.
> I have not thought about the memory aspect.
> 
> -- Hubertus
> 


-- 
Peter Williams                                   pwil3058@bigpond.net.au

"Learning, n. The kind of ignorance distinguishing the studious."
  -- Ambrose Bierce

^ permalink raw reply	[flat|nested] 233+ messages in thread

* Re: [Lse-tech] [PATCH] cpusets - big numa cpu and memory placement
  2004-10-02 23:21                       ` Peter Williams
@ 2004-10-02 23:44                         ` Hubertus Franke
  2004-10-03  0:00                           ` Peter Williams
                                             ` (2 more replies)
  2004-10-03  2:59                         ` Paul Jackson
                                           ` (3 subsequent siblings)
  4 siblings, 3 replies; 233+ messages in thread
From: Hubertus Franke @ 2004-10-02 23:44 UTC (permalink / raw)
  To: Peter Williams
  Cc: dipankar, Paul Jackson, Andrew Morton, ckrm-tech, efocht, mbligh,
	lse-tech, hch, steiner, jbarnes, sylvain.jeaugey, djh,
	linux-kernel, colpatch, Simon.Derr, ak, sivanich

We are in sync on this... Hopefully everybody else is as well.
> 
> This is where I see the need for "CPU sets".  I.e. as a 
> replacement/modification to the CPU affinity mechanism basically adding 
> an extra level of abstraction to make it easier to use for implementing 
> the type of isolation that people seem to want.  I say this because, 
> strictly speaking and as you imply, the current affinity mechanism is 
> sufficient to provide that isolation BUT it would be a huge pain to 
> implement.

Exactly, you do the movement from cpuset through higher level operations, 
replacing the per-task cpu-affinity with a shared object.
This is what CKRM does at the core level through its class objects.
RCFS provides the high level operations. The controller implements them
wrt the constraints and the details.

> 
> The way I see it you just replace the task's affinity mask with a 
> pointer to its "CPU set" which contains the affinity mask shared by 
> tasks belonging to that set (and this is used by try_to_wake_up() and 
> the load balancing mechanism to do their stuff instead of the per task 
> affinity mask).  Then when you want to do something like take a CPU away 
> from one group of tasks and give it to another group of tasks it's just 
> a matter of changing the affinity masks in the sets instead of visiting 
> every one of the tasks individually and changing their masks.  

Exactly ..

> There 
> should be no need to explicitly move tasks off the "lost" CPU after such 
> a change as it should/could be done next time that they go through 
> try_to_wake_up() and/or finish a time slice.  Moving a task from one CPU 
> set to another would be a similar process to the current change of 
> affinity mask.
> 
> There would, of course, need to be some restriction on the movement of 
> CPUs from one set to another so that you don't end up with an empty set 
> with live tasks, etc.
> 
> A possible problem is that there may be users whose use of the current 
> affinity mechanism would be broken by such a change.  A compile time 
> choice between the current mechanism and a set based mechanism would be 
> a possible solution.  Of course, this proposed modification wouldn't 
> make any sense with less than 3 CPUs.

Why?  It is even useful for 2 cpus.
Currently cpumem sets do not enforce that there are no intersections 
between siblings of a hierarchy.

> 
> PS Once CPU sets were implemented like this, configurable CPU schedulers 
> (such as (blatant plug :-)) ZAPHOD) could have "per CPU set" 
> configurations, CKRM could do its (CPU management stuff) stuff within a 
> CPU set, etc.

That's one of the sticking points.
That would require that TASKCLASSES and cpumemsets go along the 
same hierarchy, with CPUmemsets being the top part of the hierarchy.
In other words the task classes cannot span different cpusets.

There are other possibilities that would restrict the load balancing
along cpuset boundaries. If taskclasses are allowed to span disjoint
cpumemsets, what then is the definition of setting shares?

Today we simply do the system wide share proportioning, adhering to the 
affinity constraints, which is still valid in this discussion.

> 

>>
>> The tricky stuff comes in from the fact that CKRM assumes a system 
>> wide definition of a class and a system wide "calculation" of shares.
> 
Tricky in that it needs to be decided what the class hierarchy 
definitions are, whether to do CKRM cpu scheduling within each cpuset, and 
what the exact definition of a share then is?

> 
> Doesn't sound insurmountable or particularly tricky :-).

I agree it's not insurmountable, but a matter of deciding what the desired
behavior is ...

Regards.



> 
> Peter


^ permalink raw reply	[flat|nested] 233+ messages in thread

* Re: [ckrm-tech] Re: [Lse-tech] [PATCH] cpusets - big numa cpu and memory placement
  2004-10-02 23:29                         ` Peter Williams
@ 2004-10-02 23:51                           ` Hubertus Franke
  0 siblings, 0 replies; 233+ messages in thread
From: Hubertus Franke @ 2004-10-02 23:51 UTC (permalink / raw)
  To: Peter Williams
  Cc: Paul Jackson, akpm, ckrm-tech, efocht, lse-tech, hch, steiner,
	jbarnes, sylvain.jeaugey, djh, linux-kernel, colpatch,
	Simon.Derr, ak, sivanich, llp



Peter Williams wrote:

> Hubertus Franke wrote:
> 
>>
>>
>> Paul Jackson wrote:

>> A minimal quote from your website :-)
>>
>> "CpuMemSets provides a new Linux kernel facility that enables system 
>> services and applications to specify on which CPUs they may be 
>> scheduled, and from which nodes they may allocate memory."
>>
>> Since I have addressed the cpu section it seems obvious that
>> in order to ISOLATE different workloads, you associate them onto
>> non-overlapping cpusets, thus technically they are physically isolated
>> from each other on said chosen CPUs.
>>
>> Given that cpuset hierarchies translate into cpu-affinity masks,
>> this desired isolation can result in lost cycles globally.
> 
> 
> This argument, if followed to its logical conclusion, would advocate the 
> abolition of CPU affinity masks completely.
> 

No, why is that?  One can restrict memory on a task and by doing so waste 
cycles in paging.  That does not mean we should get rid of memory 
restrictions or the like.
Losing cycles is simply an observation of what could happen.

As in any system, over-constraining a given workload (wrt affinity, 
cpu limits, rate control) can lead to suboptimal utilization of 
resources.  That does not mean there is no rationale for the constraints 
in the first place, and hence that they should never be allowed.

Cheers ..



^ permalink raw reply	[flat|nested] 233+ messages in thread

* Re: [Lse-tech] [PATCH] cpusets - big numa cpu and memory placement
  2004-10-02 23:44                         ` Hubertus Franke
@ 2004-10-03  0:00                           ` Peter Williams
  2004-10-03  3:44                           ` Paul Jackson
  2004-10-05  3:13                           ` [ckrm-tech] " Matthew Helsley
  2 siblings, 0 replies; 233+ messages in thread
From: Peter Williams @ 2004-10-03  0:00 UTC (permalink / raw)
  To: Hubertus Franke
  Cc: dipankar, Paul Jackson, Andrew Morton, ckrm-tech, efocht, mbligh,
	lse-tech, hch, steiner, jbarnes, sylvain.jeaugey, djh,
	linux-kernel, colpatch, Simon.Derr, ak, sivanich

Hubertus Franke wrote:
>> be a possible solution.  Of course, this proposed modification 
>> wouldn't make any sense with less than 3 CPUs.
> 
> 
> Why?  It is even useful for 2 cpus.
> Currently cpumem sets do not enforce that there are no intersections 
> between siblings of a hierarchy.

There are only 3 non-empty sets and only one of them can have a CPU 
removed from the set without becoming empty.  So the pain wouldn't be 
worth the gain.

Peter
-- 
Peter Williams                                   pwil3058@bigpond.net.au

"Learning, n. The kind of ignorance distinguishing the studious."
  -- Ambrose Bierce

^ permalink raw reply	[flat|nested] 233+ messages in thread

* Re: [ckrm-tech] Re: [Lse-tech] [PATCH] cpusets - big numa cpu and memory placement
  2004-10-02 20:40                     ` Andrew Morton
  2004-10-02 23:08                       ` Hubertus Franke
@ 2004-10-03  2:26                       ` Paul Jackson
  2004-10-03 14:11                         ` Paul Jackson
  1 sibling, 1 reply; 233+ messages in thread
From: Paul Jackson @ 2004-10-03  2:26 UTC (permalink / raw)
  To: Andrew Morton
  Cc: frankeh, mef, nagar, ckrm-tech, efocht, mbligh, lse-tech, hch,
	steiner, jbarnes, sylvain.jeaugey, djh, linux-kernel, colpatch,
	Simon.Derr, ak, sivanich, llp

Andrew writes:
>
> Despite what Paul says, his customers *do not* "require" physical isolation
> [*].  That's like an accountant requiring that his spreadsheet be written
> in Pascal.  He needs slapping.

No - it's like an accountant saying the books for your two sole
proprietor Subchapter S corporations have to be kept separate.

Consider the following use case scenario, which emphasizes this
isolation aspect (and ignores other requirements, such as the need for
system admins to manage cpusets by name [some handle valid across
process contexts], with a system wide imposed permission model and
exclusive use guarantees, and with a well defined system supported
notion of which tasks are "in" which cpuset at any point in time).

===

You're running a 64-way, compute bound application on 64 CPUs of your
256 CPU system.  The 64 threads are in lock step, tightly coupled, for
three days straight.  You've sized the application and the computer you
bought to run that application to within the last few percent of what
CPU cycles are available on 64 CPUs and how many memory pages are
available on the nodes local to those CPUs.  It's an MPI (*) application in
Fortran, using most of the available bandwidth between those nodes for
synchronization on each loop of the computation.  If a single thread slows
down 10% for any reason, the entire application slows down that much
(sometimes worse), and you have big money on the table, ensuring that
doesn't happen.  You absolutely positively have to complete that
application run on time, in three days (say it's a weather forecast for
four days out).  You've varied the resolution to which you compute the
answer or the size of your input data set or whatever else you could, in
order to obtain the most accurate answer you could, in three days, not
an hour longer.  If the runtimes jump around by more than 5% or 10%,
some Vice President starts losing sleep.  If it's a 20% variation, that
sleep deprived Vice President works for the computer company that sold
you the system.  The boss of the boss of my boss ;).

I now know that every one of these 64 threads is pinned for those three
days.  It's just as pinned as the graphics application that has to be
near its hardware.  Due to both the latency effects of the several
levels of hardware cache (on the CPU chip and off), and the additional
latency effects imposed by the software when it decides on which node to
place a page of memory off a page fault, nothing can move.  Not in, not
out, not within.  To within a fraction of a percent, nothing else may be
allowed onto those nodes, nothing of those 64 threads may be allowed off
those nodes, and none of the threads may be allowed to move within the
64 CPUs.  And not just any random subset of 64 CPUs selected from the
256 available, but a subset that's "close" together, given the complex
geometries of these big systems (minimum number of router hops between
the furthest apart pair of CPUs in the set of 64 CPUs).

 (*) Message Passing Interface (MPI) - http://www.mpi-forum.org

===

It's a requirement, I say.  It's a requirement.  Let the slapping begin ;).

-- 
                          I won't rest till it's the best ...
                          Programmer, Linux Scalability
                          Paul Jackson <pj@sgi.com> 1.650.933.1373

^ permalink raw reply	[flat|nested] 233+ messages in thread

* Re: [ckrm-tech] Re: [Lse-tech] [PATCH] cpusets - big numa cpu and memory placement
  2004-10-02 23:08                       ` Hubertus Franke
  2004-10-02 22:26                         ` Alan Cox
@ 2004-10-03  2:49                         ` Paul Jackson
  2004-10-03 12:19                           ` Hubertus Franke
  2004-10-03  3:25                         ` Paul Jackson
  2 siblings, 1 reply; 233+ messages in thread
From: Paul Jackson @ 2004-10-03  2:49 UTC (permalink / raw)
  To: Hubertus Franke
  Cc: akpm, mef, nagar, ckrm-tech, efocht, mbligh, lse-tech, hch,
	steiner, jbarnes, sylvain.jeaugey, djh, linux-kernel, colpatch,
	Simon.Derr, ak, sivanich, llp

Hubertus wrote:
>
> CKRM could do so. We already provide the name space and the class 
> hierarchy.

Just because two things have name spaces and hierarchies, doesn't
make them interchangeable.  Name spaces and hierarchies are just
implementation mechanisms - many interesting, entirely unrelated,
solutions make use of them.

What are the objects named, and what is the relation underlying
the hierarchy?  These must match up.

The objects named in cpusets are subsets of a system's CPUs and Memory
Nodes. The relation underlying the hierarchy is the subset relation on
these sets: if one cpuset node is a descendant of another, then its
CPUs and Memory Nodes are a subset of the other's.
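
(To make that concrete -- just a sketch of the invariant, using the
existing cpumask/nodemask subset helpers, not code from the patch:)

/* Sketch only: the invariant the cpuset hierarchy maintains.
 * A child's masks must be subsets of its parent's masks. */
static int cpuset_child_is_valid(const struct cpuset *cs,
				 const struct cpuset *parent)
{
	return cpus_subset(cs->cpus_allowed, parent->cpus_allowed) &&
		nodes_subset(cs->mems_allowed, parent->mems_allowed);
}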

What is the corresponding statement for CKRM?

For CKRM to subsume cpusets, there must be an injective map from the
above cpuset objects to CKRM objects, that preserves this subset
relation on cpusets.

-- 
                          I won't rest till it's the best ...
                          Programmer, Linux Scalability
                          Paul Jackson <pj@sgi.com> 1.650.933.1373

^ permalink raw reply	[flat|nested] 233+ messages in thread

* Re: [Lse-tech] [PATCH] cpusets - big numa cpu and memory placement
  2004-10-02 23:21                       ` Peter Williams
  2004-10-02 23:44                         ` Hubertus Franke
@ 2004-10-03  2:59                         ` Paul Jackson
  2004-10-03  3:19                         ` Paul Jackson
                                           ` (2 subsequent siblings)
  4 siblings, 0 replies; 233+ messages in thread
From: Paul Jackson @ 2004-10-03  2:59 UTC (permalink / raw)
  To: Peter Williams
  Cc: frankeh, dipankar, akpm, ckrm-tech, efocht, mbligh, lse-tech,
	hch, steiner, jbarnes, sylvain.jeaugey, djh, linux-kernel,
	colpatch, Simon.Derr, ak, sivanich

Peter writes:
>
> I say this because, 
> strictly speaking and as you imply, the current affinity mechanism is 
> sufficient to provide that isolation BUT it would be a huge pain to 
> implement.

The effects on any given task - where it gets scheduled and where it
allocates memory - can be duplicated using the current affinity
mechanisms (setaffinity/mbind/mempolicy).
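
(For reference, the per-task flavor looks roughly like this from user
space -- a minimal sketch; the exact glibc prototype for
sched_setaffinity has wobbled over time, so treat it as illustrative:)

#define _GNU_SOURCE
#include <sched.h>

/* Sketch: pin the calling task to CPU 3 with the existing per-task
 * affinity call.  Cpusets add naming, permissions and exclusivity
 * on top of this sort of thing, they don't replace it. */
int pin_to_cpu3(void)
{
	cpu_set_t mask;

	CPU_ZERO(&mask);
	CPU_SET(3, &mask);
	return sched_setaffinity(0, sizeof(mask), &mask);
}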

However the system wide naming of cpusets, the control of their access,
use and modification, the exclusive rights to a CPU or Memory and the
robust linkage of tasks to these named cpusets are, in my view, just the
sort of system wide resource synchronization that kernels are born to
do, and these capabilities are not provided by the per-task existing
affinity mechanisms.

However, my point doesn't matter much.  Whether it's a huge pain, or an
infinite pain, so long as we agree it's more painful than we can
tolerate, that's enough agreement to continue this discussion along
other more fruitful lines.

-- 
                          I won't rest till it's the best ...
                          Programmer, Linux Scalability
                          Paul Jackson <pj@sgi.com> 1.650.933.1373

^ permalink raw reply	[flat|nested] 233+ messages in thread

* Re: [Lse-tech] [PATCH] cpusets - big numa cpu and memory placement
  2004-10-02 23:21                       ` Peter Williams
  2004-10-02 23:44                         ` Hubertus Franke
  2004-10-03  2:59                         ` Paul Jackson
@ 2004-10-03  3:19                         ` Paul Jackson
  2004-10-03  3:53                           ` Peter Williams
  2004-10-03  4:02                           ` Paul Jackson
  2004-10-03  3:39                         ` Paul Jackson
  2004-10-03 14:36                         ` Martin J. Bligh
  4 siblings, 2 replies; 233+ messages in thread
From: Paul Jackson @ 2004-10-03  3:19 UTC (permalink / raw)
  To: Peter Williams
  Cc: frankeh, dipankar, akpm, ckrm-tech, efocht, mbligh, lse-tech,
	hch, steiner, jbarnes, sylvain.jeaugey, djh, linux-kernel,
	colpatch, Simon.Derr, ak, sivanich

Peter writes:
> 
> The way I see it you just replace the task's affinity mask with a 
> pointer to its "CPU set" which contains the affinity mask shared by 
> tasks belonging to that set ...

I too like this suggestion.  The current duplication of cpus_allowed and
mems_allowed between task and cpuset is a fragile design, forced on us
by incremental feature addition and the need to maintain backwards
compatibility.
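
Roughly, and with hypothetical field names, the change being suggested
looks like this (sketch only):

/* Sketch: tasks point at a cpuset and the masks live there once,
 * instead of being copied into every task_struct. */
struct cpuset {
	cpumask_t	cpus_allowed;	/* shared by all attached tasks */
	nodemask_t	mems_allowed;
	atomic_t	count;		/* attached tasks + child cpusets */
	struct cpuset	*parent;
};

struct task_struct {
	/* ... existing fields ... */
	struct cpuset	*cpuset;	/* replaces cpumask_t cpus_allowed */
};

try_to_wake_up() and the load balancer would then look at
p->cpuset->cpus_allowed instead of a per-task copy.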


> A possible problem is that there may be users whose use of the current 
> affinity mechanism would be broken by such a change.  A compile time 
> choice between the current mechanism and a set based mechanism would be 
> a possible solution.

Do you mean kernel or application compile time?  The current affinity
mechanisms have enough field penetration that the kernel will have to
support or emulate these calls for a long period of deprecation at best.

So I guess you mean application compile time.  However, the current user
level support, in glibc and other libraries, for these calls is
sufficiently confused, at least in my view, that rather than have that
same API mean two things, depending on a compile time switch, I'd rather
explore (1) emulating the existing calls, just as they are, (2) adding
new calls that are try these API's again, in line with our kernel
changes, and (3) eventually deprecate and remove the old calls, over a
multi-year period.

-- 
                          I won't rest till it's the best ...
                          Programmer, Linux Scalability
                          Paul Jackson <pj@sgi.com> 1.650.933.1373

^ permalink raw reply	[flat|nested] 233+ messages in thread

* Re: [ckrm-tech] Re: [Lse-tech] [PATCH] cpusets - big numa cpu and memory placement
  2004-10-02 23:08                       ` Hubertus Franke
  2004-10-02 22:26                         ` Alan Cox
  2004-10-03  2:49                         ` Paul Jackson
@ 2004-10-03  3:25                         ` Paul Jackson
  2 siblings, 0 replies; 233+ messages in thread
From: Paul Jackson @ 2004-10-03  3:25 UTC (permalink / raw)
  To: Hubertus Franke
  Cc: akpm, mef, nagar, ckrm-tech, efocht, mbligh, lse-tech, hch,
	steiner, jbarnes, sylvain.jeaugey, djh, linux-kernel, colpatch,
	Simon.Derr, ak, sivanich, llp

Hubertus wrote:
> So to me cpumem sets as as concept is useful, so I won't be doing that 
> whopping, but ...

I couldn't parse the above ... could you rephrase?

-- 
                          I won't rest till it's the best ...
                          Programmer, Linux Scalability
                          Paul Jackson <pj@sgi.com> 1.650.933.1373

^ permalink raw reply	[flat|nested] 233+ messages in thread

* Re: [Lse-tech] [PATCH] cpusets - big numa cpu and memory placement
  2004-10-02 14:55                   ` Dipankar Sarma
  2004-10-02 16:14                     ` Hubertus Franke
@ 2004-10-03  3:35                     ` Paul Jackson
  1 sibling, 0 replies; 233+ messages in thread
From: Paul Jackson @ 2004-10-03  3:35 UTC (permalink / raw)
  To: dipankar
  Cc: akpm, nagar, ckrm-tech, efocht, mbligh, lse-tech, hch, steiner,
	jbarnes, sylvain.jeaugey, djh, linux-kernel, colpatch,
	Simon.Derr, ak, sivanich

Dipankar wrote:
> For this to succeed, they need to be completely
> isolated.

Do you mean by completely isolated (1) running two separate system
images on separate partitions connected at most by networks and storage,
or do you mean (2) minimal numa interaction between two subsets of
nodes, all running under the same system image?

If (1), then the partitioning project is down the hall ;)  But I guess
you knew that.  The issues on this thread involve managing resource
interactions on a single system image.

Just checking ... the words you used to describe the degree of
separation were sufficiently strong that I became worried we were at
risk for a miscommunication.

-- 
                          I won't rest till it's the best ...
                          Programmer, Linux Scalability
                          Paul Jackson <pj@sgi.com> 1.650.933.1373

^ permalink raw reply	[flat|nested] 233+ messages in thread

* Re: [Lse-tech] [PATCH] cpusets - big numa cpu and memory placement
  2004-10-02 23:21                       ` Peter Williams
                                           ` (2 preceding siblings ...)
  2004-10-03  3:19                         ` Paul Jackson
@ 2004-10-03  3:39                         ` Paul Jackson
  2004-10-03 14:36                         ` Martin J. Bligh
  4 siblings, 0 replies; 233+ messages in thread
From: Paul Jackson @ 2004-10-03  3:39 UTC (permalink / raw)
  To: Peter Williams
  Cc: frankeh, dipankar, akpm, ckrm-tech, efocht, mbligh, lse-tech,
	hch, steiner, jbarnes, sylvain.jeaugey, djh, linux-kernel,
	colpatch, Simon.Derr, ak, sivanich

Peter writes:
> This is where I see the need for "CPU sets".  I.e. as a 
> replacement/modification to the CPU affinity mechanism 

Note that despite the name, cpusets handles both CPU and
Memory affinity.

Which is probably why Hubertus is calling them cpumem sets.

And, indeed, why I have called them cpumemsets on alternate
years myself.

However the rest of your points, except where clearly specific
to the scheduler, apply equally well, so this point is not
critical at this point in the discussion.

-- 
                          I won't rest till it's the best ...
                          Programmer, Linux Scalability
                          Paul Jackson <pj@sgi.com> 1.650.933.1373

^ permalink raw reply	[flat|nested] 233+ messages in thread

* Re: [Lse-tech] [PATCH] cpusets - big numa cpu and memory placement
  2004-10-02 23:44                         ` Hubertus Franke
  2004-10-03  0:00                           ` Peter Williams
@ 2004-10-03  3:44                           ` Paul Jackson
  2004-10-05  3:13                           ` [ckrm-tech] " Matthew Helsley
  2 siblings, 0 replies; 233+ messages in thread
From: Paul Jackson @ 2004-10-03  3:44 UTC (permalink / raw)
  To: Hubertus Franke
  Cc: pwil3058, dipankar, akpm, ckrm-tech, efocht, mbligh, lse-tech,
	hch, steiner, jbarnes, sylvain.jeaugey, djh, linux-kernel,
	colpatch, Simon.Derr, ak, sivanich

Hubertus writes:
> 
> That's one of the sticking points.
> That would require that TASKCLASSES and cpumemsets must go along the 
> same hierarchy. With CPUmemsets being the top part of the hierarchy.
> In other words the task classes can not span different cpusets.

Can task classes span an entire cpuset subtree?  I can well imagine that
an entire subtree of the cpuset tree should be managed by the same CKRM
policies and shares.

In particular, if we emulate the setaffinity/mbind/mempolicy calls by
forking a child cpuset to represent the new restrictions on the task
affected by those calls, then we'd for sure want to leave that task in
the same CKRM policy realm as it was before.

-- 
                          I won't rest till it's the best ...
                          Programmer, Linux Scalability
                          Paul Jackson <pj@sgi.com> 1.650.933.1373

^ permalink raw reply	[flat|nested] 233+ messages in thread

* Re: [Lse-tech] [PATCH] cpusets - big numa cpu and memory placement
  2004-10-03  3:19                         ` Paul Jackson
@ 2004-10-03  3:53                           ` Peter Williams
  2004-10-03  4:47                             ` Paul Jackson
  2004-10-03  4:02                           ` Paul Jackson
  1 sibling, 1 reply; 233+ messages in thread
From: Peter Williams @ 2004-10-03  3:53 UTC (permalink / raw)
  To: Paul Jackson
  Cc: frankeh, dipankar, akpm, ckrm-tech, efocht, mbligh, lse-tech,
	hch, steiner, jbarnes, sylvain.jeaugey, djh, linux-kernel,
	colpatch, Simon.Derr, ak, sivanich

Paul Jackson wrote:
> Peter writes:
> 
>>The way I see it you just replace the task's affinity mask with a 
>>pointer to its "CPU set" which contains the affinity mask shared by 
>>tasks belonging to that set ...
> 
> 
> I too like this suggestion.  The current duplication of cpus_allowed and
> mems_allowed between task and cpuset is a fragile design, forced on us
> by incremental feature addition and the need to maintain backwards
> compatibility.

OK.

> 
>>A possible problem is that there may be users whose use of the current 
>>affinity mechanism would be broken by such a change.  A compile time 
>>choice between the current mechanism and a set based mechanism would be 
>>a possible solution.
> 
> 
> Do you mean kernel or application compile time?

Kernel compile time.

>  The current affinity
> mechanisms have enough field penetration that the kernel will have to
> support or emulate these calls for a long period of deprecation at best.

That's unfortunate.  Are the (higher level) ways in which they're used 
incompatible with CPU sets or would CPU sets be seen as being a better 
(easier) way of doing the job?

If the choice is at kernel compile time then those users of the current 
mechanism can choose it and new users can choose CPU sets.  Of course, 
this makes gradual movement from one model to the other difficult to say 
the least.

> 
> So I guess you mean application compile time.  However, the current user
> level support, in glibc and other libraries, for these calls is
> sufficiently confused, at least in my view, that rather than have that
> same API mean two things, depending on a compile time switch, I'd rather
> explore (1) emulating the existing calls, just as they are, (2) adding
> new calls that are try these API's again, in line with our kernel
> changes, and (3) eventually deprecate and remove the old calls, over a
> multi-year period.

I would agree with that.  I guess that emulation would not be possible 
on top of my suggestion hence the requirement for the "fragile design" etc.

Peter
-- 
Peter Williams                                   pwil3058@bigpond.net.au

"Learning, n. The kind of ignorance distinguishing the studious."
  -- Ambrose Bierce

^ permalink raw reply	[flat|nested] 233+ messages in thread

* Re: [Lse-tech] [PATCH] cpusets - big numa cpu and memory placement
  2004-10-03  3:19                         ` Paul Jackson
  2004-10-03  3:53                           ` Peter Williams
@ 2004-10-03  4:02                           ` Paul Jackson
  1 sibling, 0 replies; 233+ messages in thread
From: Paul Jackson @ 2004-10-03  4:02 UTC (permalink / raw)
  To: Paul Jackson
  Cc: pwil3058, frankeh, dipankar, akpm, ckrm-tech, efocht, mbligh,
	lse-tech, hch, steiner, jbarnes, sylvain.jeaugey, djh,
	linux-kernel, colpatch, Simon.Derr, ak, sivanich

Paul wrote:
> (2) adding new calls that are try these API's again
                            ^^^
Drop that word 'are' - don't know how it snuck in there ;)

-- 
                          I won't rest till it's the best ...
                          Programmer, Linux Scalability
                          Paul Jackson <pj@sgi.com> 1.650.933.1373

^ permalink raw reply	[flat|nested] 233+ messages in thread

* Re: [Lse-tech] [PATCH] cpusets - big numa cpu and memory placement
  2004-10-03  3:53                           ` Peter Williams
@ 2004-10-03  4:47                             ` Paul Jackson
  2004-10-03  5:12                               ` Peter Williams
  0 siblings, 1 reply; 233+ messages in thread
From: Paul Jackson @ 2004-10-03  4:47 UTC (permalink / raw)
  To: Peter Williams
  Cc: frankeh, dipankar, akpm, ckrm-tech, efocht, mbligh, lse-tech,
	hch, steiner, jbarnes, sylvain.jeaugey, djh, linux-kernel,
	colpatch, Simon.Derr, ak, sivanich

Peter wrote:
> 
> Of course, this [kernel compile option] makes gradual movement
> from one model to the other difficult to say the least.

To say the least.

It might be possible to continue to support current affinity calls
(setaffinity/mbind/mempolicy) even while removing the duplication of
affinity masks between tasks and cpusets.

If each call to set a task's affinity resulted in moving that task into
its very own cpuset (unless it was already the only user of its cpuset),
and if the calls to load and store task->{cpus,mems}_allowed in the
implementation of these affinity sys calls were changed to load and
store those affinity masks in the task's cpuset instead.

I'm just brainstorming here ... this scheme could easily have some
fatal flaw that I'm missing at the moment.
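
Very roughly, and with made-up helper names, something like:

/* Brainstorm sketch only: emulate sched_setaffinity() on top of
 * per-task cpusets, so the mask is stored in exactly one place. */
static int emulate_setaffinity(struct task_struct *p, cpumask_t new_mask)
{
	struct cpuset *cs = p->cpuset;

	/* the new mask must stay within the cpuset the task lives in */
	if (!cpus_subset(new_mask, cs->cpus_allowed))
		return -EINVAL;

	if (atomic_read(&cs->count) > 1) {
		/* mask is shared: give this task a private child cpuset */
		cs = cpuset_create_child(cs);		/* hypothetical */
		if (!cs)
			return -ENOMEM;
		cpuset_attach_task(cs, p);		/* hypothetical */
	}
	cs->cpus_allowed = new_mask;	/* this is what the scheduler reads */
	return 0;
}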

-- 
                          I won't rest till it's the best ...
                          Programmer, Linux Scalability
                          Paul Jackson <pj@sgi.com> 1.650.933.1373

^ permalink raw reply	[flat|nested] 233+ messages in thread

* Re: [Lse-tech] [PATCH] cpusets - big numa cpu and memory placement
  2004-10-03  4:47                             ` Paul Jackson
@ 2004-10-03  5:12                               ` Peter Williams
  2004-10-03  5:39                                 ` Paul Jackson
  0 siblings, 1 reply; 233+ messages in thread
From: Peter Williams @ 2004-10-03  5:12 UTC (permalink / raw)
  To: Paul Jackson
  Cc: frankeh, dipankar, akpm, ckrm-tech, efocht, mbligh, lse-tech,
	hch, steiner, jbarnes, sylvain.jeaugey, djh, linux-kernel,
	colpatch, Simon.Derr, ak, sivanich

Paul Jackson wrote:
> Peter wrote:
> 
>>Of course, this [kernel compile option] makes gradual movement
>>from one model to the other difficult to say the least.
> 
> 
> To say the least.
> 
> It might be possible to continue to support current affinity calls
> (setaffinity/mbind/mempolicy) even while removing the duplication of
> affinity masks between tasks and cpusets.
> 
> If each call to set a task's affinity resulted in moving that task into
> its very own cpuset (unless it was already the only user of its cpuset),
> and if the calls to load and store task->{cpus,mems}_allowed in the
> implementation of these affinity sys calls were changed to load and
> store those affinity masks in the task's cpuset instead.
> 
> I'm just brainstorming here ... this scheme could easily have some
> fatal flaw that I'm missing at the moment.

Provided overlapping sets are allowed it should be feasible.  However, 
I'm not a big fan of overlapping sets as it would make using different 
CPU scheduling configurations in each set more difficult (maybe even 
inadvisable) but that's a different issue.

Peter
-- 
Peter Williams                                   pwil3058@bigpond.net.au

"Learning, n. The kind of ignorance distinguishing the studious."
  -- Ambrose Bierce

^ permalink raw reply	[flat|nested] 233+ messages in thread

* Re: [Lse-tech] [PATCH] cpusets - big numa cpu and memory placement
  2004-10-03  5:12                               ` Peter Williams
@ 2004-10-03  5:39                                 ` Paul Jackson
  0 siblings, 0 replies; 233+ messages in thread
From: Paul Jackson @ 2004-10-03  5:39 UTC (permalink / raw)
  To: Peter Williams
  Cc: frankeh, dipankar, akpm, ckrm-tech, efocht, mbligh, lse-tech,
	hch, steiner, jbarnes, sylvain.jeaugey, djh, linux-kernel,
	colpatch, Simon.Derr, ak, sivanich

Peter wrote:
> 
> Provided overlapping sets are allowed it should be feasible.  However, 
> I'm not a big fan of overlapping sets as it would make using different 
> CPU scheduling configurations in each set more difficult (maybe even 
> inadvisable) but that's a different issue.

One can resolve these apparently conflicting objectives by having the
scheduling configuration apply to an entire subtree of the cpuset
hierarchy.  When cpuset "a/b" is created below cpuset "a", by
default cpuset "a/b" should get reference counted links to the same
scheduler and other CKRM policies as "a" had.

Then details about what happens further down the cpuset tree, as leaf
nodes come and go, overlapping with their parents, in order to emulate
the old affinity calls, don't confuse the scheduling configuration,
which applies across the same broad swath of CPUs before the affinity
call as after.

You don't need all the cpusets non-overlapping, you just need the
ones that define the realm of a particular scheduling policy to be
non-overlapping (or to tolerate the confusions that result if they
aren't, if that's preferable - I don't know that it is.)

Indeed, the simple act of an individual task tweaking its own CPU or
Memory affinity should _not_ give it a different scheduling realm. 
Rather such a task must remain stuck in whatever realm it was in before
that affinity call.

-- 
                          I won't rest till it's the best ...
                          Programmer, Linux Scalability
                          Paul Jackson <pj@sgi.com> 1.650.933.1373

^ permalink raw reply	[flat|nested] 233+ messages in thread

* Re: [ckrm-tech] Re: [Lse-tech] [PATCH] cpusets - big numa cpu and memory placement
  2004-10-03  2:49                         ` Paul Jackson
@ 2004-10-03 12:19                           ` Hubertus Franke
  0 siblings, 0 replies; 233+ messages in thread
From: Hubertus Franke @ 2004-10-03 12:19 UTC (permalink / raw)
  To: Paul Jackson
  Cc: akpm, mef, nagar, ckrm-tech, efocht, mbligh, lse-tech, hch,
	steiner, jbarnes, sylvain.jeaugey, djh, linux-kernel, colpatch,
	Simon.Derr, ak, sivanich, llp



Paul Jackson wrote:
> Hubertus wrote:
> 
>>CKRM could do so. We already provide the name space and the class 
>>hierarchy.
> 
> 
> Just because two things have name spaces and hierarchies, doesn't
> make them interchangeable.  Name spaces and hierarchies are just
> implementation mechanisms - many interesting, entirely unrelated,
> solutions make use of them.
> 
> What are the objects named, and what is the relation underlying
> the hierarchy?  These must match up.

Object name relationships are established through the rcfs pathname.

> 
> The objects named in cpusets are subsets of a systems CPUs and Memory
> Nodes. The relation underlying the hierarchy is the subset relation on
> these sets: if one cpuset node is a descendent of another, then its
> CPUs and Memory Nodes are a subset of the others.

Exactly, the controller will enforce that in the same way we
enforce other attributes and shares.
For example, we make sure that the sum of the share "guarantees" for
all children does not exceed the total_guarantee (i.e. denominator)
of the parent.
Nothing prohibits the controller from enforcing the set constraints
you describe above and rejecting requests that are not valid.
As I said before, ideally the controller would be the cpumem set
guts and RCFS would be the API to it.
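
(A sketch of that kind of check, with made-up structure and field
names -- not actual CKRM code:)

/* Sketch: the sort of validation a CKRM-style controller does when a
 * child class asks for a new guarantee. */
static int shares_valid(struct ckrm_class *parent,
			struct ckrm_class *child, int new_guarantee)
{
	struct ckrm_class *c;
	int sum = new_guarantee;

	list_for_each_entry(c, &parent->children, siblings)
		if (c != child)
			sum += c->guarantee;

	/* children may not promise more than the parent's denominator */
	return sum <= parent->total_guarantee;
}

The same shape of check can reject a child cpuset whose CPUs or Memory
Nodes are not a subset of its parent's.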

That's what Andrew was asking for, in case the requirement for
this functionality can be (or is) made.

> 
> What is the corresponding statement for CKRM?
> 
> For CKRM to subsume cpusets, there must be an injective map from the
> above cpuset objects to CKRM objects, that preserves this subset
> relation on cpusets.
> 

See above.


^ permalink raw reply	[flat|nested] 233+ messages in thread

* Re: [ckrm-tech] Re: [Lse-tech] [PATCH] cpusets - big numa cpu and memory placement
  2004-10-03  2:26                       ` Paul Jackson
@ 2004-10-03 14:11                         ` Paul Jackson
  0 siblings, 0 replies; 233+ messages in thread
From: Paul Jackson @ 2004-10-03 14:11 UTC (permalink / raw)
  To: Paul Jackson
  Cc: akpm, frankeh, mef, nagar, ckrm-tech, efocht, mbligh, lse-tech,
	hch, steiner, jbarnes, sylvain.jeaugey, djh, linux-kernel,
	colpatch, Simon.Derr, ak, sivanich, llp

Paul wrote:
> It's a requirement, I say.  It's a requirement.  Let the slapping begin ;).

Granted, to give Andrew his due (begrudgingly ;), the requirement
to pin processes on CPUs is a requirement of the _implementation_,
which follows, for someone familiar with the art, from the two
items:
  1) The requirement of the _user_ that runtimes be repeatable
     within perhaps 1% to 5% for a certain class of job, plus
  2) The cantankerous properties of big honkin NUMA boxes.

Clearly, Andrew was looking for _user_ requirements, to which I
managed somewhat unwittingly to back up in my user case scenario.


I suspect that there is a second user case scenario, with which the Bull
or NEC folks might be more familiar than I, that can seemingly lead
to the same implementation requirement to pin jobs.  This scenario would
involve a customer who has paid good money for some compute capacity
(CPU cycles and Memory pages) with a certain guaranteed Quality of
Service, and who would prefer to see this capacity go to waste when
underutilized rather than risk it being unavailable in times of need.

However in this case, as Andrew is likely already chomping at the bit to
tell me, CKRM could provide such guaranteed compute capacities without
pinning.

Whether or not a CKRM class would sell to the customers of Bull and
NEC in lieu of a set of pinned nodes, I have no clue.

  Erich, Simon - Can you introduce a note of reality into my
		 speculations above?


The third user case scenario that commonly leads us to pinning is
support of the batch or workload managers, PBS and LSF, which are fond
of dividing the compute resources up into identifiable subsets of CPUs
and Memory Nodes that are near to each other (in terms of the NUMA
topology) and that have the size (compute capacity as measured in free
cycles and freely available ram) requested by a job, then attaching that
job to that subset and running it.

In this third case, batch or workload managers have a long history with
big honkin SMP and NUMA boxes, and this remains an important market for
them.  Consistent runtimes are valued by their customers and are a key
selling point of these products in the HPC market.  So this third case
reduces to the first, with its implementation requirement for pinning
the tasks of an active job to specific CPUs and Memory Nodes.

For example from Platform's web site (the vendor of LSF) at:
    http://www.platform.com/products/HPC
the benefits for their LSF HPC product include:
  * Guaranteed consistent and reliable parallel workload processing with
    high performance interconnect support
  * Maximized application performance with topology-aware scheduling
  * Ensures application runtime consistency by automatically allocating
    similar processors 

-- 
                          I won't rest till it's the best ...
                          Programmer, Linux Scalability
                          Paul Jackson <pj@sgi.com> 1.650.933.1373

^ permalink raw reply	[flat|nested] 233+ messages in thread

* Re: [Lse-tech] [PATCH] cpusets - big numa cpu and memory placement
  2004-10-02 23:21                       ` Peter Williams
                                           ` (3 preceding siblings ...)
  2004-10-03  3:39                         ` Paul Jackson
@ 2004-10-03 14:36                         ` Martin J. Bligh
  2004-10-03 15:39                           ` Paul Jackson
                                             ` (2 more replies)
  4 siblings, 3 replies; 233+ messages in thread
From: Martin J. Bligh @ 2004-10-03 14:36 UTC (permalink / raw)
  To: Peter Williams, Hubertus Franke
  Cc: dipankar, Paul Jackson, Andrew Morton, ckrm-tech, efocht,
	lse-tech, hch, steiner, jbarnes, sylvain.jeaugey, djh,
	linux-kernel, colpatch, Simon.Derr, ak, sivanich

>> The O(1) scheduler today does not know about cpumem sets. It operates
>> on the level of affinity masks to adhere to the constraints specified 
>> based on cpu masks.
> 
> This is where I see the need for "CPU sets".  I.e. as a 
> replacement/modification to the CPU affinity mechanism basically adding 
> an extra level of abstraction to make it easier to use for implementing 
> the type of isolation that people seem to want.  I say this because, 
> strictly speaking and as you imply, the current affinity mechanism is 
> sufficient to provide that isolation BUT it would be a huge pain to 
> implement.

The way cpusets uses the current cpus_allowed mechanism is, to me, the most
worrying thing about it. Frankly, the cpus_allowed thing is kind of tacked
onto the existing scheduler, and not at all integrated into it, and doesn't
work well if you use it heavily (eg bind all the processes to a few CPUs,
and watch the rest of the system kill itself). 

Matt had proposed having a separate sched_domain tree for each cpuset, which
made a lot of sense, but seemed harder to do in practice because "exclusive"
in cpusets doesn't really mean exclusive at all. Even if we don't have 
separate sched_domain trees, cpusets could be the top level in the master 
tree, I think.

M.


^ permalink raw reply	[flat|nested] 233+ messages in thread

* Re: [Lse-tech] [PATCH] cpusets - big numa cpu and memory placement
  2004-10-03 14:36                         ` Martin J. Bligh
@ 2004-10-03 15:39                           ` Paul Jackson
  2004-10-03 23:53                             ` Martin J. Bligh
  2004-10-03 16:02                           ` Paul Jackson
  2004-10-03 20:10                           ` Tim Hockin
  2 siblings, 1 reply; 233+ messages in thread
From: Paul Jackson @ 2004-10-03 15:39 UTC (permalink / raw)
  To: Martin J. Bligh
  Cc: pwil3058, frankeh, dipankar, akpm, ckrm-tech, efocht, lse-tech,
	hch, steiner, jbarnes, sylvain.jeaugey, djh, linux-kernel,
	colpatch, Simon.Derr, ak, sivanich

Martin wrote:
> Matt had proposed having a separate sched_domain tree for each cpuset, which
> made a lot of sense, but seemed harder to do in practice because "exclusive"
> in cpusets doesn't really mean exclusive at all.

See my comments on this from yesterday on this thread.

I suspect we don't want a distinct sched_domain for each cpuset, but
rather a sched_domain for each of several entire subtrees of the cpuset
hierarchy, such that every CPU is in exactly one such sched domain, even
though it be in several cpusets in that sched_domain.  Perhaps each
cpuset in such a subtree points to the same reference counted
sched_domain, or perhaps each cpuset except the one at the root of the
subtree has a flag set, telling the scheduler to search up the cpuset
tree to find a sched_domain.  Probably the former, for performance
reasons.

Since I can see even my own eyes glazing over trying to read what I just
wrote, let me give an example.

Let's say we have a 256 CPU system.  At the top level, we divide it into
five non-overlapping cpusets, of sizes 64, 64, 32, 28 and 4.  Each of
these five cpusets has its sched_domain, except the third one, of 32 CPUs.
That one is subdivided into 4 cpusets, of 8 CPUs each, non-overlapping,
each of the four with its own sched_domain.

[Aside - granted this is topologically equivalent to the flattened
partitioning into the eight cpusets of sizes 64, 64, 8, 8, 8, 8, 28 and
4.  Perhaps the 32 CPUs were farmed out to the Professor of Eccentric
Economics, who has permission to manage his 32 CPUs and divide them
further, but who lacks permission to modify the top layer of the cpuset
hierarchy.]

So we have eight cpusets, non-overlapping and covering the entire
system, each with its own sched_domain.  Now within those cpusets,
for various application reasons, further subdivisions occur.  But
no more sched_domains are created, and the existing sched_domains
apply to all tasks attached to any cpuset in their cpuset subtree.
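
(In data structure terms, a sketch of what I'm waving my hands at, with
invented names:)

/* Hypothetical: one of these per cpuset-subtree root.  Every cpuset in
 * the subtree shares it, so each CPU sits under exactly one domain. */
struct cpuset_domain {
	atomic_t		refcount;
	struct sched_domain	*sd;
};

static void cpuset_inherit_domain(struct cpuset *child,
				  struct cpuset *parent)
{
	child->domain = parent->domain;	/* hypothetical ->domain field */
	atomic_inc(&child->domain->refcount);
}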

On the other topic you raise, of the meaning (or lack thereof) of
"exclusive".  Perhaps "exclusive" should not a property of a node in
this tree, but rather a property of a node under a certain covering or
mapping.  You note we need a map from the set of CPUs to the set of
sched_domains, specifying for each CPU its unique sched_domain.  And we
might have some other map on these same CPUs or Memory Nodes for other
purposes.  I am afraid I've forgotten too much of my math from long long
ago to state this with exactly the right terms.  But I can imagine
adding a little bit more code to cpusets, that kept a small list of such
mappings over the domains of CPUs and Memory Nodes, and that validated,
on each cpuset change, that each mapping preserved whatever properties
of covering and non-overlapping that it was marked for.  One of these
mappings could be into the range of sched_domains and be marked for both
covering and non-overlapping.

-- 
                          I won't rest till it's the best ...
                          Programmer, Linux Scalability
                          Paul Jackson <pj@sgi.com> 1.650.933.1373

^ permalink raw reply	[flat|nested] 233+ messages in thread

* Re: [Lse-tech] [PATCH] cpusets - big numa cpu and memory placement
  2004-10-03 14:36                         ` Martin J. Bligh
  2004-10-03 15:39                           ` Paul Jackson
@ 2004-10-03 16:02                           ` Paul Jackson
  2004-10-03 23:47                             ` Martin J. Bligh
  2004-10-03 20:10                           ` Tim Hockin
  2 siblings, 1 reply; 233+ messages in thread
From: Paul Jackson @ 2004-10-03 16:02 UTC (permalink / raw)
  To: Martin J. Bligh
  Cc: pwil3058, frankeh, dipankar, akpm, ckrm-tech, efocht, lse-tech,
	hch, steiner, jbarnes, sylvain.jeaugey, djh, linux-kernel,
	colpatch, Simon.Derr, ak, sivanich

Martin wrote:
> The way cpusets uses the current cpus_allowed mechanism is, to me, the most
> worrying thing about it. Frankly, the cpus_allowed thing is kind of tacked
> onto the existing scheduler, and not at all integrated into it, and doesn't
> work well if you use it heavily (eg bind all the processes to a few CPUs,
> and watch the rest of the system kill itself). 

True.  One detail of what you say I'm unclear on -- how will the rest of
the system kill itself?  Why wouldn't the unemployed CPUs just idle
around, waiting for something to do?

As I recall, Ingo added task->cpus_allowed for the Tux in-kernel web
server a few years back, and I piggy backed the cpuset stuff on that, to
keep my patch size small.

Likely your same concerns apply to the task->mems_allowed field that
I added, in the same fashion, in my cpuset patch of recent.

We need a mechanism that the cpuset apparatus respects that maps each
CPU to a sched_domain, exactly one sched_domain for any given CPU at any
point in time, regardless of which task it is considering running at the
moment.  Somewhat like dual-channeled disks, having more than one
sched_domain apply at the same time to a given CPU leads to confusions
best avoided unless desperately needed.  Unlike dual-channeled disks, I
don't see the desperate need here for multi-channel sched_domains ;).

And of course, for the vast majority of normal systems in the world
not configured with cpusets, this has to collapse back to something
sensible "just like it is now."

-- 
                          I won't rest till it's the best ...
                          Programmer, Linux Scalability
                          Paul Jackson <pj@sgi.com> 1.650.933.1373

^ permalink raw reply	[flat|nested] 233+ messages in thread

* Re: [Lse-tech] [PATCH] cpusets - big numa cpu and memory placement
  2004-10-03 14:36                         ` Martin J. Bligh
  2004-10-03 15:39                           ` Paul Jackson
  2004-10-03 16:02                           ` Paul Jackson
@ 2004-10-03 20:10                           ` Tim Hockin
  2004-10-04  1:56                             ` Paul Jackson
  2 siblings, 1 reply; 233+ messages in thread
From: Tim Hockin @ 2004-10-03 20:10 UTC (permalink / raw)
  To: Martin J. Bligh
  Cc: Peter Williams, Hubertus Franke, dipankar, Paul Jackson,
	Andrew Morton, ckrm-tech, efocht, lse-tech, hch, steiner,
	jbarnes, sylvain.jeaugey, djh, linux-kernel, colpatch,
	Simon.Derr, ak, sivanich

On Sun, Oct 03, 2004 at 07:36:46AM -0700, Martin J. Bligh wrote:
> > This is where I see the need for "CPU sets".  I.e. as a 
> > replacement/modification to the CPU affinity mechanism basically adding 
> > an extra level of abstraction to make it easier to use for implementing 
> > the type of isolation that people seem to want.  I say this because, 
> > strictly speaking and as you imply, the current affinity mechanism is 
> > sufficient to provide that isolation BUT it would be a huge pain to 
> > implement.
> 
> The way cpusets uses the current cpus_allowed mechanism is, to me, the most
> worrying thing about it. Frankly, the cpus_allowed thing is kind of tacked
> onto the existing scheduler, and not at all integrated into it, and doesn't
> work well if you use it heavily (eg bind all the processes to a few CPUs,
> and watch the rest of the system kill itself). 

7 years ago, before cpus_allowed was dreamed up, I proposed a pset patch
and was shot down hard.  Now it's back, and we're trying to find a way to
cram it in on top.

Yeah, it does not fit nicely with cpus_allowed.

I have to ask - do we REALLY need cpusets?  I mean, even SGI dropped
PSET at some point, because (if I recall) NO ONE USED IT.

What's the problem being solved that *requires* psets?

I have a customer I work with periodically who was using my pset patch up
until they moved to RH8, when the O(1) scheduler and cpus_allowed changed
everything.  This was their requirement for pset:

1. Take a processor out of the general execution pool (call it
PROC_RESTRICTED).  This processor will not schedule general tasks.
2. Assign a task to the PROC_RESTRICTED cpu.  Now that CPU will only
schedule the assigned task (and its children).
3. Repeat for every CPU, with the caveat that one CPU must remain
PROC_ENABLED.

I had an array of enum procstate and a new syscall pair:
sched_{gs}etprocstate().  The scheduler checks the procstate, and if it is
not ENABLED, it checks that (cpus_allowed == 1<<cpu).  Simple, but works.
Could be baked a bit more, for general use.
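
(Reconstructed from memory as a sketch, in current cpumask terms -- not
the original patch:)

enum procstate { PROC_ENABLED, PROC_RESTRICTED };

static enum procstate procstate[NR_CPUS];

/* Sketch of the check on the wakeup/balance paths: may @p run on @cpu? */
static int cpu_accepts_task(int cpu, struct task_struct *p)
{
	if (procstate[cpu] == PROC_ENABLED)
		return cpu_isset(cpu, p->cpus_allowed);

	/* restricted CPU: only tasks bound to exactly this CPU run here */
	return cpus_equal(p->cpus_allowed, cpumask_of_cpu(cpu));
}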

What if I proposed a patch like this, now?  It would require cleanup for
2.6, but I'm game if it's useful.

Tim


^ permalink raw reply	[flat|nested] 233+ messages in thread

* Re: [Lse-tech] [PATCH] cpusets - big numa cpu and memory placement
  2004-10-02  6:06                 ` Paul Jackson
  2004-10-02 14:55                   ` Dipankar Sarma
@ 2004-10-03 20:21                   ` Erich Focht
  2004-10-03 20:48                     ` Andrew Morton
                                       ` (2 more replies)
  1 sibling, 3 replies; 233+ messages in thread
From: Erich Focht @ 2004-10-03 20:21 UTC (permalink / raw)
  To: Paul Jackson
  Cc: Andrew Morton, nagar, ckrm-tech, mbligh, lse-tech, hch, steiner,
	jbarnes, sylvain.jeaugey, djh, linux-kernel, colpatch,
	Simon.Derr, ak, sivanich

> The other declared potential users of cpusets, Bull and NEC at
> least, seem from what I can tell to have a somewhat different
> focus, toward providing a mix of compute services with minimum
> interference, from what I'd guess are more departmental size
> systems.
> 
> Bull (Simon) and NEC (Erich) should also look closely at CKRM,
> and then try to describe their requirements, so we can understand
> whether CKRM, cpusets or both or neither can meet their needs.

The requirements I have in mind come from our customers: users,
benchmarkers, administrators and compute center management. They are
used to our kind of big iron, the NEC SX (earth simulator style
hardware) which is running a proprietary Unix and has a few amenities
not present in Linux. Among them: gang scheduling (even across
machines for big parallel jobs), resource groups and tight integration
of these features with the batch resource manager.

Can cpusets help me/us/Linux to get closer to these requirements?

A clear yes. Regard cpusets as a new kind of composite resource built
from memory and CPUs. They can play the role of the resource groups we
need. Disjoint cpusets can run jobs which will almost never interfere
cpu-cycle or memory-wise. This can be easily integrated into PBS/LSF
or whatever batch resource manager comes to your mind. Cpusets
selected with some knowledge of the NUMA characteristics of a machine
always guarantee reproducible and best compute performance. If a job
runs alone in a cpuset it will run as if the machine has been reduced
to that piece and is owned exclusively by the job. Also, if the set
contains as many CPUs as MPI processes, the cpuset helps to get some
sort of gang scheduling (i.e. all members of a parallel process get
cycles at the same time; this reduces barrier synchronisation times,
improves performance and makes it more predictable). This is something
one absolutely needs on big machines when dealing with time critical,
highest performance applications. Permanently losing 10% because the
CPU placement is poor or because one has to get some other process out
of the way is just unacceptable. When you sell machines for several
millions, a 10% performance loss translates to quite some amount of
money.

Can CKRM (as it is now) fulfil the requirements?

I don't think so. CKRM gives me to some extent the confidence that I
will really use the part of the machine for which I paid, say 50%. But
it doesn't care about the structure of the machine. CKRM tries giving
a user as much of the machine as possible, at least the amount he paid
for. For example: When I come in with my job the machine might be
already running another job whose user also paid for 50% but was the
only user and got 100% of the machine (say some Java application with
enough threads...). This job maybe has filled up most of the memory
and uses all CPUs. CKRM will take care of getting me cycles (maybe
exclusively on 50% of the CPUs) and will treat my job preferentially
when allocating memory, but will not care about the placement of the
CPUs and the memory. Neither will it care whether the previously
running job is still using my memory blocks and reducing my bandwidth
to them. So I get 50% of the cycles and the memory but these will be
BAD CYCLES and BAD MEMORY. My job will run slower than possible and a
second run will again be different. Don't misunderstand me: CKRM in
its current state is great for other things, and running it inside
a cpuset sounds like a good thing to do.

What about integration with PBS/LSF and alike?

It makes sense to let an external resource manager (batch or
non-batch) keep track of and manage cpuset resources. It can allocate
them and give them to jobs (exclusively) and delete them. That's
perfect and exactly what we want. CKRM is a resource manager itself
and has an own idea about resources. Certainly PBS/LSF/etc. could
create a CKRM class for each job and run it in this class. The
difficulty is to keep the resource managers from interfering and working
against each other. In such a setup I'd rather expect a batch manager
to be started inside one CKRM class and let it ensure that e.g. the
interactive class isn't starved by the batch class.

Can CKRM be extended to do what cpusets do? 

Certainly. Probably easily. But cpusets will have to be reinvented, I
guess. Same hooks, same checks, different user interface...

Erich


^ permalink raw reply	[flat|nested] 233+ messages in thread

* Re: [Lse-tech] [PATCH] cpusets - big numa cpu and memory placement
  2004-10-03 20:21                   ` Erich Focht
@ 2004-10-03 20:48                     ` Andrew Morton
  2004-10-04 14:05                       ` Erich Focht
  2004-10-04  3:41                     ` Paul Jackson
  2004-10-04 13:58                     ` Hubertus Franke
  2 siblings, 1 reply; 233+ messages in thread
From: Andrew Morton @ 2004-10-03 20:48 UTC (permalink / raw)
  To: Erich Focht
  Cc: pj, nagar, ckrm-tech, mbligh, lse-tech, hch, steiner, jbarnes,
	sylvain.jeaugey, djh, linux-kernel, colpatch, Simon.Derr, ak,
	sivanich

Erich Focht <efocht@hpce.nec.com> wrote:
>
> Can CKRM (as it is now) fulfil the requirements?
> 
>  I don't think so. CKRM gives me to some extent the confidence that I
>  will really use the part of the machine for which I paid, say 50%. But
>  it doesn't care about the structure of the machine.

Right.   That's a restriction of the currently-implemented CKRM controllers.

> ...
>  Can CKRM be extended to do what cpusets do? 
> 
>  Certainly. Probably easily. But cpusets will have to be reinvented, I
>  guess. Same hooks, same checks, different user interface...

Well if it is indeed the case that the CKRM *framework* is up to the task
of being used to deliver the cpuset functionality then that's the way we
should go, no?  It's more work and requires coordination and will deliver
later, but the eventual implementation will be better.

But I'm still not 100% confident that the CKRM framework is suitable. 
Mainly because the CKRM and cpuset teams don't seem to have looked at each
other's stuff enough yet.

^ permalink raw reply	[flat|nested] 233+ messages in thread

* Re: [Lse-tech] [PATCH] cpusets - big numa cpu and memory placement
  2004-10-03 16:02                           ` Paul Jackson
@ 2004-10-03 23:47                             ` Martin J. Bligh
  2004-10-04  3:33                               ` Paul Jackson
  0 siblings, 1 reply; 233+ messages in thread
From: Martin J. Bligh @ 2004-10-03 23:47 UTC (permalink / raw)
  To: Paul Jackson
  Cc: pwil3058, frankeh, dipankar, akpm, ckrm-tech, efocht, lse-tech,
	hch, steiner, jbarnes, sylvain.jeaugey, djh, linux-kernel,
	colpatch, Simon.Derr, ak, sivanich

--Paul Jackson <pj@sgi.com> wrote (on Sunday, October 03, 2004 09:02:09 -0700):

> Martin wrote:
>> The way cpusets uses the current cpus_allowed mechanism is, to me, the most
>> worrying thing about it. Frankly, the cpus_allowed thing is kind of tacked
>> onto the existing scheduler, and not at all integrated into it, and doesn't
>> work well if you use it heavily (eg bind all the processes to a few CPUs,
>> and watch the rest of the system kill itself). 
> 
> True.  One detail of what you say I'm unclear on -- how will the rest of
> the system kill itself?  Why wouldn't the unemployed CPUs just idle
> around, waiting for something to do?

I think last time I looked they just sat there saying:

Rebalance!  
Ooooh, CPU 3 over there looks heavily loaded, I'll steal something.
That one. Try to migrate. Oops, no cpus_allowed bars me.
That one. Try to migrate. Oops, no cpus_allowed bars me.
That one. Try to migrate. Oops, no cpus_allowed bars me.
That one. Try to migrate. Oops, no cpus_allowed bars me.
That one. Try to migrate. Oops, no cpus_allowed bars me.
That one. Try to migrate. Oops, no cpus_allowed bars me.
That one. Try to migrate. Oops, no cpus_allowed bars me.
That one. Try to migrate. Oops, no cpus_allowed bars me.
That one. Try to migrate. Oops, no cpus_allowed bars me.
That one. Try to migrate. Oops, no cpus_allowed bars me.
Humpf. I give up.
Rebalance!  
Ooooh, CPU 3 over there looks heavily loaded, I'll steal something.
That one. Try to migrate. Oops, no cpus_allowed bars me.
That one. Try to migrate. Oops, no cpus_allowed bars me.
That one. Try to migrate. Oops, no cpus_allowed bars me.
That one. Try to migrate. Oops, no cpus_allowed bars me.
That one. Try to migrate. Oops, no cpus_allowed bars me.
That one. Try to migrate. Oops, no cpus_allowed bars me.
That one. Try to migrate. Oops, no cpus_allowed bars me.
That one. Try to migrate. Oops, no cpus_allowed bars me.
That one. Try to migrate. Oops, no cpus_allowed bars me.
That one. Try to migrate. Oops, no cpus_allowed bars me.
Humpf. I give up.
Rebalance!  
Ooooh, CPU 3 over there looks heavily loaded, I'll steal something.
That one. Try to migrate. Oops, no cpus_allowed bars me.
That one. Try to migrate. Oops, no cpus_allowed bars me.
That one. Try to migrate. Oops, no cpus_allowed bars me.
That one. Try to migrate. Oops, no cpus_allowed bars me.
That one. Try to migrate. Oops, no cpus_allowed bars me.
That one. Try to migrate. Oops, no cpus_allowed bars me.
That one. Try to migrate. Oops, no cpus_allowed bars me.
That one. Try to migrate. Oops, no cpus_allowed bars me.
That one. Try to migrate. Oops, no cpus_allowed bars me.
That one. Try to migrate. Oops, no cpus_allowed bars me.
Humpf. I give up.
... ad infinitum.

Desperately boring, and rather ineffective.
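
The check that bars the balancer each time around is essentially the
per-task cpus_allowed test; a minimal sketch, loosely modelled on the
2.6-era balancer's can_migrate_task() (abridged for illustration, not
the exact source):

	/*
	 * Sketch of why the idle CPU walks away empty-handed: for every
	 * candidate task on the busy runqueue, the pull is refused as
	 * soon as this_cpu is not in the task's cpus_allowed mask.
	 */
	static inline int can_migrate_task_sketch(struct task_struct *p, int this_cpu)
	{
		if (!cpu_isset(this_cpu, p->cpus_allowed))
			return 0;	/* "Oops, no cpus_allowed bars me." */
		/* (the real code also skips running and cache-hot tasks) */
		return 1;
	}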

> As I recall, Ingo added task->cpus_allowed for the Tux in-kernel web
> server a few years back, and I piggy backed the cpuset stuff on that, to
> keep my patch size small.
> 
> Likely your same concerns apply to the task->mems_allowed field that
> I added, in the same fashion, in my cpuset patch of recent.

Mmm, I'm less concerned about that one, or at least I can't specifically
see how it breaks.
 
> We need a mechanism that the cpuset apparatus respects that maps each
> CPU to a sched_domain, exactly one sched_domain for any given CPU at any
> point in time, regardless of which task it is considering running at the
> moment.  Somewhat like dual-channeled disks, having more than one
> sched_domain apply at the same time to a given CPU leads to confusions
> best avoided unless desperately needed. 

Agreed. The cpus_allowed mechanism doesn't seem well suited to heavy use
anyway (I think John Hawkes had problems with it too). That's not your
fault ... but I'm not convinced it's a good foundation to be building
further things on either ;-)

M.


^ permalink raw reply	[flat|nested] 233+ messages in thread

* Re: [Lse-tech] [PATCH] cpusets - big numa cpu and memory placement
  2004-10-03 15:39                           ` Paul Jackson
@ 2004-10-03 23:53                             ` Martin J. Bligh
  2004-10-04  0:02                               ` Martin J. Bligh
                                                 ` (2 more replies)
  0 siblings, 3 replies; 233+ messages in thread
From: Martin J. Bligh @ 2004-10-03 23:53 UTC (permalink / raw)
  To: Paul Jackson
  Cc: pwil3058, frankeh, dipankar, akpm, ckrm-tech, efocht, lse-tech,
	hch, steiner, jbarnes, sylvain.jeaugey, djh, linux-kernel,
	colpatch, Simon.Derr, ak, sivanich, colpatch

> Martin wrote:
>> Matt had proposed having a separate sched_domain tree for each cpuset, which
>> made a lot of sense, but seemed harder to do in practice because "exclusive"
>> in cpusets doesn't really mean exclusive at all.
> 
> See my comments on this from yesterday on this thread.
> 
> I suspect we don't want a distinct sched_domain for each cpuset, but
> rather a sched_domain for each of several entire subtrees of the cpuset
> hierarchy, such that every CPU is in exactly one such sched domain, even
> though it be in several cpusets in that sched_domain.

Mmmm. The fundamental problem I think we ran across (just whilst pondering,
not in code) was that some things (eg ... init) are bound to ALL cpus (or
no cpus, depending how you word it); i.e. they're created before the cpusets
are, and are a member of the grand-top-level-uber-master-thingummy.

How do you service such processes? That's what I meant by the exclusive
domains aren't really exclusive. 

Perhaps Matt can recall the problems better. I really liked his idea, aside
from the small problem that it didn't seem to work ;-)

> So we have eight cpusets, non-overlapping and covering the entire
> system, each with its own sched_domain.

But that's the problem ... I think there are *always* cpusets that overlap.
Which is sad (fixable?) because it breaks lots of intelligent things we
could do. 

> purposes.  I am afraid I've forgotten too much of my math from long long
> ago to state this with exactly the right terms.

That's OK, so have most of the rest of us, so even if you could remember,
it wouldn't help much ;-)

M.


^ permalink raw reply	[flat|nested] 233+ messages in thread

* Re: [Lse-tech] [PATCH] cpusets - big numa cpu and memory placement
  2004-10-03 23:53                             ` Martin J. Bligh
@ 2004-10-04  0:02                               ` Martin J. Bligh
  2004-10-04  0:53                                 ` Paul Jackson
  2004-10-04  0:45                               ` Paul Jackson
  2004-10-05 22:19                               ` Matthew Dobson
  2 siblings, 1 reply; 233+ messages in thread
From: Martin J. Bligh @ 2004-10-04  0:02 UTC (permalink / raw)
  To: Paul Jackson
  Cc: pwil3058, frankeh, dipankar, akpm, ckrm-tech, efocht, lse-tech,
	hch, steiner, jbarnes, sylvain.jeaugey, djh, linux-kernel,
	colpatch, Simon.Derr, ak, sivanich

--"Martin J. Bligh" <mbligh@aracnet.com> wrote (on Sunday, October 03, 2004 16:53:40 -0700):

>> Martin wrote:
>>> Matt had proposed having a separate sched_domain tree for each cpuset, which
>>> made a lot of sense, but seemed harder to do in practice because "exclusive"
>>> in cpusets doesn't really mean exclusive at all.
>> 
>> See my comments on this from yesterday on this thread.
>> 
>> I suspect we don't want a distinct sched_domain for each cpuset, but
>> rather a sched_domain for each of several entire subtrees of the cpuset
>> hierarchy, such that every CPU is in exactly one such sched domain, even
>> though it be in several cpusets in that sched_domain.
> 
> Mmmm. The fundamental problem I think we ran across (just whilst pondering,
> not in code) was that some things (eg ... init) are bound to ALL cpus (or
> no cpus, depending how you word it); i.e. they're created before the cpusets
> are, and are a member of the grand-top-level-uber-master-thingummy.
> 
> How do you service such processes? That's what I meant by the exclusive
> domains aren't really exclusive. 
> 
> Perhaps Matt can recall the problems better. I really liked his idea, aside
> from the small problem that it didn't seem to work ;-)
> 
>> So we have eight cpusets, non-overlapping and covering the entire
>> system, each with its own sched_domain.
> 
> But that's the problem ... I think there are *always* cpusets that overlap.
> Which is sad (fixable?) because it breaks lots of intelligent things we
> could do. 

Hmmm. What if when you created a new, exclusive CPUset, the cpus you spec'ed
were *removed* from the parent CPUset (and existing processes forcibly
migrated off). That'd fix most of it, and would bring us much closer to the
true meaning of "exclusive". Changes your semantics a bit, but still ...

OK, so there is one problem I can see - you couldn't remove the last CPU
from the parent if there were any jobs running in it, but presumably fixable
(eg you have to move them into the created child, or fail the call).

M.


^ permalink raw reply	[flat|nested] 233+ messages in thread

* Re: [Lse-tech] [PATCH] cpusets - big numa cpu and memory placement
  2004-10-03 23:53                             ` Martin J. Bligh
  2004-10-04  0:02                               ` Martin J. Bligh
@ 2004-10-04  0:45                               ` Paul Jackson
  2004-10-04 11:44                                 ` Rick Lindsley
  2004-10-05 22:19                               ` Matthew Dobson
  2 siblings, 1 reply; 233+ messages in thread
From: Paul Jackson @ 2004-10-04  0:45 UTC (permalink / raw)
  To: Martin J. Bligh
  Cc: pwil3058, frankeh, dipankar, akpm, ckrm-tech, efocht, lse-tech,
	hch, steiner, jbarnes, sylvain.jeaugey, djh, linux-kernel,
	colpatch, Simon.Derr, ak, sivanich

Martin wrote:
>
> Mmmm. The fundamental problem I think we ran across (just whilst pondering,
> not in code) was that some things (eg ... init) are bound to ALL cpus (or
> no cpus, depending how you word it); i.e. they're created before the cpusets
> are, and are a member of the grand-top-level-uber-master-thingummy.
> 
> How do you service such processes? That's what I meant by the exclusive
> domains aren't really exclusive. 

I move 'em.  I have user code that identifies the kernel threads whose
cpus_allowed is a superset of cpus_online_map, and I put them in a nice
little padded cell with init and the classic Unix daemons, called the
'bootcpuset'.

The tasks whose cpus_allowed is a strict _subset_ of cpus_online_map
need to be where they are.  These are things like the migration helper
threads, one for each cpu.  They get a license to violate cpuset
boundaries.

I will probably end up submitting a patch at some point, that changes
two lines, one in ____call_usermodehelper() and one in kthread(), from
setting the cpus_allowed on certain kernel threads to CPU_MASK_ALL, so
that instead these lines set that cpus_allowed to a new mask, a kernel
global variable that can be read and written via the cpuset api.  But
other than that, I don't need any more kernel hooks than I already have,
and even now, I can get everything that's causing me any grief pinned
into the bootcpuset.
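
For concreteness, a sketch of the sort of two-line change described
above; the global mask name here is purely hypothetical, and the two
call sites are the existing set_cpus_allowed(current, CPU_MASK_ALL)
lines in kthread() and ____call_usermodehelper():

	/* Hypothetical global, read and written through the cpuset api;
	 * defaults to the old behaviour. */
	cpumask_t cpuset_kthread_mask = CPU_MASK_ALL;

	/* Each of the two changed call sites would then go from
	 *
	 *	set_cpus_allowed(current, CPU_MASK_ALL);
	 * to
	 *	set_cpus_allowed(current, cpuset_kthread_mask);
	 */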


> But that's the problem ... I think there are *always* cpusets that overlap.
> Which is sad (fixable?) because it breaks lots of intelligent things we
> could do. 

So with my bootcpuset, the problem is reduced, to a few tasks per CPU,
such as the migration threads, which must remain pinned on their one CPU
(or perhaps on just the CPUs local to one Memory Node).  These tasks
remain in the root cpuset, which by the scheme we're contemplating,
doesn't get a sched_domain in the fancier configurations.

Yup - you're right - these tasks will also want the scheduler to give
them CPU time when they need it.  Hmmm ... logically this violates our
nice schemes, but it seems we are down to such a small exception case
that there must be some primitive way to work around this.

We basically need to keep a list of the 4 or 5 per-cpu kernel threads,
and whenever we repartition the sched_domains, make sure that each such
kernel thread is bound to whatever sched_domain happens to be covering
that cpu.  If we just wrote the code, and quit trying to find a grand
unifying theory to explain it consistently with the rest of our design,
it would probably work just fine.

The cpuset code would have to be careful, when it came time to list the
tasks attached to a cpuset (Workload Manager software is fond of this
call) _not_ to list these indigenous (not "migrant" !) worker threads.
And when listing the tasks in the root cpuset, _do_ include them.

-- 
                          I won't rest till it's the best ...
                          Programmer, Linux Scalability
                          Paul Jackson <pj@sgi.com> 1.650.933.1373

^ permalink raw reply	[flat|nested] 233+ messages in thread

* Re: [Lse-tech] [PATCH] cpusets - big numa cpu and memory placement
  2004-10-04  0:02                               ` Martin J. Bligh
@ 2004-10-04  0:53                                 ` Paul Jackson
  2004-10-04  3:56                                   ` Martin J. Bligh
  0 siblings, 1 reply; 233+ messages in thread
From: Paul Jackson @ 2004-10-04  0:53 UTC (permalink / raw)
  To: Martin J. Bligh
  Cc: pwil3058, frankeh, dipankar, akpm, ckrm-tech, efocht, lse-tech,
	hch, steiner, jbarnes, sylvain.jeaugey, djh, linux-kernel,
	colpatch, Simon.Derr, ak, sivanich

Martin wrote:
> (and existing processes forcibly migrated off)

No can do.  As described in my previous message, everything is happily
moved already, with some user code (and a CPU_MASK_ALL patch to kthread
I haven't submitted yet) _except_ for a few per-CPU threads such as the
migration helpers, which can _not_ be moved off their respective CPUs.

-- 
                          I won't rest till it's the best ...
                          Programmer, Linux Scalability
                          Paul Jackson <pj@sgi.com> 1.650.933.1373

^ permalink raw reply	[flat|nested] 233+ messages in thread

* Re: [Lse-tech] [PATCH] cpusets - big numa cpu and memory placement
  2004-10-03 20:10                           ` Tim Hockin
@ 2004-10-04  1:56                             ` Paul Jackson
  0 siblings, 0 replies; 233+ messages in thread
From: Paul Jackson @ 2004-10-04  1:56 UTC (permalink / raw)
  To: Tim Hockin
  Cc: mbligh, pwil3058, frankeh, dipankar, akpm, ckrm-tech, efocht,
	lse-tech, hch, steiner, jbarnes, sylvain.jeaugey, djh,
	linux-kernel, colpatch, Simon.Derr, ak, sivanich

Tim wrote:
> 7 years ago, before cpus_allowed was dreamed up, I proposed a pset patch

One more thing ... the original message from Simon and Sylvain that I
first saw a year ago announcing their cpuset work, which is the basis
for the current cpuset patch in Andrew's tree, began with the lines:


> From: Simon Derr <Simon.Derr@bull.net>
> Date: Wed, 24 Sep 2003 17:59:01 +0200 (DFT)
> To: lse-tech@lists.sourceforge.net, linux-ia64@vger.kernel.org
> cc: Sylvain Jeaugey <sylvain.jeaugey@bull.net>
> 
> We have developed a new feature in the Linux kernel, controlling CPU
> placements, which are useful on large SMP machines, especially NUMA ones.
> We call it CPUSETS, and we would highly appreciate to know about anyone
> who would be interested in such a feature. This has been somewhat inspired
> by the pset or cpumemset patches existing for Linux 2.4.


So I guess, Tim, you (pset) and I (cpumemset) can both claim to
have developed antecedents of this current cpuset proposal.

-- 
                          I won't rest till it's the best ...
                          Programmer, Linux Scalability
                          Paul Jackson <pj@sgi.com> 1.650.933.1373

^ permalink raw reply	[flat|nested] 233+ messages in thread

* Re: [Lse-tech] [PATCH] cpusets - big numa cpu and memory placement
  2004-10-03 23:47                             ` Martin J. Bligh
@ 2004-10-04  3:33                               ` Paul Jackson
  0 siblings, 0 replies; 233+ messages in thread
From: Paul Jackson @ 2004-10-04  3:33 UTC (permalink / raw)
  To: Martin J. Bligh
  Cc: pwil3058, frankeh, dipankar, akpm, ckrm-tech, efocht, lse-tech,
	hch, steiner, jbarnes, sylvain.jeaugey, djh, linux-kernel,
	colpatch, Simon.Derr, ak, sivanich, raybry

Martin wrote:
> Rebalance!  
> Ooooh, CPU 3 over there looks heavily loaded, I'll steal something.
> That one. Try to migrate. Oops, no cpus_allowed bars me.
> ...
> Humpf. I give up.
> ... ad infinitum.
> 
> Desperately boring, and rather ineffective.

Well ... I don't mind unemployed CPUs being boring.  It's not that they
have much useful work to do.  But if they keep beating down the doors of
their neighbors trying to find work, that seems disruptive.  Won't CPU 3
in your example waste time and suffer increased lock contention,
responding to its deadbeat neighbor?


> > Likely your same concerns apply to the task->mems_allowed field that
> > I added, in the same fashion, in my cpuset patch of recent.
> 
> Mmm, I'm less concerned about that one, or at least I can't specifically
> see how it breaks.

Ray Bryant <raybry@sgi.com> is working on this now.  There are ways to get
memory allocated that hurt on our big boxes - such as blowing out one
node's memory with a disproportionate share of the system's page cache
pages, due to problems vaguely like the cpus_allowed ones.

The kernel allocator and numa placement policies don't really integrate
mems_allowed into their algorithms, but rather are just whacked upside
the head anytime they ask if they can allocate on a non-allowed node.
They can end up doing suboptimal placement on big boxes.

A common one is that the first node in a multiple-node cpuset gets a
bigger memory load from allocations initiated on nodes upstream of it
that weren't allowed to roost closer to home (or something like this ...
not sure I said this one just right).

Ray is leaning on me to get some kind of memory policy in each cpuset.
I'm giving him a hard time back over details of what this policy
structure should look like, buying time while I try to make more sense
of this all.

I've added him to the cc list here - hopefully he will find my
characterization of our discussions amusing ;).


> > Somewhat like dual-channeled disks, having more than one
> > sched_domain apply at the same time to a given CPU leads to confusions
> best avoided unless desperately needed. 
> 
> Agreed. The cpus_allowed mechanism doesn't seem well suited to heavy use
> anyway (I think John Hawkes had problems with it too).

The problems Hawkes had were various race conditions using the
new (at the time) set_cpus_allowed() that Ingo (I believe) added as part
of the O(1) scheduler.  SGI was on the bleeding edge of using the
set_cpus_allowed() call in new and exciting ways, and there were various
race and locking issues, plus problems with making sure the per-cpu
migration threads stayed home.

Other than reminding us that this stuff is hard, these problems Hawkes
dealt with don't, to my understanding, shed any light on the new issue
uncovered in this thread, that a simple per-task cpus_allowed mask,
heavily used to affect affinity policy, can interact poorly with
sophisticated schedulers trying to balance an entire system.

===

In sum, I am tending further in the direction of thinking we need to
have scheduler and allocation policies handled on a "per-domain" basis,
where these domains take the form of a partition of the system into
equivalence classes corresponding to subtrees of the cpuset hierarchy.

For example, just to throw out a wild and crazy idea: perhaps instead of
one global set of zonelists (one per node, each containing all nodes,
sorted in various numa-friendly orders), there should be a set of
zonelists per memory-domain, containing just the nodes therein
(subsetted from the global zonelists, preserving order).
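
A tiny user-space model of that subsetting (not kernel code, and all
names here are made up): filter the global, numa-friendly node order
through the memory-domain's node mask, preserving order.

	#include <stdio.h>

	#define MAX_NODES 8

	/* Keep only the nodes owned by this memory-domain, preserving the
	 * global fallback order.  Returns how many nodes were kept. */
	static int build_domain_zonelist(const int *global_order, int n,
					 unsigned long domain_nodes, int *out)
	{
		int i, kept = 0;

		for (i = 0; i < n; i++)
			if (domain_nodes & (1UL << global_order[i]))
				out[kept++] = global_order[i];
		return kept;
	}

	int main(void)
	{
		int global_order[MAX_NODES] = { 0, 1, 2, 3, 4, 5, 6, 7 };
		int out[MAX_NODES], i;
		/* a memory-domain owning nodes 2, 3 and 6 */
		int n = build_domain_zonelist(global_order, MAX_NODES,
					      (1UL << 2) | (1UL << 3) | (1UL << 6), out);

		for (i = 0; i < n; i++)
			printf("fallback[%d] = node %d\n", i, out[i]);
		return 0;
	}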

We'll have to be careful here.  I suspect that the tolerance of those
running normal sized systems for this kind of crap will be pretty low.

Moreover, the scheduler in particular, and the allocator somewhat as
well, are areas with a long history of intense technical development.
Our impact on these areas has to be simple, so that folks doing the
real work here can keep our multi-domain stuff working with almost no
mind to it at all.

-- 
                          I won't rest till it's the best ...
                          Programmer, Linux Scalability
                          Paul Jackson <pj@sgi.com> 1.650.933.1373

^ permalink raw reply	[flat|nested] 233+ messages in thread

* Re: [Lse-tech] [PATCH] cpusets - big numa cpu and memory placement
  2004-10-03 20:21                   ` Erich Focht
  2004-10-03 20:48                     ` Andrew Morton
@ 2004-10-04  3:41                     ` Paul Jackson
  2004-10-04 13:58                     ` Hubertus Franke
  2 siblings, 0 replies; 233+ messages in thread
From: Paul Jackson @ 2004-10-04  3:41 UTC (permalink / raw)
  To: Erich Focht
  Cc: akpm, nagar, ckrm-tech, mbligh, lse-tech, hch, steiner, jbarnes,
	sylvain.jeaugey, djh, linux-kernel, colpatch, Simon.Derr, ak,
	sivanich

Most helpful response, Erich.  Thanks.

>  NEC SX (earth simulator style hardware)

Ah yes - another product that has earned my
affectionate term "big honkin iron".

-- 
                          I won't rest till it's the best ...
                          Programmer, Linux Scalability
                          Paul Jackson <pj@sgi.com> 1.650.933.1373

^ permalink raw reply	[flat|nested] 233+ messages in thread

* Re: [Lse-tech] [PATCH] cpusets - big numa cpu and memory placement
  2004-10-04  0:53                                 ` Paul Jackson
@ 2004-10-04  3:56                                   ` Martin J. Bligh
  2004-10-04  4:24                                     ` Paul Jackson
  0 siblings, 1 reply; 233+ messages in thread
From: Martin J. Bligh @ 2004-10-04  3:56 UTC (permalink / raw)
  To: Paul Jackson
  Cc: pwil3058, frankeh, dipankar, akpm, ckrm-tech, efocht, lse-tech,
	hch, steiner, jbarnes, sylvain.jeaugey, djh, linux-kernel,
	colpatch, Simon.Derr, ak, sivanich

> Martin wrote:
>> (and existing processes forcibly migrated off)
> 
> No can do.  As described in my previous message, everything is happily
> moved already, with some user code (and a CPU_MASK_ALL patch to kthread
> I haven't submitted yet) _except_ for a few per-CPU threads such as the
> migration helpers, which can _not_ be moved off their respective CPUs.

Well, that just means we need to check for things bound to a subset when
we fork it off, i.e. if we have cpus 1,2,3,4 ... and there are

A bound to 1
B bound to 2
C bound to 3
D bound to 4

Then when I fork off exclusive subset for CPUs 1&2, I have to push A & B
into it. You're right, what I said was broken ... but it doesn't seem
hard to fix.
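
Something like the following subset test at cpuset-creation time is all
the check would amount to (a sketch with made-up names, masks modelled
as plain bitmasks):

	/* A task already pinned to a strict subset of the new exclusive
	 * cpuset's CPUs (A on cpu 1, B on cpu 2 above) gets pushed into it;
	 * anything allowed to run elsewhere stays put. */
	static int should_push_into_child(unsigned long task_allowed,
					  unsigned long child_cpus)
	{
		return (task_allowed & ~child_cpus) == 0;
	}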

M.


^ permalink raw reply	[flat|nested] 233+ messages in thread

* Re: [Lse-tech] [PATCH] cpusets - big numa cpu and memory placement
  2004-10-04  3:56                                   ` Martin J. Bligh
@ 2004-10-04  4:24                                     ` Paul Jackson
  2004-10-04 15:03                                       ` Martin J. Bligh
  0 siblings, 1 reply; 233+ messages in thread
From: Paul Jackson @ 2004-10-04  4:24 UTC (permalink / raw)
  To: Martin J. Bligh
  Cc: pwil3058, frankeh, dipankar, akpm, ckrm-tech, efocht, lse-tech,
	hch, steiner, jbarnes, sylvain.jeaugey, djh, linux-kernel,
	colpatch, Simon.Derr, ak, sivanich

Martin wrote:
> Then when I fork off exclusive subset for CPUs 1&2, I have to push A & B
> into it.

Tasks A & B must _not_ be considered members of that exclusive cpuset,
even though it seems that A & B must be attended to by the sched_domain
and memory_domain associated with that cpuset.

The workload managers expect to be able to list the tasks in a cpuset,
so it can hibernate, migrate, kill-off, or wait for the finish of these
tasks.  I've been through this bug before - it was one that cost Hawkes
a long week to debug - I was moving the per-cpu migration threads off
their home CPU because I didn't have a clear way to distinguish tasks
genuinely in a cpuset, from tasks that just happened to be indigenous to
some of the same CPUs.  My essential motivation for adapting a cpuset
implementation that has a task struct pointer to a shared cpuset struct
was to track exactly this relation - which tasks are in which cpuset.

No ... tasks A & B are not allowed in that new exclusive cpuset.

-- 
                          I won't rest till it's the best ...
                          Programmer, Linux Scalability
                          Paul Jackson <pj@sgi.com> 1.650.933.1373

^ permalink raw reply	[flat|nested] 233+ messages in thread

* Re: [Lse-tech] [PATCH] cpusets - big numa cpu and memory placement
  2004-10-04  0:45                               ` Paul Jackson
@ 2004-10-04 11:44                                 ` Rick Lindsley
  2004-10-04 22:46                                   ` [ckrm-tech] " Paul Jackson
  0 siblings, 1 reply; 233+ messages in thread
From: Rick Lindsley @ 2004-10-04 11:44 UTC (permalink / raw)
  To: Paul Jackson
  Cc: Martin J. Bligh, pwil3058, frankeh, dipankar, akpm, ckrm-tech,
	efocht, lse-tech, hch, steiner, jbarnes, sylvain.jeaugey, djh,
	linux-kernel, colpatch, Simon.Derr, ak, sivanich

    I move 'em.  I have user code that identifies the kernel threads
    whose cpus_allowed is a superset of cpus_online_map, and I put them
    in a nice little padded cell with init and the classic Unix daemons,
    called the 'bootcpuset'.

So the examples you gave before were rather oversimplified, then?
You talked about dividing up a 256 cpu machine but didn't mention that
some portion of that must be reserved for the "bootcpuset".  Would this
be enforced by the kernel, or the administrator?

I might suggest a simpler approach.  As a matter of policy, at least one
cpu must remain outside of cpusets so that system processes like init,
getty, lpd, etc. have a place to run.

    The tasks whose cpus_allowed is a strict _subset_ of cpus_online_map
    need to be where they are.  These are things like the migration
    helper threads, one for each cpu.  They get a license to violate
    cpuset boundaries.

Literally, or figuratively?  (How do we recognize these tasks?)

    I will probably end up submitting a patch at some point, that changes
    two lines, one in ____call_usermodehelper() and one in kthread(), from
    setting the cpus_allowed on certain kernel threads to CPU_MASK_ALL,
    so that instead these lines set that cpus_allowed to a new mask,
    a kernel global variable that can be read and written via the cpuset
    api.  But other than that, I don't need anymore kernel hooks than I
    already have, and even now, I can get everything that's causing me
    any grief pinned into the bootcpuset.

Will cpus in exclusive cpusets be asked to service interrupts?

Martin pointed out the problem with looking at overloaded cpus repeatedly,
only to find (repeatedly) we can't steal any of their processes.
This is a real problem, but exists today outside of any cpuset changes.
A decaying failure rate might provide a hint to the scheduler to alleviate
this problem, or maybe the direct route of just checking more thoroughly
from the beginning is the answer.

    So with my bootcpuset, the problem is reduced, to a few tasks
    per CPU, such as the migration threads, which must remain pinned
    on their one CPU (or perhaps on just the CPUs local to one Memory
    Node).  These tasks remain in the root cpuset, which by the scheme
    we're contemplating, doesn't get a sched_domain in the fancier
    configurations.

You just confused me on many different levels:

    * what is the root cpuset? Is this the same as the "bootcpuset" you
      made mention of?

    * so where *do* these tasks go in the "fancier configurations"?

    * what does it mean "not to get a sched_domain"?  That the tasks in
      the root cpuset can't move?  Can't run?  One solution to the
      problem Martin described is to completely split the hierarchy that
      sched_domain represents, with a different, disjoint tree for each
      group of cpus in a cpuset.  But wouldn't changing cpus_allowed
      in every process do the same thing? (Isn't that how this would be
      implemented at the lowest layer?)

I really haven't heard of anything that couldn't be handled adequately
through cpus_allowed so far other than "kicking everybody off a cpu"
which would need some new code.  (Although, probably not, now that I
think of it, with the new hotplug cpu code wanting to do that too.)

    If we just wrote the code, and quit trying to find a grand unifying
    theory to explain it consistently with the rest of our design,
    it would probably work just fine.

I'll assume we're missing a smiley here.

So we want to pin a process to a cpu or set of cpus: set cpus_allowed to
    that cpu or that set of cpus.
So we want its children to be subject to the same restriction: children
    already inherit the cpus_allowed mask of their parent.
We want to keep out everyone who shouldn't be here: then clear the
    bits for the restricted cpus in their cpus_allowed mask when the
    restriction is created.

When you "remove a cpuset" you just OR the right bits back into everybody's
cpus_allowed fields and they start migrating over.
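
That recipe maps directly onto the existing affinity call; a minimal
user-space sketch (the cpu numbers are just an example, and the glibc
wrapper signature shown is the modern one) pinning the current process
-- and therefore its future children -- to cpus 4-7:

	#define _GNU_SOURCE
	#include <sched.h>
	#include <stdio.h>
	#include <unistd.h>

	int main(void)
	{
		cpu_set_t mask;
		int cpu;

		CPU_ZERO(&mask);	/* "set cpus_allowed to that set of cpus" */
		for (cpu = 4; cpu <= 7; cpu++)
			CPU_SET(cpu, &mask);

		if (sched_setaffinity(0, sizeof(mask), &mask) < 0) {
			perror("sched_setaffinity");
			return 1;
		}

		/* children forked from here inherit this cpus_allowed mask */
		printf("pid %d restricted to cpus 4-7\n", (int)getpid());
		return 0;
	}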

To me, this all works for the cpu-intensive, gotta have it with 1% runtime
variation example you gave.  Doesn't it?  And it seems to work for the
department-needs-8-cpus-to-do-as-they-please example too, doesn't it?
The scheduler won't try to move a process to someplace it's not allowed.

Rick

^ permalink raw reply	[flat|nested] 233+ messages in thread

* Re: [Lse-tech] [PATCH] cpusets - big numa cpu and memory placement
  2004-10-03 20:21                   ` Erich Focht
  2004-10-03 20:48                     ` Andrew Morton
  2004-10-04  3:41                     ` Paul Jackson
@ 2004-10-04 13:58                     ` Hubertus Franke
  2004-10-04 14:13                       ` Simon Derr
                                         ` (2 more replies)
  2 siblings, 3 replies; 233+ messages in thread
From: Hubertus Franke @ 2004-10-04 13:58 UTC (permalink / raw)
  To: Erich Focht
  Cc: Paul Jackson, Andrew Morton, nagar, ckrm-tech, mbligh, lse-tech,
	hch, steiner, jbarnes, sylvain.jeaugey, djh, linux-kernel,
	colpatch, Simon.Derr, ak, sivanich



Erich Focht wrote:


> Can cpusets help me/us/Linux to get closer to these requirements?
> 
> A clear yes. Regard cpusets as a new kind of composite resource built
> from memory and CPUs. They can play the role of the resource groups we
> need. Disjoint cpusets can run jobs which will almost never interfere
> cpu-cycle or memory-wise. This can be easily integrated into PBS/LSF
> or whatever batch resource manager comes to your mind. Cpusets
> selected with some knowledge of the NUMA characteristics of a machine
> guarantee always reproducible and best compute performance. If a job
> runs alone in a cpuset it will run as if the machine has been reduced
> to that piece and is owned exclusively by the job. Also if the set
> contains as many CPUs as MPI processes, the cpuset helps getting some
> sort of gang scheduling (i.e. all members of a parallel process get
> cycles at the same time, this reduces barrier synchronisation times,
> improves performance and makes it more predictable). This is something
> one absolutely needs on big machines when dealing with time critical
> highest performance applications. Permanently losing 10% because the
> CPU placement is poor or because one has to get some other process out
> of the way is just unacceptable. When you sell machines for several
> millions 10% performance loss translates to quite some amount of
> money.
> 
> Can CKRM (as it is now) fulfil the requirements?
> 
> I don't think so. CKRM gives me to some extent the confidence that I
> will really use the part of the machine for which I paid, say 50%. But
> it doesn't care about the structure of the machine. CKRM tries giving
> a user as much of the machine as possible, at least the amount he paid
> for. For example: When I come in with my job the machine might be
> already running another job whose user also paid for 50% but was the
> only user and got 100% of the machine (say some Java application with
> enough threads...). This job maybe has filled up most of the memory
> and uses all CPUs. CKRM will take care of getting me cycles (maybe
> exclusively on 50% of the CPUs and will treat my job preferentially
> when allocating memory, but will not care about the placement of the
> CPUs and the memory. Neither will it care whether the previously
> running job is still using my memory blocks and reducing my bandwidth
> to them. So I get 50% of the cycles and the memory but these will be
> BAD CYCLES and BAD MEMORY. My job will run slower than possible and a
> second run will be again different. Don't misunderstand me: CKRM in
> its current state is great for different things and running it inside
> a cpuset sounds like a good thing to do.

You forget that CKRM does NOT violate the constraints set forth by
cpus_allowed masks. So most of your drawbacks described above are simply
not true.
As such it comes back to the question of whether the RCFS
and controller interfaces can be used to set the cpus_allowed masks
in accordance with the current cpuset semantics.
Absolutely we can...

I am certainly not stipulating that cpusets can replace share based 
scheduling or vice versa.

What remains to be discussed is whether CKRM scheduling can be done
within a cpuset. Here are a few questions to be answered:
(a) is it a guarantee/property that cpusets with the same
     parent cpuset do not overlap ?
(b) can we enforce that a certain task class is limited to a cpuset
     and its subsets ?

If we agree or disagree then we can work on a proposal for this.

> 
> What about integration with PBS/LSF and alike?
> 
> It makes sense to let an external resource manager (batch or
> non-batch) keep track of and manage cpusets resources. It can allocate
> them and give them to jobs (exclusively) and delete them. That's
> perfect and exactly what we want. CKRM is a resource manager itself
> and has an own idea about resources. Certainly PBS/LSF/etc. could
> create a CKRM class for each job and run it in this class. The
> difficulty is to avoid the resource managers to interfere and work
> against each other. In such a setup I'd rather expect a batch manager
> to be started inside one CKRM class and let it ensure that e.g. the
> interactive class isn't starved by the batch class.
> 
> Can CKRM be extended to do what cpusets do? 

See above, I think it can be. We need to answer (a) and (b) and then 
define what a share means.

> 
> Certainly. Probably easily. But cpusets will have to be reinvented, I
> guess. Same hooks, same checks, different user interface...
> 

-- Hubertus


^ permalink raw reply	[flat|nested] 233+ messages in thread

* Re: [Lse-tech] [PATCH] cpusets - big numa cpu and memory placement
  2004-10-03 20:48                     ` Andrew Morton
@ 2004-10-04 14:05                       ` Erich Focht
  2004-10-04 14:57                         ` Martin J. Bligh
  0 siblings, 1 reply; 233+ messages in thread
From: Erich Focht @ 2004-10-04 14:05 UTC (permalink / raw)
  To: Andrew Morton
  Cc: pj, nagar, ckrm-tech, mbligh, lse-tech, hch, steiner, jbarnes,
	sylvain.jeaugey, djh, linux-kernel, colpatch, Simon.Derr, ak,
	sivanich

On Sunday 03 October 2004 22:48, Andrew Morton wrote:
> Erich Focht <efocht@hpce.nec.com> wrote:
> >  Can CKRM be extended to do what cpusets do? 
> > 
> >  Certainly. Probably easily. But cpusets will have to be reinvented, I
> >  guess. Same hooks, same checks, different user interface...
> 
> Well if it is indeed the case that the CKRM *framework* is up to the task
> of being used to deliver the cpuset functionality then that's the way we
> should go, no?  It's more work and requires coordination and will deliver
> later, but the eventual implementation will be better.
> 
> But I'm still not 100% confident that the CKRM framework is suitable. 
> Mainly because the CKRM and cpuset teams don't seem to have looked at each
> other's stuff enough yet.

My optimistic assumption that it is easy to build cpusets into CKRM is
only valid for adding a cpuset controller into the CKRM framework and
forgetting about the other controllers. The problems start with the
other controllers... As Hubertus said: CKRM and cpusets are
orthogonal.

Now CKRM consists of a set of more or less independent (orthogonal)
controllers. There is a cpu cycles and memory controller. Their aims
are different from that of cpuset and they cannot fulfil the
requirements of cpusets. But they make sense for themselves.

Adding cpusets as another special resource controller is fine but
breaks the requirement of having independent controllers. With this we
suddenly have two ways of controlling cpu and memory assignment. As
discussed previously in this thread it probably makes more sense to
let the old CKRM controllers manage resources inside each cpuset (at
certain level in the cpusets tree). One could even imagine switching
off the CKRM controllers in particular sets. The old cpucycles and
memory controllers will not be able to influence cycles and memory
distribution outside a cpuset, anyway, because these are hard-limited
by the affinity masks. So adding cpusets into CKRM must lead
to dependent controllers and a hierarchy between them (cpusets being
above the old controllers). This is indeed difficult but Dipankar
mentioned that CKRM people think about such a design (if I interpreted
his email correctly).

If CKRM sticks to the requirement for independent controllers (which
is clean in design and has been demonstrated to work) then it should
maybe first learn to run in an arbitrary cpuset and ignore the rest of
the machine. Having separate CKRM instances running in each partition
of a machine soft-partitioned with cpusets could be a target.

If CKRM wants to be a universal resource controller in the kernel then
a resource dependency tree and hierarchy might need to get somehow
into the CKRM infrastructure. The cpu cycles controller should notice
that there is another controller above it (cpusets) and might ask
that controller which processes it should take into account for its
job. The memory controller might get a different answer... Uhmmm, this
looks like a difficult problem.

Regards,
Erich


^ permalink raw reply	[flat|nested] 233+ messages in thread

* Re: [Lse-tech] [PATCH] cpusets - big numa cpu and memory placement
  2004-10-04 13:58                     ` Hubertus Franke
@ 2004-10-04 14:13                       ` Simon Derr
  2004-10-04 14:15                       ` Erich Focht
  2004-10-04 14:37                       ` Paul Jackson
  2 siblings, 0 replies; 233+ messages in thread
From: Simon Derr @ 2004-10-04 14:13 UTC (permalink / raw)
  To: Hubertus Franke
  Cc: Erich Focht, Paul Jackson, Andrew Morton, nagar, ckrm-tech,
	mbligh, lse-tech, hch, steiner, jbarnes, sylvain.jeaugey, djh,
	linux-kernel, colpatch, Simon.Derr, ak, sivanich

On Mon, 4 Oct 2004, Hubertus Franke wrote:

> What remains to be discussed is whether CKRM scheduling can be done
> within a cpuset. Here are a few questions to be answered:
> (a) is it a guarantee/property that cpusets with the same
>     parent cpuset do not overlap ?

It depends on whether they are 'exclusive' cpusets or not.
In the general case, they may overlap.


^ permalink raw reply	[flat|nested] 233+ messages in thread

* Re: [Lse-tech] [PATCH] cpusets - big numa cpu and memory placement
  2004-10-04 13:58                     ` Hubertus Franke
  2004-10-04 14:13                       ` Simon Derr
@ 2004-10-04 14:15                       ` Erich Focht
  2004-10-04 15:23                         ` Paul Jackson
  2004-10-04 14:37                       ` Paul Jackson
  2 siblings, 1 reply; 233+ messages in thread
From: Erich Focht @ 2004-10-04 14:15 UTC (permalink / raw)
  To: Hubertus Franke
  Cc: Paul Jackson, Andrew Morton, nagar, ckrm-tech, mbligh, lse-tech,
	hch, steiner, jbarnes, sylvain.jeaugey, djh, linux-kernel,
	colpatch, Simon.Derr, ak, sivanich

On Monday 04 October 2004 15:58, Hubertus Franke wrote:
> Erich Focht wrote:
> > Can CKRM (as it is now) fulfil the requirements?
> > 
> > I don't think so. CKRM gives me to some extent the confidence that I
> > will really use the part of the machine for which I paid, say 50%. But
> > it doesn't care about the structure of the machine. CKRM tries giving
> > a user as much of the machine as possible, at least the amount he paid
> > for. For example: When I come in with my job the machine might be
> > already running another job whose user also paid for 50% but was the
> > only user and got 100% of the machine (say some Java application with
> > enough threads...). This job maybe has filled up most of the memory
> > and uses all CPUs. CKRM will take care of getting me cycles (maybe
> > exclusively on 50% of the CPUs and will treat my job preferentially
> > when allocating memory, but will not care about the placement of the
> > CPUs and the memory. Neither will it care whether the previously
> > running job is still using my memory blocks and reducing my bandwidth
> > to them. So I get 50% of the cycles and the memory but these will be
> > BAD CYCLES and BAD MEMORY. My job will run slower than possible and a
> > second run will be again different. Don't misunderstand me: CKRM in
> > its current state is great for different things and running it inside
> > a cpuset sounds like a good thing to do.
> 
> You forget that CKRM does NOT violate the constraints set forth by
> cpus_allowed masks. So most of your drawbacks described above are simply
> not true.

I explicitly assumed that I only use CKRM. This means all processes
have the trivial cpus_allowed mask and are allowed to go wherever they
want. With this assumption (and my understanding of CKRM) the
drawbacks will be there.

Cpusets is my method of choice (for the future) for setting the
cpus_allowed mask (and the mems_allowed). If I use cpusets AND
CKRM together all is fine, of course.

> I am certainly not stipulating that cpusets can replace share based 
> scheduling or vice versa.
> 
> What remains to be discussed is whether CKRM scheduling can be done
> within a cpuset. Here are a few questions to be answered:
> (a) is it a guarantee/property that cpusets with the same
>      parent cpuset do not overlap ?

Right now it isn't AFAIK. Paul, if all cpusets on the same level are
disjoint this certainly simplifies life. Would this be too strong a
limitation for you? We could live with it.

> (b) can we enforce that a certain task class is limited to a cpuset
>      and its subsets.

That is intended, yes. A task escaping from its set would be a
security (or denial of service) risk.

Regards,
Erich


^ permalink raw reply	[flat|nested] 233+ messages in thread

* Re: [Lse-tech] [PATCH] cpusets - big numa cpu and memory placement
  2004-10-04 13:58                     ` Hubertus Franke
  2004-10-04 14:13                       ` Simon Derr
  2004-10-04 14:15                       ` Erich Focht
@ 2004-10-04 14:37                       ` Paul Jackson
  2 siblings, 0 replies; 233+ messages in thread
From: Paul Jackson @ 2004-10-04 14:37 UTC (permalink / raw)
  To: Hubertus Franke
  Cc: efocht, akpm, nagar, ckrm-tech, mbligh, lse-tech, hch, steiner,
	jbarnes, sylvain.jeaugey, djh, linux-kernel, colpatch,
	Simon.Derr, ak, sivanich

Erich wrote:
> > Can CKRM (as it is now) fulfil the requirements?
> ...
> [CKRM] doesn't care about the structure of the machine

Hubertus wrote:
> You forget that CKRM does NOT violate ... cpus_allowed ...
> ...
> In order to allow CKRM scheduling within a cpuset ...

I sense a disconnect here.

Seems to me Erich was asking if CKRM could be used _instead_ of cpusets,
and observes that, for now at least, CKRM lacks something.

Seems to me Hubertus is, _in_ _part_, responding to the question of
whether CKRM can be used _within_ cpusets, and claims to be taking a
position opposite to Erich's, protesting that indeed CKRM can be used
within cpusets - CKRM doesn't violate cpus_allowed constraints.

Hubertus - I didn't realize that Erich considered that question, nor did
I realize he took that position.

Unfortunately, the plot thickens.  Hubertus goes on, it seems, to consider
other questions, and I start to lose the thread of his thought.  Such
questions as:

 - can RCFS/controllers set cpus_allowed as do cpusets?
	[ beware that there's more to cpusets than setting cpus_allowed ]
 - can cpusets replace share-based scheduling?
 - can share-based scheduling replace cpusets?
 - can CKRM scheduling be allowed within cpusets?
 - are sibling cpusets exclusive?
	[ yes - if the exclusive property is set on them ]
 - can we enforce that a certain task class is limited to a cpuset subtree?

By now I'm thoroughly confused.  Fortunately, Hubertus concludes:

  - If we agree or disagree then we can work on a proposal for this.

Well, since I'm pretty sure from my Logic 101 class that we agree or
disagree, this is good news.  I'm glad to hear we can work on a proposal
on this [ what was 'this' again ...? ;) ]

One thing I am sure of ... either one of Hubertus or myself needs another
cup of coffee, or both Hubertus and I need to have a beer together.

-- 
                          I won't rest till it's the best ...
                          Programmer, Linux Scalability
                          Paul Jackson <pj@sgi.com> 1.650.933.1373

^ permalink raw reply	[flat|nested] 233+ messages in thread

* Re: [Lse-tech] [PATCH] cpusets - big numa cpu and memory placement
  2004-10-04 14:05                       ` Erich Focht
@ 2004-10-04 14:57                         ` Martin J. Bligh
  2004-10-04 15:30                           ` Paul Jackson
                                             ` (2 more replies)
  0 siblings, 3 replies; 233+ messages in thread
From: Martin J. Bligh @ 2004-10-04 14:57 UTC (permalink / raw)
  To: Erich Focht, Andrew Morton
  Cc: pj, nagar, ckrm-tech, lse-tech, hch, steiner, jbarnes,
	sylvain.jeaugey, djh, linux-kernel, colpatch, Simon.Derr, ak,
	sivanich

> My optimistic assumption that it is easy to build cpusets into CKRM is
> only valid for adding a cpuset controller into the CKRM framework and
> forgetting about the other controllers. The problems start with the
> other controllers... As Hubertus said: CKRM and cpusets are
> orthogonal.
> 
> Now CKRM consists of a set of more or less independent (orthogonal)
> controllers. There is a cpu cycles and memory controller. Their aims
> are different from that of cpuset and they cannot fulfil the
> requirements of cpusets. But they make sense for themselves.
 ...

> If CKRM wants to be a universal resource controller in the kernel then
> a resource dependency tree and hierarchy might need to get somehow
> into the CKRM infrastructure. The cpu cycles controller should notice
> that there is another controller above it (cpusets) and might ask
> that controller which processes it should take into account for its
> job. The memory controller might get a different answer... Uhmmm, this
> looks like a difficult problem.

I see that the two mechanisms could have conflicting requirements. But
surely this is the case whether we merge the two into one integrated
system, or try to run CKRM and cpusets independently at the same time? 
I'd think the problems would be easier to tackle if the systems knew
about each other, and talked to each other.

I don't think anyone is suggesting that either system as is could replace
the other ... more that a combined system could be made for both types
of resource control that would be a better overall solution.

M.


^ permalink raw reply	[flat|nested] 233+ messages in thread

* Re: [Lse-tech] [PATCH] cpusets - big numa cpu and memory placement
  2004-10-04  4:24                                     ` Paul Jackson
@ 2004-10-04 15:03                                       ` Martin J. Bligh
  2004-10-04 15:53                                         ` [ckrm-tech] " Paul Jackson
  2004-10-05  9:26                                         ` Simon Derr
  0 siblings, 2 replies; 233+ messages in thread
From: Martin J. Bligh @ 2004-10-04 15:03 UTC (permalink / raw)
  To: Paul Jackson
  Cc: pwil3058, frankeh, dipankar, akpm, ckrm-tech, efocht, lse-tech,
	hch, steiner, jbarnes, sylvain.jeaugey, djh, linux-kernel,
	colpatch, Simon.Derr, ak, sivanich

--Paul Jackson <pj@sgi.com> wrote (on Sunday, October 03, 2004 21:24:52 -0700):

> Martin wrote:
>> Then when I fork off exclusive subset for CPUs 1&2, I have to push A & B
>> into it.
> 
> Tasks A & B must _not_ be considered members of that exclusive cpuset,
> even though it seems that A & B must be attended to by the sched_domain
> and memory_domain associated with that cpuset.
> 
> The workload managers expect to be able to list the tasks in a cpuset,
> so it can hibernate, migrate, kill-off, or wait for the finish of these
> tasks.  I've been through this bug before - it was one that cost Hawkes
> a long week to debug - I was moving the per-cpu migration threads off
> their home CPU because I didn't have a clear way to distinguish tasks
> genuinely in a cpuset, from tasks that just happened to be indigenous to
> some of the same CPUs.  My essential motivation for adapting a cpuset
> implementation that has a task struct pointer to a shared cpuset struct
> was to track exactly this relation - which tasks are in which cpuset.
> 
> No ... tasks A & B are not allowed in that new exclusive cpuset.

OK, then your "exclusive" cpusets aren't really exclusive at all, since
they have other stuff running in them. The fact that you may institute
the stuff early enough to avoid most things falling into this doesn't
really solve the problems, AFAICS. 

Or perhaps we end up with cpusets alpha and beta that you created, and we create
parallel cpusets that operate on the same sched_domain tree to contain the
other random stuff.

Kind of "cpu groups" and "task groups", where you can have multiple task
groups running on the same cpu group (or subset thereof), but not overlapping 
different cpu groups. Then we can have one sched domain setup per cpu group,
or at least the top level entry in the main sched domain tree. This way the
scheduler might have a hope of working within this system efficiently ;-)

M.


^ permalink raw reply	[flat|nested] 233+ messages in thread

* Re: [Lse-tech] [PATCH] cpusets - big numa cpu and memory placement
  2004-10-04 14:15                       ` Erich Focht
@ 2004-10-04 15:23                         ` Paul Jackson
  0 siblings, 0 replies; 233+ messages in thread
From: Paul Jackson @ 2004-10-04 15:23 UTC (permalink / raw)
  To: Erich Focht
  Cc: frankeh, akpm, nagar, ckrm-tech, mbligh, lse-tech, hch, steiner,
	jbarnes, sylvain.jeaugey, djh, linux-kernel, colpatch,
	Simon.Derr, ak, sivanich

Erich, responding to Hubertus:
> > (a) is it a guarantee/property that cpusets with the same
> >      parent cpuset do not overlap ?
> 
> Right now it isn't AFAIK. Paul, if all cpusets on the same level are
> disjoint this certainly simplifies life. Would this be too strong a
> limitation for you? We could live with it.

Correct, Erich, it is not a guarantee that sibling cpusets don't
overlap, unless, as Simon noted, they are all marked exclusive.

Yes, it would be a stronger limitation than I would agree to, but that's
ok, because in my humble opinion, CKRM doesn't need it to operate within
cpusets.

I think what's needed for CKRM to operate within cpusets is clear
ownership.

Each instance of CKRM needs (tell me if I'm wrong here):
 1) to have a clear and unambiguous answer to the question of
    which CPUs, which Memory Nodes, and which Tasks it is
    controlling,
 2) no overlap of these sets with another instance of CKRM,
 3) the CPUs and Memory Nodes on which any of these Tasks are
    allowed to run must be a subset of those controlled by
    this instance of CKRM, and
 4) all Tasks allowed to run on any of the CPUs and Memory
    Nodes controlled by this CKRM instance are in the list
    of Tasks this CKRM knows it controls.

In short - each CKRM instance needs clear, unambiguous, non-overlapping
ownership of all it surveys.

Requesting that all cpusets be marked exclusive for both CPU and Memory
is an overzealous precondition for the above.

Another way to obtain the above requirements would be to assign each
CKRM instance to a separate cpuset subtree, where the root of the
subtree is marked exclusive for cpu and memory, where that CKRM instance
controls all CPUs and Memory owned by that subtree and all Tasks
attached to any cpuset in that subtree, and where any tasks attached to
ancestors of the root are either (1) not allowed to use any of the CPUs
and Memory assigned to the subtree, or (2) are both [2a] allowed to use
only some subset of the CPUs and Memory assigned to the subtree and [2b]
are included in the list of tasks to be managed by that CKRM instance.

(The last 4.5 lines above are the special case required to handle the
indigenous per-cpu tasks, such as the migration threads - sorry.)

-- 
                          I won't rest till it's the best ...
                          Programmer, Linux Scalability
                          Paul Jackson <pj@sgi.com> 1.650.933.1373

^ permalink raw reply	[flat|nested] 233+ messages in thread

* Re: [Lse-tech] [PATCH] cpusets - big numa cpu and memory placement
  2004-10-04 14:57                         ` Martin J. Bligh
@ 2004-10-04 15:30                           ` Paul Jackson
  2004-10-04 15:41                             ` Martin J. Bligh
  2004-10-04 15:38                           ` Paul Jackson
  2004-10-04 16:46                           ` Paul Jackson
  2 siblings, 1 reply; 233+ messages in thread
From: Paul Jackson @ 2004-10-04 15:30 UTC (permalink / raw)
  To: Martin J. Bligh
  Cc: efocht, akpm, nagar, ckrm-tech, lse-tech, hch, steiner, jbarnes,
	sylvain.jeaugey, djh, linux-kernel, colpatch, Simon.Derr, ak,
	sivanich

Martin wrote:
> I don't think anyone is suggesting that either system as is could replace
> the other ...

I'm pretty sure Andrew was suggesting this.

He began this thread addressing me with the statement:
> 
> And CKRM is much more general than the cpu/memsets code, and hence it
> should be possible to realize your end-users requirements using an
> appropriately modified CKRM, and a suitable controller.

-- 
                          I won't rest till it's the best ...
                          Programmer, Linux Scalability
                          Paul Jackson <pj@sgi.com> 1.650.933.1373

^ permalink raw reply	[flat|nested] 233+ messages in thread

* Re: [Lse-tech] [PATCH] cpusets - big numa cpu and memory placement
  2004-10-04 14:57                         ` Martin J. Bligh
  2004-10-04 15:30                           ` Paul Jackson
@ 2004-10-04 15:38                           ` Paul Jackson
  2004-10-04 16:46                           ` Paul Jackson
  2 siblings, 0 replies; 233+ messages in thread
From: Paul Jackson @ 2004-10-04 15:38 UTC (permalink / raw)
  To: Martin J. Bligh
  Cc: efocht, akpm, nagar, ckrm-tech, lse-tech, hch, steiner, jbarnes,
	sylvain.jeaugey, djh, linux-kernel, colpatch, Simon.Derr, ak,
	sivanich

Martin wrote:
> I'd think the problems would be easier to tackle if the systems knew
> about each other, and talked to each other.

Clear boundaries should be enough.  If each instance of CKRM is assured
that it has control of some subset of a system that's separate and
non-overlapping, with all Memory, CPU, Tasks, and Allowed masks of said
Tasks either wholly owned by that CKRM instance, or entirely outside,
then that should do it, right?

-- 
                          I won't rest till it's the best ...
                          Programmer, Linux Scalability
                          Paul Jackson <pj@sgi.com> 1.650.933.1373

^ permalink raw reply	[flat|nested] 233+ messages in thread

* Re: [Lse-tech] [PATCH] cpusets - big numa cpu and memory placement
  2004-10-04 15:30                           ` Paul Jackson
@ 2004-10-04 15:41                             ` Martin J. Bligh
  2004-10-04 16:02                               ` Paul Jackson
  0 siblings, 1 reply; 233+ messages in thread
From: Martin J. Bligh @ 2004-10-04 15:41 UTC (permalink / raw)
  To: Paul Jackson
  Cc: efocht, akpm, nagar, ckrm-tech, lse-tech, hch, steiner, jbarnes,
	sylvain.jeaugey, djh, linux-kernel, colpatch, Simon.Derr, ak,
	sivanich

> Martin wrote:
>> I don't think anyone is suggesting that either system as is could replace
>> the other ...
> 
> I'm pretty sure Andrew was suggesting this.
> 
> He began this thread addressing me with the statement:
>> 
>> And CKRM is much more general than the cpu/memsets code, and hence it
>> should be possible to realize your end-users requirements using an

Note especially the last line:

>> appropriately modified CKRM, and a suitable controller.

So not CKRM as-is ...

M.


^ permalink raw reply	[flat|nested] 233+ messages in thread

* Re: [ckrm-tech] Re: [Lse-tech] [PATCH] cpusets - big numa cpu and memory placement
  2004-10-04 15:03                                       ` Martin J. Bligh
@ 2004-10-04 15:53                                         ` Paul Jackson
  2004-10-04 18:17                                           ` Martin J. Bligh
  2004-10-05  9:26                                         ` Simon Derr
  1 sibling, 1 reply; 233+ messages in thread
From: Paul Jackson @ 2004-10-04 15:53 UTC (permalink / raw)
  To: Martin J. Bligh
  Cc: pwil3058, frankeh, dipankar, akpm, ckrm-tech, efocht, lse-tech,
	hch, steiner, jbarnes, sylvain.jeaugey, djh, linux-kernel,
	colpatch, Simon.Derr, ak, sivanich

Martin writes:
> OK, then your "exclusive" cpusets aren't really exclusive at all, since
> they have other stuff running in them.

What's clear is that 'exclusive' is not a sufficient precondition for
whatever it is that CKRM needs to have sufficient control.

Instead of trying to wrestle 'exclusive' into doing what you want, do me
a favor, if you would.  Help me figure out what conditions CKRM _does_
need to operate within a cpuset, and we'll invent a new property that
satisfies those conditions.

See my earlier posts in the last hour for my efforts to figure out what
these conditions might be.  I conjecture that it's something along the
lines of:

    Assuring each CKRM instance that it has control of some
    subset of a system that's separate and non-overlapping,
    with all Memory, CPU, Tasks, and Allowed masks of said
    Tasks either wholly owned by that CKRM instance, or
    entirely outside.

-- 
                          I won't rest till it's the best ...
                          Programmer, Linux Scalability
                          Paul Jackson <pj@sgi.com> 1.650.933.1373

^ permalink raw reply	[flat|nested] 233+ messages in thread

* Re: [Lse-tech] [PATCH] cpusets - big numa cpu and memory placement
  2004-10-04 15:41                             ` Martin J. Bligh
@ 2004-10-04 16:02                               ` Paul Jackson
  2004-10-04 18:19                                 ` Martin J. Bligh
  0 siblings, 1 reply; 233+ messages in thread
From: Paul Jackson @ 2004-10-04 16:02 UTC (permalink / raw)
  To: Martin J. Bligh
  Cc: efocht, akpm, nagar, ckrm-tech, lse-tech, hch, steiner, jbarnes,
	sylvain.jeaugey, djh, linux-kernel, colpatch, Simon.Derr, ak,
	sivanich

Martin, quoting Andrew:
> >> appropriately modified CKRM, and a suitable controller.
> 
> So not CKRM as-is ...

Yes - by now we all agree that CKRM as it is doesn't provide some things
that cpusets provides (though of course CKRM provides much more that
cpusets doesn't.)

Andrew would ask, if I am channeling him correctly, how about CKRM as it
could be?  What would it take to modify CKRM so that it could subsume
(embrace and replace) cpusets, meeting all the requirements that in the
end we agreed were essential for cpusets to meet, rendering cpusets
redundant and no longer needed?

-- 
                          I won't rest till it's the best ...
                          Programmer, Linux Scalability
                          Paul Jackson <pj@sgi.com> 1.650.933.1373

^ permalink raw reply	[flat|nested] 233+ messages in thread

* Re: [Lse-tech] [PATCH] cpusets - big numa cpu and memory placement
  2004-10-04 14:57                         ` Martin J. Bligh
  2004-10-04 15:30                           ` Paul Jackson
  2004-10-04 15:38                           ` Paul Jackson
@ 2004-10-04 16:46                           ` Paul Jackson
  2 siblings, 0 replies; 233+ messages in thread
From: Paul Jackson @ 2004-10-04 16:46 UTC (permalink / raw)
  To: Martin J. Bligh
  Cc: efocht, akpm, nagar, ckrm-tech, lse-tech, hch, steiner, jbarnes,
	sylvain.jeaugey, djh, linux-kernel, colpatch, Simon.Derr, ak,
	sivanich

Martin wrote:
> I don't think anyone is suggesting that either system as is could replace
> the other ... more that a combined system could be made for both types
> of resource control that would be a better overall solution.

Oops - sorry, Martin.  I obviously didn't read your entire sentence
before objecting before.

Now that I do, it makes sense.

-- 
                          I won't rest till it's the best ...
                          Programmer, Linux Scalability
                          Paul Jackson <pj@sgi.com> 1.650.933.1373

^ permalink raw reply	[flat|nested] 233+ messages in thread

* Re: [ckrm-tech] Re: [Lse-tech] [PATCH] cpusets - big numa cpu and memory placement
  2004-10-04 15:53                                         ` [ckrm-tech] " Paul Jackson
@ 2004-10-04 18:17                                           ` Martin J. Bligh
  2004-10-04 20:25                                             ` Paul Jackson
  0 siblings, 1 reply; 233+ messages in thread
From: Martin J. Bligh @ 2004-10-04 18:17 UTC (permalink / raw)
  To: Paul Jackson
  Cc: pwil3058, frankeh, dipankar, akpm, ckrm-tech, efocht, lse-tech,
	hch, steiner, jbarnes, sylvain.jeaugey, djh, linux-kernel,
	colpatch, Simon.Derr, ak, sivanich

--On Monday, October 04, 2004 08:53:27 -0700 Paul Jackson <pj@sgi.com> wrote:

> Martin writes:
>> OK, then your "exclusive" cpusets aren't really exclusive at all, since
>> they have other stuff running in them.
> 
> What's clear is that 'exclusive' is not a sufficient precondition for
> whatever it is that CKRM needs to have sufficient control.
> 
> Instead of trying to wrestle 'exclusive' into doing what you want, do me
> a favor, if you would.  Help me figure out what conditions CKRM _does_
> need to operate within a cpuset, and we'll invent a new property that
> satisfies those conditions.

Oh, I'm not even there yet ... just thinking about what cpusets needs
independently to operate efficiently - I don't think cpus_allowed is efficient.

Whatever we call it, the resource management system definitely needs the 
ability to isolate a set of resources (CPUs, RAM) totally dedicated to
one class or group of processes. That's what I see as the main feature
of cpusets right now, though there may be other things there as well that
I've missed? At least that's the main feature I personally see a need for ;-)
 
> See my earlier posts in the last hour for my efforts to figure out what
> these conditions might be.  I conjecture that it's something along the
> lines of:
> 
>     Assuring each CKRM instance that it has control of some
>     subset of a system that's separate and non-overlapping,
>     with all Memory, CPU, Tasks, and Allowed masks of said
>     Tasks either wholly owned by that CKRM instance, or
>     entirely outside.

Mmm. Looks like you're trying to do multiple CKRMs, one inside each cpuset,
right? Not sure that's the way I'd go, but maybe it makes sense.

The way I'm looking at it, which is probably wholly insufficient, if not
downright wrong, we have multiple process groups, each of which gets some 
set of resources. Those resources may be dedicated to that class (a la 
cpusets) or not. One could view this as a set of resource groupings, and
set of process groupings, where one or more process groupings is bound to
a resource grouping.

The resources are cpus & memory, mainly, in my mind (though I guess IO,
etc fit too). The resource sets are more like cpusets, and the process
groups a bit more like CKRM, except they seem to overlap (to me) when
the sets in cpusets are non-exclusive, or when CKRM wants harder performance
guarantees.

Feel free to point out where I'm full of shit / missing the point ;-)

M.


^ permalink raw reply	[flat|nested] 233+ messages in thread

* Re: [Lse-tech] [PATCH] cpusets - big numa cpu and memory placement
  2004-10-04 16:02                               ` Paul Jackson
@ 2004-10-04 18:19                                 ` Martin J. Bligh
  2004-10-04 18:29                                   ` Paul Jackson
  0 siblings, 1 reply; 233+ messages in thread
From: Martin J. Bligh @ 2004-10-04 18:19 UTC (permalink / raw)
  To: Paul Jackson
  Cc: efocht, akpm, nagar, ckrm-tech, lse-tech, hch, steiner, jbarnes,
	sylvain.jeaugey, djh, linux-kernel, colpatch, Simon.Derr, ak,
	sivanich

--On Monday, October 04, 2004 09:02:32 -0700 Paul Jackson <pj@sgi.com> wrote:

> Martin, quoting Andrew:
>> >> appropriately modified CKRM, and a suitable controller.
>> 
>> So not CKRM as-is ...
> 
> Yes - by now we all agree that CKRM as it is doesn't provide some things
> that cpusets provides (though of course CKRM provides much more that
> cpusets doesn't.)
> 
> Andrew would ask, if I am channeling him correctly, how about CKRM as it
> could be?  What would it take to modify CKRM so that it could subsume
> (embrace and replace) cpusets, meeting all the requirements that in the
> end we agreed were essential for cpusets to meet, rendering cpusets
> redundant and no longer needed?

Well, or just merge the two somehow into one cohesive system, I'd think.
One doesn't need to completely subsume the other ;-)

M.


^ permalink raw reply	[flat|nested] 233+ messages in thread

* Re: [Lse-tech] [PATCH] cpusets - big numa cpu and memory placement
  2004-10-04 18:19                                 ` Martin J. Bligh
@ 2004-10-04 18:29                                   ` Paul Jackson
  0 siblings, 0 replies; 233+ messages in thread
From: Paul Jackson @ 2004-10-04 18:29 UTC (permalink / raw)
  To: Martin J. Bligh
  Cc: efocht, akpm, nagar, ckrm-tech, lse-tech, hch, steiner, jbarnes,
	sylvain.jeaugey, djh, linux-kernel, colpatch, Simon.Derr, ak,
	sivanich

Martin writes:
>
> One doesn't need to completely subsume the other ;-)

Well, close to it.

It's not a marriage of equals in his challenge:
>
> And CKRM is much more general than the cpu/memsets code, and hence it
> should be possible to realize your end-users requirements using an
> appropriately modified CKRM, and a suitable controller.

-- 
                          I won't rest till it's the best ...
                          Programmer, Linux Scalability
                          Paul Jackson <pj@sgi.com> 1.650.933.1373

^ permalink raw reply	[flat|nested] 233+ messages in thread

* Re: [ckrm-tech] Re: [Lse-tech] [PATCH] cpusets - big numa cpu and memory placement
  2004-10-04 18:17                                           ` Martin J. Bligh
@ 2004-10-04 20:25                                             ` Paul Jackson
  2004-10-04 22:15                                               ` Martin J. Bligh
  0 siblings, 1 reply; 233+ messages in thread
From: Paul Jackson @ 2004-10-04 20:25 UTC (permalink / raw)
  To: Martin J. Bligh
  Cc: pwil3058, frankeh, dipankar, akpm, ckrm-tech, efocht, lse-tech,
	hch, steiner, jbarnes, sylvain.jeaugey, djh, linux-kernel,
	colpatch, Simon.Derr, ak, sivanich

Martin wrote:
> Mmm. Looks like you're trying to do multiple CKRMs, one inside each cpuset,
> right? Not sure that's the way I'd go, but maybe it makes sense.

No - I was just reflecting my lack of adequate understanding of CKRM.

You guys were trying to get certain semantics out of cpusets to meet
your needs, putting words in my mouth as to what things like "exclusive"
meant, and I was pushing back, trying to get a fair, implementation
neutral statement of just what it was that CKRM needed out of cpusets,
by in part phrasing things in terms of what I thought you were trying to
have CKRM do with cpusets.  Turns out I speak CKRM substantially worse
than you guys speak cpusets. <grin>

So nevermind what I was trying to do, which was, as you guessed:
> 
> Looks like you're trying to do multiple CKRMs, one inside each cpuset,

Let me try again to see if I can figure out what you're trying to do.

You write:
>
> The way I'm looking at it, which is probably wholly insufficient, if not
> downright wrong, we have multiple process groups, each of which gets some 
> set of resources. Those resources may be dedicated to that class (a la 
> cpusets) or not. One could view this as a set of resource groupings, and
> set of process groupings, where one or more process groupings is bound to
> a resource grouping.
> 
> The resources are cpus & memory, mainly, in my mind (though I guess IO,
> etc fit too). The resource sets are more like cpusets, and the process
> groups a bit more like CKRM, except they seem to overlap (to me) when
> the sets in cpusets are non-exclusive, or when CKRM wants harder performance
> guarantees.

I can understand it far enough to see groups of processes using groups
of resources (cpus & memory, like cpusets).  Both of the phrases
containing "CKRM" in them go right past ... whizz.  And I'm a little
fuzzy on what are the sets, invariants, relations, domains, ranges,
operations, pre and post conditions and such that could be modeled in a
more precise manner.

Keep talking ...  Perhaps an example, along the lines of my "use case
scenarios", would help.  When we start losing each other trying to
generalize too fast, it can help to make up an overly concrete example,
to get things grounded again.


> Whatever we call it, the resource management system definitely needs the 
> ability to isolate a set of resources (CPUs, RAM) totally dedicated to
> one class or group of processes.

Not always "totally isolated and dedicated".

Here's a scenario that shows up some uses for "non-exclusive" cpusets.

Let's take my big 256 CPU system, divided into portions of 128, 64 and
64. At this level, these are three, mutually exclusive cpusets, and
interaction between them is minimized.  In the first two portions, the
128 and the first 64, a couple of "company jewel" applications run.
These are highly tuned, highly parallel applications that are sucking up
99% of every CPU cycle, bus cycle, cache line and memory page available,
for hours on end, in a closely synchronized dance.  They cannot tolerate
anything else interfering in their area.  Frankly, they have little use
for CKRM, fancy schedulers or sophisticated allocators.  They know
what's there, it's all theirs, and they know exactly what they want to
do with it.  Get out of the way and let them do their job.  Industrial
strength computing at its finest.

Ok that much is as before.

Now the last portion, the second 64, is more of a general use area. It
is less fully utilized, and its job mix more varied and less tightly
administered.  There's some 64-thread background application that puts a
fairly light load on things, running day and night (maybe the V.P. of
the MIS shop is a fan of SETI).

Since this is a parallel programming shop, people show up at random
hours with smaller parallel jobs, carve off temporary cpusets of the
appropriate size, and run an application in them.  Their threads and
memory within their temporary cpuset are carefully placed, relative to
their cpuset, but they are not fully utilizing the nodes on which they
are running and they tolerate other things happening on the same nodes. 
Perhaps the other stuff doesn't impact their performance much, or
perhaps they are too poor to pay for dedicated nodes (grad students
still looking for a grant?) ... whatever.

They may well make good use of a batch manager, to which they submit
jobs of a specified size (cpus and memory) so that the batch manager can
smooth out the load and avoid periods of excess idling or thrashing.
The implementation of the batch manager relies heavily on the underlying
cpuset facility to manage various subsets of CPU and Memory Nodes.  The
batch manager might own the first 192 CPUs on the system too, but most
users never get to see that part of the system.

Within that last 64 portion the current mechanisms, including the per
task cpus_allowed and mems_allowed, and the current schedulers and
allocators, may well be doing a pretty good job.  Sure, there is an
element of chaos and things aren't perfect.  It's the "usual" timeshare
environment with a varied load mix.

The enforced placement within the smaller nested non-exclusive cpusets
probably surprises the scheduler and allocator at times, leading to
unfair imbalances.  I imagine that if CKRM just had that last 64 portion
to manage, and this was just a 64 CPU system, not a 256, then CKRM could
do a pretty good job of managing the system's resources.

Enough of this story ...

-- 
                          I won't rest till it's the best ...
                          Programmer, Linux Scalability
                          Paul Jackson <pj@sgi.com> 1.650.933.1373

^ permalink raw reply	[flat|nested] 233+ messages in thread

* Re: [ckrm-tech] Re: [Lse-tech] [PATCH] cpusets - big numa cpu and memory placement
  2004-10-04 20:25                                             ` Paul Jackson
@ 2004-10-04 22:15                                               ` Martin J. Bligh
  2004-10-05  9:17                                                 ` Paul Jackson
  0 siblings, 1 reply; 233+ messages in thread
From: Martin J. Bligh @ 2004-10-04 22:15 UTC (permalink / raw)
  To: Paul Jackson
  Cc: pwil3058, frankeh, dipankar, akpm, ckrm-tech, efocht, lse-tech,
	hch, steiner, jbarnes, sylvain.jeaugey, djh, linux-kernel,
	colpatch, Simon.Derr, ak, sivanich

>> The way I'm looking at it, which is probably wholly insufficient, if not
>> downright wrong, we have multiple process groups, each of which gets some 
>> set of resources. Those resources may be dedicated to that class (a la 
>> cpusets) or not. One could view this as a set of resource groupings, and
>> set of process groupings, where one or more process groupings is bound to
>> a resource grouping.
>> 
>> The resources are cpus & memory, mainly, in my mind (though I guess IO,
>> etc fit too). The resource sets are more like cpusets, and the process
>> groups a bit more like CKRM, except they seem to overlap (to me) when
>> the sets in cpusets are non-exclusive, or when CKRM wants harder performance
>> guarantees.
> 
> I can understand it far enough to see groups of processes using groups
> of resources (cpus & memory, like cpusets).  Both of the phrases
> containing "CKRM" in them go right past ... whizz.  And I'm a little
> fuzzy on what are the sets, invariants, relations, domains, ranges,
> operations, pre and post conditions and such that could be modeled in a
> more precise manner.
> 
> Keep talking ...  Perhaps an example, along the lines of my "use case
> scenarios", would help.  When we start losing each other trying to
> generalize too fast, it can help to make up an overly concrete example,
> to get things grounded again.

Let me make one thing clear: I don't work on CKRM ;-) So I'm not either
desperately familiar with it, or partial to it. Nor am I desperately
infatuated enough with my employer to believe  that just because they're
involved with it, it must be stunningly brilliant. So I think I'm actually
fairly impartial ... and balanced in ignorance on both sides ;-)

I do think both things are solving perfectly valid problems (that IMO
intersect) ... not sure whether either is doing it the best way though ;-).

>> Whatever we call it, the resource management system definitely needs the 
>> ability to isolate a set of resources (CPUs, RAM) totally dedicated to
>> one class or group of processes.
> 
> Not always "totally isolated and dedicated".
> 
> Here's a scenario that shows up some uses for "non-exclusive" cpusets.
> 
> Let's take my big 256 CPU system, divided into portions of 128, 64 and
> 64. At this level, these are three, mutually exclusive cpusets, and
> interaction between them is minimized.  In the first two portions, the
> 128 and the first 64, a couple of "company jewel" applications run.
> These are highly tuned, highly parallel applications that are sucking up
> 99% of every CPU cycle, bus cycle, cache line and memory page available,
> for hours on end, in a closely synchronized dance.  They cannot tolerate
> anything else interfering in their area.  Frankly, they have little use
> for CKRM, fancy schedulers or sophisticated allocators.  They know
> what's there, it's all theirs, and they know exactly what they want to
> do with it.  Get out of the way and let them do their job.  Industrial
> strength computing at its finest.
> 
> Ok that much is as before.
> 
> Now the last portion, the second 64, is more of a general use area. It
> is less fully utilized, and its job mix more varied and less tightly
> administered.  There's some 64-thread background application that puts a
> fairly light load on things, running day and night (maybe the V.P. of
> the MIS shop is a fan of SETI).
> 
> Since this is a parallel programming shop, people show up at random
> hours with smaller parallel jobs, carve off temporary cpusets of the
> appropriate size, and run an application in them.  Their threads and
> memory within their temporary cpuset are carefully placed, relative to
> their cpuset, but they are not fully utilizing the nodes on which they
> are running and they tolerate other things happening on the same nodes. 
> Perhaps the other stuff doesn't impact their performance much, or
> perhaps they are too poor to pay for dedicated nodes (grad students
> still looking for a grant?) ... whatever.

OK, the dedicated stuff in cpusets makes a lot of sense to me, for the
reasons you describe above. One screaming problem we have at the moment
is we can easily say "I want to bind myself to CPU X" but have no way to say
"kick everyone else off it". That seems like a very real problem.

However, the non-dedicated stuff seems much more debatable, and where
the overlap with CKRM stuff seems possible to me. Do the people showing
up at random with smaller parallel jobs REALLY, REALLY care about the
physical layout of the machine? I suspect not, it's not the highly tuned
syncopated rhythm stuff you describe above. The "give me 1.5 CPUs worth
of bandwidth please" model of CKRM makes much more sense to me.
 
> They may well make good use of a batch manager, to which they submit
> jobs of a specified size (cpus and memory) so that the batch manager can
> smooth out the load and avoid periods of excess idling or thrashing.
> The implementation of the batch manager relies heavily on the underlying
> cpuset facility to manage various subsets of CPU and Memory Nodes.  The
> batch manager might own the first 192 CPUs on the system too, but most
> users never get to see that part of the system.
> 
> Within that last 64 portion the current mechanisms, including the per
> task cpus_allowed and mems_allowed, and the current schedulers and
> allocators, may well be doing a pretty good job.  Sure, there is an
> element of chaos and things aren't perfect.  It's the "usual" timeshare
> environment with a varied load mix.
> 
> The enforced placement within the smaller nested non-exclusive cpusets
> probably surprises the scheduler and allocator at times, leading to
> unfair imbalances.  I imagine that if CKRM just had that last 64 portion
> to manage, and this was just a 64 CPU system, not a 256, then CKRM could
> do a pretty good job of managing the system's resources.

Right - exactly. Sounds like we're actually pretty much on the same page
(by the time I'd finished your email ;-)). So whatever the interface we
have, the underlying mechanisms seem to have two fundamentals: dedicated
and non-dedicated resources. cpusets seems to do a good job of dedicated
and I'd argue the interface of specifying physical resources is a bit
clunky for non-dedicated stuff. CKRM doesn't seem to tackle the dedicated
at all, but seems to have an easier way of doing the non-dedicated.

So personally what I'd like is to have a unified interface (and I care 
not a hoot which, or a new one altogether), that can specify dedicated
or non-dedicated resources for groups of processes, and then have a
"cpusets-style" mechanism for the dedicated, and "CKRM-style" mechanism
for the non-dedicated. Not sure if that's exactly what Andrew was hoping
for, or the rest of you either ;-)

The whole discussion about multiple sched-domains, etc, we had earlier
is kind of just an implementation thing, but it's a crapload easier to do
something efficient here if the bits caring about that stuff are only
dealing with dedicated resource partitions.

OK, now my email is getting as long as yours, so I'll stop ;-) ;-)

M.

^ permalink raw reply	[flat|nested] 233+ messages in thread

* Re: [ckrm-tech] Re: [Lse-tech] [PATCH] cpusets - big numa cpu and memory placement
  2004-10-04 11:44                                 ` Rick Lindsley
@ 2004-10-04 22:46                                   ` Paul Jackson
  0 siblings, 0 replies; 233+ messages in thread
From: Paul Jackson @ 2004-10-04 22:46 UTC (permalink / raw)
  To: Rick Lindsley
  Cc: mbligh, pwil3058, frankeh, dipankar, akpm, ckrm-tech, efocht,
	lse-tech, hch, steiner, jbarnes, sylvain.jeaugey, djh,
	linux-kernel, colpatch, Simon.Derr, ak, sivanich

Good questions - thanks.

Rick wrote:
> So the examples you gave before were rather oversimplified, then?

Yes - they were.  Quite intentionally.

> some portion of that must be reserved for the "bootcpuset".  Would this
> be enforced by the kernel, or the administrator?

It's administrative.  You don't have to run your system this way.  The
kernel threads (both per-cpu and system-wide), as well as init and the
classic Unix daemons, can be left running in the root cpuset (see below
for what that is).  The kernel doesn't care.

It was the additional request for a CKRM friendly setup that led me to
point out that system-wide kernel threads could be confined to a
"bootcpuset".  Since bootcpuset is user level stuff, I hadn't mentioned
it before, on the kernel mailing list.

The more common reason for confining such kthreads and Unix daemons to a
bootcpuset is to minimize interactions between such tasks and important
applications.

> I might suggest a simpler approach.  As a matter of policy, at least one
> cpu must remain outside of cpusets so that system processes like init,
> getty, lpd, etc. have a place to run.

This is the same thing, in different words.  In my current cpuset
implementation, _every_ task is attached to a cpuset.

What you call a cpu that "remains outside of cpusets" is the bootcpuset,
in my terms.

>     The tasks whose cpus_allowed is a strict _subset_ of cpus_online_map
>     need to be where they are.  These are things like the migration
>     helper threads, one for each cpu.  They get a license to violate
>     cpuset boundaries.
> 
> Literally, or figuratively?  (How do we recognize these tasks?)

I stated one critical word too vaguely.  Let me restate (s/tasks/kernel
threads/), then translate.


>     The kernel threads whose cpus_allowed is a strict _subset_ of cpus_online_map
>     need to be where they are.  These are things like the migration
>     helper threads, one for each cpu.  They get a license to violate
>     cpuset boundaries.

> Literally, or figuratively?  (How do we recognize these tasks?)

Literally.  The early (_very_ early) user level code that sets up the
bootcpuset, as requested by a configuration file in /etc, moves the
kthreads with a cpus_allowed >= what's online to the bootcpuset, but
leaves the kthreads with a cpus_allowed < online where they are, in the
root cpuset.

If you do a "ps -efl", look for the tasks early in the list whose
command names end in something like "/2" (printf format "/%u").  These
are the kthreads that usually need to be pinned on a CPU.

But you don't need to do that - an early boot user utility does it
as part of setting up the bootcpuset.
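
For the curious, the test that utility applies is roughly the following
(a user level sketch only, not the actual tool; the /dev/cpuset/boot path
is made up, and it assumes online CPUs are numbered 0..N-1):

#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/types.h>
#include <unistd.h>

/*
 * Sketch: is this task's cpus_allowed a strict subset of the online
 * CPUs?  If so, leave it where it is; if it covers them all, the
 * bootcpuset tool would move it, e.g. by writing its pid to a tasks
 * file such as /dev/cpuset/boot/tasks (path made up for this example).
 */
int main(int argc, char **argv)
{
	pid_t pid = argc > 1 ? atoi(argv[1]) : 1;
	long cpu, online = sysconf(_SC_NPROCESSORS_ONLN);
	cpu_set_t mask;

	if (sched_getaffinity(pid, sizeof(mask), &mask) < 0) {
		perror("sched_getaffinity");
		return 2;
	}
	for (cpu = 0; cpu < online; cpu++)
		if (!CPU_ISSET(cpu, &mask)) {
			printf("%d is pinned - leave it be\n", (int)pid);
			return 0;
		}
	printf("%d roams all online cpus - move it to the bootcpuset\n", (int)pid);
	return 0;
}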

> Will cpus in exclusive cpusets be asked to service interrupts?

The current cpuset implementation makes no effort to manage interrupts. 
To manage interrupts in relation to cpusets today, you'd have to use
some other means to control or determine where interrupts were going,
and then place your cpusets with that in mind.

>     So with my bootcpuset, the problem is reduced, to a few tasks
>     per CPU, such as the migration threads, which must remain pinned
>     on their one CPU (or perhaps on just the CPUs local to one Memory
>     Node).  These tasks remain in the root cpuset, which by the scheme
>     we're contemplating, doesn't get a sched_domain in the fancier
>     configurations.
> 
> You just confused me on many different levels:
> 
>     * what is the root cpuset? Is this the same as the "bootcpuset" you
>       made mention of?

Not the same.

The root cpuset is the all encompassing cpuset representing the entire
system, from which all other cpusets are formed.  The root cpuset always
contains all CPUs and all Memory Nodes.

The bootcpuset is typically a small cpuset, a direct child of the root
cpuset, containing what would be in your terms the one or a few cpus
that are reserved for the classic Unix system processes like init,
getty, lpd, etc.

>    * so where *do* these tasks go in the "fancier configurations"?

Er eh - in the root cpuset ;).  Hmmm ... guess that's not your question.

In this fancy configuration, I had the few kthreads that could _not_
be moved to the bootcpuset, because they had to remain pinned on
specific CPUs (e.g. the migration threads), remain in the root cpuset.

When the exclusive child cpusets were formed, and each given their own
special scheduler domain, I rebound the scheduler domain to use for
these per-cpu kthreads to whichever scheduler domain managed the cpu
that thread lived on.  The thread remained in the root cpuset, but
hitched a ride on the scheduler that had assumed control of the cpu that
the thread lived on.  Everything in this paragraph is something I
invented in the last two days, in response to various requests from
others for setups that provided a clear boundary of control to
schedulers.

>     If we just wrote the code, and quit trying to find a grand unifying
>     theory to explain it consistently with the rest of our design,
>     it would probably work just fine.
> 
> I'll assume we're missing a smiley here.

Not really.  The per-cpu kthreads are a wart that doesn't fit the
particular design being discussed here very well.  Warts happen.

> When you "remove a cpuset" you just or in the right bits in everybody's
> cpus_allowed fields and they start migrating over.
> 
> To me, this all works for the cpu-intensive, gotta have it with 1% runtime
> variation example you gave.  Doesn't it?  And it seems to work for the
> department-needs-8-cpus-to-do-as-they-please example too, doesn't it?

What you're saying is rather like saying I don't need a file system
on my floppy disk.  Well, originally, I didn't.  I wrote the bytes
to my tape cassette, I read them back.  What's the problem?  If I
wanted to name the bytes, I stuck a label on the cassette and wrote
a note on the label.

Yes, that works.  As systems get bigger, and as we add batch managers
and such to handle a more complicated set of jobs, we need to be able
to do things like:
   * name sets of CPUs/Memory, in a way consistent across the system
   * create and destroy a set
   * control who can query, modify and attach a set
   * change which set a task is attached to
   * list which tasks are currently attached to a set
   * query, set and change which CPUs and Memory are in a set.

This is like needing a FAT file system for your floppy.  Cpusets
join the collection of "first class, kernel managed" objects,
and are no longer just the implied attributes of each task.
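
To make that list concrete, here is roughly what those operations look
like against the /dev/cpuset interface (a sketch only; it assumes the
per-cpuset files are named "cpus", "mems" and "tasks" as in the cpuset
patch, and the cpuset name, CPU/Memory lists and pid are made up):

#include <stdio.h>
#include <sys/stat.h>
#include <sys/types.h>

/* Sketch of the operations listed above, against a mounted /dev/cpuset.
 * Error handling omitted; "batch1", "4-7", "1" and pid 1234 are
 * made-up example values. */
static void put(const char *path, const char *val)
{
	FILE *f = fopen(path, "w");

	if (f) {
		fputs(val, f);
		fclose(f);
	}
}

int main(void)
{
	mkdir("/dev/cpuset/batch1", 0755);	 /* create a named set             */
	put("/dev/cpuset/batch1/cpus", "4-7");	 /* CPUs, in the range-list format */
	put("/dev/cpuset/batch1/mems", "1");	 /* Memory Nodes in the set        */
	put("/dev/cpuset/batch1/tasks", "1234"); /* attach task 1234 to the set    */
	/* "cat tasks" lists the attached tasks; chmod/chown on the    */
	/* directory controls who may query or modify; rmdir of an     */
	/* empty directory destroys the set.                           */
	return 0;
}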

Batch managers and sysadmins of more complex, dynamically changing
configurations, sometimes on very large systems that are shared across
several departments or divisions, depend on this ability to treat
cpusets as first class, named, kernel managed objects.

-- 
                          I won't rest till it's the best ...
                          Programmer, Linux Scalability
                          Paul Jackson <pj@sgi.com> 1.650.933.1373

^ permalink raw reply	[flat|nested] 233+ messages in thread

* Re: [ckrm-tech] Re: [Lse-tech] [PATCH] cpusets - big numa cpu and memory placement
  2004-10-02 23:44                         ` Hubertus Franke
  2004-10-03  0:00                           ` Peter Williams
  2004-10-03  3:44                           ` Paul Jackson
@ 2004-10-05  3:13                           ` Matthew Helsley
  2004-10-05  8:30                             ` Hubertus Franke
  2 siblings, 1 reply; 233+ messages in thread
From: Matthew Helsley @ 2004-10-05  3:13 UTC (permalink / raw)
  To: Hubertus Franke
  Cc: Peter Williams, dipankar, Paul Jackson, Andrew Morton, CKRM-Tech,
	efocht, Martin Bligh, lse-tech, hch, steiner, jbarnes,
	sylvain.jeaugey, djh, linux-kernel, Matthew Dobson, Simon.Derr,
	ak, sivanich

On Sat, 2004-10-02 at 16:44, Hubertus Franke wrote:
<snip>
> along cpuset boundaries. If taskclasses are allowed to span disjoint
> cpumemsets, what is then the definition of setting shares ?
<snip>

	I think the clearest interpretation is the share ratios are the same
but the quantity of "real" resources and the sum of shares allocated is
different depending on cpuset.

	For example, suppose we have taskclass/A that spans cpusets Foo and Bar
-- processes foo and bar are members of taskclass/A but in cpusets Foo
and Bar respectively. Both get up to 50% share of cpu time in their
respective cpusets because they are in taskclass/A. Further suppose that
cpuset Foo has 1 CPU and cpuset Bar has 2 CPUs.

	This means process foo could consume up to half a CPU while process bar
could consume up to a whole CPU. In order to enforce cpuset
partitioning, each class would then have to track its share usage on a
per-cpuset basis. [Otherwise share allocation in one partition could
prevent share allocation in another partition. Using the example above,
suppose process foo is using 45% of CPU in cpuset Foo. If the total
share consumption is calculated across cpusets process bar would only be
able to consume up to 5% of CPU in cpuset Bar.]

Cheers,
	-Matt Helsley


^ permalink raw reply	[flat|nested] 233+ messages in thread

* Re: [ckrm-tech] Re: [Lse-tech] [PATCH] cpusets - big numa cpu and memory placement
  2004-10-05  3:13                           ` [ckrm-tech] " Matthew Helsley
@ 2004-10-05  8:30                             ` Hubertus Franke
  2004-10-05 14:20                               ` Paul Jackson
  0 siblings, 1 reply; 233+ messages in thread
From: Hubertus Franke @ 2004-10-05  8:30 UTC (permalink / raw)
  To: Matthew Helsley
  Cc: Peter Williams, dipankar, Paul Jackson, Andrew Morton, CKRM-Tech,
	efocht, Martin Bligh, lse-tech, hch, steiner, jbarnes,
	sylvain.jeaugey, djh, linux-kernel, Matthew Dobson, Simon.Derr,
	ak, sivanich



Matthew Helsley wrote:

> On Sat, 2004-10-02 at 16:44, Hubertus Franke wrote:
> <snip>
> 
>>along cpuset boundaries. If taskclasses are allowed to span disjoint
>>cpumemsets, what is then the definition of setting shares ?
> 
> <snip>
> 
> 	I think the clearest interpretation is the share ratios are the same
> but the quantity of "real" resources and the sum of shares allocated is
> different depending on cpuset.
> 
> 	For example, suppose we have taskclass/A that spans cpusets Foo and Bar
> -- processes foo and bar are members of taskclass/A but in cpusets Foo
> and Bar respectively. Both get up to 50% share of cpu time in their
> respective cpusets because they are in taskclass/A. Further suppose that
> cpuset Foo has 1 CPU and cpuset Bar has 2 CPUs.

Yes, we (Shailabh and I) were talking about exactly that this
afternoon. This would mean that the denominator of the cpu shares for a
given class <cls> is not determined solely by the parent's
total_guarantee but by:
    total_guarantee * size(cls->parent->cpuset) / size(cls->cpuset)

This is effectively what you describe below.
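
A toy calculation (plain user code, not CKRM) of what that scaling gives
for your Foo/Bar example, assuming the parent cpuset holds the three
CPUs of Foo plus Bar combined and total_guarantee is 100:

#include <stdio.h>

/* Toy numbers only: taskclass/A has a guarantee of 50 out of a
 * total_guarantee of 100 (the 50% share above); the parent cpuset
 * is assumed to hold the 3 CPUs of Foo plus Bar combined. */
int main(void)
{
	double total_guarantee = 100, guarantee = 50, parent_cpus = 3;
	double set_cpus[] = { 1, 2 };		/* cpuset Foo, cpuset Bar */
	const char *name[] = { "Foo", "Bar" };
	int i;

	for (i = 0; i < 2; i++) {
		double denom = total_guarantee * parent_cpus / set_cpus[i];

		printf("in cpuset %s: up to %.1f CPUs\n",
		       name[i], guarantee / denom * parent_cpus);
	}
	return 0;	/* prints 0.5 CPUs for Foo, 1.0 CPUs for Bar */
}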

> 
> 	This means process foo could consume up to half a CPU while process bar
> could consume up to a whole CPU. In order to enforce cpuset
> partitioning, each class would then have to track its share usage on a
> per-cpuset basis. [Otherwise share allocation in one partition could
> prevent share allocation in another partition. Using the example above,
> suppose process foo is using 45% of CPU in cpuset Foo. If the total
> share consumption is calculated across cpusets process bar would only be
> able to consume up to 5% of CPU in cpuset Bar.]
> 

This would require some changes in the CPU scheduler to teach the 
cpu-monitor to deal with the limited scope. It would also require some
mods to the API: since classes can span different cpusets with different
shares, how do we address the cpu share of a class in the particular
context of a cpuset?
Alternatively, one could require that classes cannot span different
cpusets, which would significantly reduce the complexity of this.

> Cheers,
> 	-Matt Helsley


^ permalink raw reply	[flat|nested] 233+ messages in thread

* Re: [ckrm-tech] Re: [Lse-tech] [PATCH] cpusets - big numa cpu and memory placement
  2004-10-04 22:15                                               ` Martin J. Bligh
@ 2004-10-05  9:17                                                 ` Paul Jackson
  2004-10-05 10:01                                                   ` Paul Jackson
  2004-10-05 22:24                                                   ` Matthew Dobson
  0 siblings, 2 replies; 233+ messages in thread
From: Paul Jackson @ 2004-10-05  9:17 UTC (permalink / raw)
  To: Martin J. Bligh
  Cc: pwil3058, frankeh, dipankar, akpm, ckrm-tech, efocht, lse-tech,
	hch, steiner, jbarnes, sylvain.jeaugey, djh, linux-kernel,
	colpatch, Simon.Derr, ak, sivanich

Martin wrote:
> Let me make one thing clear: I don't work on CKRM ;-) 

ok ...

Indeed, unless I'm not recognizing someone's expertise properly, there
seems to be a shortage of the CKRM experts on this thread.

Who am I missing ...

> However, the non-dedicated stuff seems much more debateable, and where
> the overlap with CKRM stuff seems possible to me. Do the people showing
> up at random with smaller parallel jobs REALLY, REALLY care about the
> physical layout of the machine? I suspect not, it's not the highly tuned
> syncopated rhythm stuff you describe above. The "give me 1.5 CPUs worth
> of bandwidth please" model of CKRM makes much more sense to me.

It will vary.  In shops that are doing a lot of highly parallel work,
such as with OpenMP or MPI, many smaller parallel jobs will also be
placement sensitive.  The performance of such jobs is hugely sensitive
to their placement and scheduling on dedicated CPUs and Memory, one per
active thread.

These shops will often use a batch scheduler or workload manager, such
as PBS or LSF to manage their jobs.  PBS and LSF make a business of
defining various sized cpusets to fit the queued jobs, and running each
job in a dedicated cpuset.  Their value comes from obtaining high
utilization, and optimum repeatable runtimes, on a varied input job
stream, especially of placement sensitive jobs.  The feature set of
cpusets was driven as much as anything by what was required to support a
port of PBS or LSF.

> I'd argue the interface of specifying physical resources is a bit
> clunky for non-dedicated stuff.

Likely so - the interface is expected to be wrapped with a user level
'cpuset' library, which converts it to a 'C' friendly model.  And that
in turn is expected to be wrapped with a port of LSF or PBS, which
converts placement back to something that the customer finds familiar
and useful for managing their varied job mix.

I don't expect admins at HPC shops to spend much time poking around the
/dev/cpuset file system, though it is a nice way to look around and
figure out how things work.

The /dev/cpuset pseudo file system api was chosen because it was
convenient for small scale work, learning and experimentation, because
it was a natural for the hierarchical name space with permissions that I
required, and because it was convenient to leverage existing vfs
structure in the kernel.

> So personally what I'd like is to have a unified interface
> ...
> Not sure if that's exactly what Andrew was hoping
> for, or the rest of you either ;-)

Well, not what I'm pushing for, that's for sure.

We really have two different mechanisms here:

  1) A placement mechanism, explicitly specifying what CPUs and Memory
     Nodes are allowed, and
  2) A sharing mechanism, specifying what proportion of fungible
     resources such as cpu cycles, page faults and i/o requests a
     particular subset (class) of the user population is to receive.

If you look at the very lowest level hooks for cpusets and CKRM, you
will see the essential difference:

  1) cpusets hooks the scheduler to prohibit scheduling on a CPU that
     is not allowed, and the allocator to prohibit obtaining memory
     on a Node that is not allowed.
  2) CKRM hooks these and other places to throttle tasks by inserting
     small delays, so as to obtain the requested share or percentage,
     per class of user, of the rate of usage of fungible resources.

The specific details which must be passed back and forth across the
boundary between the kernel and user-space for these two mechanisms are
simply different.  One controls which of a list of enumerable finite
non-substitutable resources may or may not be used, and the other
controls what share of other anonymous, fungible resources may be used.

Looking for a unified interface is a false economy in my view, and I
am suspicious that such a search reflects a failure to recognize the
essential differences between the two mechanisms.

> The whole discussion about multiple sched-domains, etc, we had earlier
> is kind of just an implementation thing, but is a crapload easier to do
> something efficient here if the bits caring about that stuff are only
> dealing with dedicated resource partitions.

Yes - much easier.  I suspect that someday I will have to add to cpusets
the ability to provide, for select cpusets, the additional guarantees
(sole and exclusive ownership of all the CPUs, Memory Nodes, Tasks and
affinity masks therein) which a scheduler or allocator that's trying to
be smart requires to avoid going crazy.  Not all cpusets need this - but
those cpusets which define the scope of a scheduler or allocator domain
would sure like it.  Whatever my exclusive flag means now, I'm sure we
all agree that it is too weak to meet this particular requirement.

> OK, now my email is getting as long as yours, so I'll stop ;-) ;-)

That would be tragic indeed.  Good thing you stopped.

-- 
                          I won't rest till it's the best ...
                          Programmer, Linux Scalability
                          Paul Jackson <pj@sgi.com> 1.650.933.1373

^ permalink raw reply	[flat|nested] 233+ messages in thread

* Re: [Lse-tech] [PATCH] cpusets - big numa cpu and memory placement
  2004-10-04 15:03                                       ` Martin J. Bligh
  2004-10-04 15:53                                         ` [ckrm-tech] " Paul Jackson
@ 2004-10-05  9:26                                         ` Simon Derr
  2004-10-05  9:58                                           ` Paul Jackson
                                                             ` (2 more replies)
  1 sibling, 3 replies; 233+ messages in thread
From: Simon Derr @ 2004-10-05  9:26 UTC (permalink / raw)
  To: Martin J. Bligh
  Cc: Paul Jackson, pwil3058, frankeh, dipankar, akpm, ckrm-tech,
	efocht, lse-tech, hch, steiner, jbarnes, sylvain.jeaugey, djh,
	linux-kernel, colpatch, Simon.Derr, ak, sivanich

On Mon, 4 Oct 2004, Martin J. Bligh wrote:

> OK, then your "exclusive" cpusets aren't really exclusive at all, since
> they have other stuff running in them. The fact that you may institute
> the stuff early enough to avoid most things falling into this doesn't
> really solve the problems, AFAICS. 

I'd like to explain at this point what the original reason was for
having exclusive (called strict, at that point in history) and
non-exclusive cpusets.

The idea was to have a system, and run all jobs on it through a batch 
scheduler. Some jobs cared about performance, some didn't.

The ones who cared about performance got an 'exclusive' cpuset, the ones 
who didn't got a 'non exclusive' cpuset.

Now there is a possibility that, at a given time, only 'exclusive' jobs 
are running, and hence that 'exclusive' cpusets have been created for jobs 
on all the CPUs.

Our system (at Bull) is both a big and a small machine:
-big:   we have NUMA constraints.
-small: we don't have enough CPUs to spare one, we need to use ALL CPUs 
for our jobs.

There are still processes running outside the job cpusets (i.e. in the root
cpuset), such as sshd and the batch scheduler. These tasks use a low amount of CPU,
so it is okay if they happen to run inside even 'exclusive' cpusets. For 
us, 'exclusive' only means that no other CPU-hungry job is going to share 
our CPU.

Of course, in our case, a valid argument is that 'exclusiveness' should 
not be enforced by the kernel but rather by the job scheduler. Probably.

But now I see that the discussion is going towards:
-fully exclusive cpusets, maybe even with no interrupt handling
-maybe only allow exclusive cpusets, since non-exclusive cpusets are 
tricky wrt CKRM.

That would be a no-go for us.


	Simon.


^ permalink raw reply	[flat|nested] 233+ messages in thread

* Re: [Lse-tech] [PATCH] cpusets - big numa cpu and memory placement
  2004-10-05  9:26                                         ` Simon Derr
@ 2004-10-05  9:58                                           ` Paul Jackson
  2004-10-05 19:34                                           ` Martin J. Bligh
  2004-10-05 22:33                                           ` Matthew Dobson
  2 siblings, 0 replies; 233+ messages in thread
From: Paul Jackson @ 2004-10-05  9:58 UTC (permalink / raw)
  To: Simon Derr
  Cc: mbligh, pwil3058, frankeh, dipankar, akpm, ckrm-tech, efocht,
	lse-tech, hch, steiner, jbarnes, sylvain.jeaugey, djh,
	linux-kernel, colpatch, Simon.Derr, ak, sivanich

Simon wrote:
> But now I see that the discussion is going towards:
> -fully exclusive cpusets, maybe even with no interrupt handling
> -maybe only allow exclusive cpusets, since non-exclusive cpusets are 
> tricky wrt CKRM.
> 
> That would be a no-go for us.

I'm with you there, Simon.  Not all cpusets should be exclusive.

It is reasonable for domain-capable schedulers, allocators and
resource managers (domain aware CKRM?) to require that any domain
they manage correspond to an exclusive cpuset, for some value
of exclusive stronger than now.

Less exclusive cpusets just wouldn't qualify for their own
scheduler, allocator or resource manager domains.

-- 
                          I won't rest till it's the best ...
                          Programmer, Linux Scalability
                          Paul Jackson <pj@sgi.com> 1.650.933.1373

^ permalink raw reply	[flat|nested] 233+ messages in thread

* Re: [ckrm-tech] Re: [Lse-tech] [PATCH] cpusets - big numa cpu and memory placement
  2004-10-05  9:17                                                 ` Paul Jackson
@ 2004-10-05 10:01                                                   ` Paul Jackson
  2004-10-05 22:24                                                   ` Matthew Dobson
  1 sibling, 0 replies; 233+ messages in thread
From: Paul Jackson @ 2004-10-05 10:01 UTC (permalink / raw)
  To: Paul Jackson
  Cc: mbligh, pwil3058, frankeh, dipankar, akpm, ckrm-tech, efocht,
	lse-tech, hch, steiner, jbarnes, sylvain.jeaugey, djh,
	linux-kernel, colpatch, Simon.Derr, ak, sivanich

> Who am I missing ...

Oops - hi, Hubertus ;).

-- 
                          I won't rest till it's the best ...
                          Programmer, Linux Scalability
                          Paul Jackson <pj@sgi.com> 1.650.933.1373

^ permalink raw reply	[flat|nested] 233+ messages in thread

* Re: [ckrm-tech] Re: [Lse-tech] [PATCH] cpusets - big numa cpu and memory placement
  2004-10-05  8:30                             ` Hubertus Franke
@ 2004-10-05 14:20                               ` Paul Jackson
  0 siblings, 0 replies; 233+ messages in thread
From: Paul Jackson @ 2004-10-05 14:20 UTC (permalink / raw)
  To: Hubertus Franke
  Cc: matthltc, pwil3058, dipankar, akpm, ckrm-tech, efocht, mbligh,
	lse-tech, hch, steiner, jbarnes, sylvain.jeaugey, djh,
	linux-kernel, colpatch, Simon.Derr, ak, sivanich

Hubertus writes:
> Since classes can span different cpu sets with different shares
> how do we address the cpushare of a class in the particular context
> of a cpu-set.
> Alternatively, one could require that classes can not span different
> cpu-sets, which would significantly reduce the complexity of this.

It's not just cpusets that set a task's cpus_allowed ...

Lets say we have a 16 thread OpenMP application, running on a cpuset of
16 CPUs on a large system, one thread pinned to each CPU of the 16 using
sched_setaffinity, running exclusively there.  Which means that there
are perhaps eight tasks pinned on each of those 16 CPUs, the one OpenMP
thread, and perhaps seven indigenous per-cpu kernel threads:
    migration, ksoftirq, events, kblockd, aio, xfslogd and xfsdatad
(using what happens to be on a random 2.6 Altix in front of me).
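
(The pinning itself is just the usual affinity call; a trivial sketch,
with the cpu number taken from the command line:)

#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>
#include <stdlib.h>

/* Sketch: pin the calling task to the single CPU named on the command
 * line, the way each of the 16 OpenMP worker threads above is pinned. */
int main(int argc, char **argv)
{
	cpu_set_t mask;
	int cpu = argc > 1 ? atoi(argv[1]) : 0;

	CPU_ZERO(&mask);
	CPU_SET(cpu, &mask);
	if (sched_setaffinity(0, sizeof(mask), &mask) < 0) {
		perror("sched_setaffinity");
		return 1;
	}
	printf("pinned to cpu %d\n", cpu);
	return 0;
}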

Then the class(es) containing the eight tasks on any given one of these
CPUs would be required to not contain any other tasks outside of those
eight, by your reduced complexity alternative, right?

On whom/what would this requirement be imposed?  Hopefully some CKRM
classification would figure this out and handle the classification
automatically.

What of the couple of "mother" tasks in this OpenMP application, which
are in this same 16 CPU cpuset, probably pinned to all 16 of the CPUs,
instead of to any individual one of them?  What are the requirements on
the classes to which these tasks belong, in relation to the above
classes for the per-cpu kthreads and per-cpu OpenMP threads?  And on
what person/software is the job of adapting to these requirements
imposed?

Observe by the way that so long as:
 1) the per-cpu OpenMP threads each get to use 99+% of their
    respective CPUs,
 2) CKRM didn't impose any constraints or work on anything else

then what CKRM does here doesn't matter.

-- 
                          I won't rest till it's the best ...
                          Programmer, Linux Scalability
                          Paul Jackson <pj@sgi.com> 1.650.933.1373

^ permalink raw reply	[flat|nested] 233+ messages in thread

* Re: [Lse-tech] [PATCH] cpusets - big numa cpu and memory placement
  2004-10-05  9:26                                         ` Simon Derr
  2004-10-05  9:58                                           ` Paul Jackson
@ 2004-10-05 19:34                                           ` Martin J. Bligh
  2004-10-06  0:28                                             ` Paul Jackson
  2004-10-05 22:33                                           ` Matthew Dobson
  2 siblings, 1 reply; 233+ messages in thread
From: Martin J. Bligh @ 2004-10-05 19:34 UTC (permalink / raw)
  To: Simon Derr
  Cc: Paul Jackson, pwil3058, frankeh, dipankar, akpm, ckrm-tech,
	efocht, lse-tech, hch, steiner, jbarnes, sylvain.jeaugey, djh,
	linux-kernel, colpatch, ak, sivanich

> The idea was to have a system, and run all jobs on it through a batch 
> scheduler. Some jobs cared about performance, some didn't.
> 
> The ones who cared about performance got an 'exclusive' cpuset, the ones 
> who didn't got a 'non exclusive' cpuset.

OK, makes sense. Thanks for that.
 
> Of course, in our case, a valid argument is that 'exclusiveness' should 
> not be enforced by the kernel but rather by the job scheduler. Probably.
> 
> But now I see that the discussion is going towards:
> -fully exclusive cpusets, maybe even with no interrupts handling
> -maybe only allow exclusive cpusets, since non-exclusive cpusets are 
> tricky wrt CKRM.

Nope - personally I see us more headed for the exclusive cpusets, and
handle the non-exclusive stuff via a more CKRM-style mechanism. Which
I still think achieves what you need, though perhaps not in exactly the
fashion you envisioned.

M.


^ permalink raw reply	[flat|nested] 233+ messages in thread

* Re: [Lse-tech] [PATCH] cpusets - big numa cpu and memory placement
  2004-10-03 23:53                             ` Martin J. Bligh
  2004-10-04  0:02                               ` Martin J. Bligh
  2004-10-04  0:45                               ` Paul Jackson
@ 2004-10-05 22:19                               ` Matthew Dobson
  2004-10-06  2:39                                 ` Paul Jackson
                                                   ` (3 more replies)
  2 siblings, 4 replies; 233+ messages in thread
From: Matthew Dobson @ 2004-10-05 22:19 UTC (permalink / raw)
  To: Martin J. Bligh
  Cc: Paul Jackson, pwil3058, frankeh, dipankar, Andrew Morton,
	ckrm-tech, efocht, LSE Tech, hch, steiner, Jesse Barnes,
	sylvain.jeaugey, djh, LKML, Simon.Derr, Andi Kleen, sivanich

On Sun, 2004-10-03 at 16:53, Martin J. Bligh wrote:
> > Martin wrote:
> >> Matt had proposed having a separate sched_domain tree for each cpuset, which
> >> made a lot of sense, but seemed harder to do in practice because "exclusive"
> >> in cpusets doesn't really mean exclusive at all.
> > 
> > See my comments on this from yesterday on this thread.
> > 
> > I suspect we don't want a distinct sched_domain for each cpuset, but
> > rather a sched_domain for each of several entire subtrees of the cpuset
> > hierarchy, such that every CPU is in exactly one such sched domain, even
> > though it be in several cpusets in that sched_domain.
> 
> Mmmm. The fundamental problem I think we ran across (just whilst pondering,
> not in code) was that some things (eg ... init) are bound to ALL cpus (or
> no cpus, depending how you word it); i.e. they're created before the cpusets
> are, and are a member of the grand-top-level-uber-master-thingummy.
> 
> How do you service such processes? That's what I meant by the exclusive
> domains aren't really exclusive. 
> 
> Perhaps Matt can recall the problems better. I really liked his idea, aside
> from the small problem that it didn't seem to work ;-)

Well that doesn't seem like a fair statement.  It's potentially true,
but it's really hard to say without an implementation! ;)

I think that the idea behind cpusets is really good, essentially
creating isolated areas of CPUs and memory for tasks to run
undisturbed.  I feel that the actual implementation, however, is taking
a wrong approach, because it attempts to use the cpus_allowed mask to
override the scheduler in the general case.  cpus_allowed, in my
estimation, is meant to be used as the exception, not the rule.  If we
wish to change that, we need to make the scheduler more aware of it, so
it can do the right thing(tm) in the presence of numerous tasks with
varying cpus_allowed masks.  The other option is to implement cpusets in
a way that doesn't use cpus_allowed.  That is the option that I am
pursuing.  

My idea is to make sched_domains much more flexible and dynamic.  By
adding locking and reference counting, and simplifying the way in which
sched_domains are created, linked, unlinked and eventually destroyed we
can use sched_domains as the implementation of cpusets.  IA64 already
allows multiple sched_domains trees without a shared top-level domain. 
My proposal is to make this functionality more generally available. 
Extending the "isolated domains" concept a little further will buy us
most (all?) the functionality of "exclusive" cpusets without the need to
use cpus_allowed at all.
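
Just to sketch the flavor of that (hand-drawn here, not the actual code;
the struct and field names below are made up):

#include <linux/sched.h>
#include <linux/spinlock.h>
#include <asm/atomic.h>

/*
 * Hand-drawn sketch only, not the RFC code: one way the "locking and
 * reference counting" above might look, so that a dynamically built
 * domain can be linked to by several users and torn down only when
 * the last reference is dropped.
 */
struct cpuset_domain {
	struct sched_domain	sd;	/* the existing 2.6 scheduler domain  */
	atomic_t		refcnt;	/* current users of this domain       */
	spinlock_t		lock;	/* guards link / unlink / teardown    */
};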

I've got some code.  I'm in the midst of pushing it forward to rc3-mm2. 
I'll post an RFC later today or tomorrow when it's cleaned up.

-Matt


^ permalink raw reply	[flat|nested] 233+ messages in thread

* Re: [ckrm-tech] Re: [Lse-tech] [PATCH] cpusets - big numa cpu and memory placement
  2004-10-05  9:17                                                 ` Paul Jackson
  2004-10-05 10:01                                                   ` Paul Jackson
@ 2004-10-05 22:24                                                   ` Matthew Dobson
  1 sibling, 0 replies; 233+ messages in thread
From: Matthew Dobson @ 2004-10-05 22:24 UTC (permalink / raw)
  To: Paul Jackson
  Cc: Martin J. Bligh, pwil3058, frankeh, dipankar, Andrew Morton,
	ckrm-tech, efocht, LSE Tech, hch, steiner, Jesse Barnes,
	sylvain.jeaugey, djh, LKML, Simon.Derr, Andi Kleen, sivanich

On Tue, 2004-10-05 at 02:17, Paul Jackson wrote:
> The /dev/cpuset pseudo file system api was chosen because it was
> convenient for small scale work, learning and experimentation, because
> it was a natural for the hierarchical name space with permissions that I
> required, and because it was convenient to leverage existing vfs
> structure in the kernel.

I really like the /dev/cpuset FS.  I would like to leverage most of that
code to be the user level interface to creating, linking & destroying
sched_domains at some point.  This, of course, is assuming that the
dynamic sched_domains concept meets with something less than catcalls
and jeers... ;)

-Matt


^ permalink raw reply	[flat|nested] 233+ messages in thread

* Re: [Lse-tech] [PATCH] cpusets - big numa cpu and memory placement
  2004-10-05  9:26                                         ` Simon Derr
  2004-10-05  9:58                                           ` Paul Jackson
  2004-10-05 19:34                                           ` Martin J. Bligh
@ 2004-10-05 22:33                                           ` Matthew Dobson
  2004-10-06  3:01                                             ` Paul Jackson
  2 siblings, 1 reply; 233+ messages in thread
From: Matthew Dobson @ 2004-10-05 22:33 UTC (permalink / raw)
  To: Simon Derr
  Cc: Martin J. Bligh, Paul Jackson, pwil3058, frankeh, dipankar,
	Andrew Morton, ckrm-tech, efocht, LSE Tech, hch, steiner,
	Jesse Barnes, sylvain.jeaugey, djh, LKML, Andi Kleen, sivanich

On Tue, 2004-10-05 at 02:26, Simon Derr wrote:
> I'd like to present you at this point what was the original decision for 
> having exclusive (called strict, at this point in history) and 
> non-exclusive cpusets.
> 
> The idea was to have a system, and run all jobs on it through a batch 
> scheduler. Some jobs cared about performance, some didn't.
> 
> The ones who cared about performance got an 'exclusive' cpuset, the ones 
> who didn't got a 'non exclusive' cpuset.

It sounds to me (and please correct me if I'm wrong) like 'non
exclusive' cpusets are more like a convenient way to group tasks than
any sort of performance or scheduling imperative.  It would seem that what
we'd really want here is task grouping functionality, more than a
'cpuset'.  A cpuset seems a bit heavy-handed if all we want to do is group
tasks for ease of administration.


> There are still processes running outside the job cpusets (i.e in the root 
> cpuset), sshd, the batch scheduler. These tasks use a low amount of CPU, 
> so it is okay if they happen to run inside even 'exclusive' cpusets. For 
> us, 'exclusive' only means that no other CPU-hungry job is going to share 
> our CPU.

If that's all 'exclusive' means then 'exclusive' is a poor choice of
terminology.  'Exclusive' sounds like it would exclude all tasks it is
possible to exclude from running there (ie: with the exception of
certain necessary kernel threads).

-Matt


^ permalink raw reply	[flat|nested] 233+ messages in thread

* Re: [Lse-tech] [PATCH] cpusets - big numa cpu and memory placement
  2004-10-05 19:34                                           ` Martin J. Bligh
@ 2004-10-06  0:28                                             ` Paul Jackson
  2004-10-06  1:16                                               ` Martin J. Bligh
  0 siblings, 1 reply; 233+ messages in thread
From: Paul Jackson @ 2004-10-06  0:28 UTC (permalink / raw)
  To: Martin J. Bligh
  Cc: Simon.Derr, pwil3058, frankeh, dipankar, akpm, ckrm-tech, efocht,
	lse-tech, hch, steiner, jbarnes, sylvain.jeaugey, djh,
	linux-kernel, colpatch, ak, sivanich

Martin wrote:
> Nope - personally I see us more headed for the exclusive cpusets, and
> handle the non-exclusive stuff via a more CKRM-style mechanism.

As Simon said, no go.

Martin:

 1) Are you going to prevent sched_setaffinity calls as well?
    What about the per-cpu kernel threads?

    See my reply to Hubertus on this thread:

	Date: Tue, 5 Oct 2004 07:20:48 -0700
	Message-Id: <20041005072048.16632106.pj@sgi.com>

 2) Do you have agreement from the LSF and PBS folks that they
    can port to systems that support "shares" (the old Cray
    term roughly equivalent to CKRM), but lacking placement
    for jobs using shared resources?  I doubt it.

 3) Do you understand that OpenMP and MPI applications can really
    need placement, in order to get separate threads on separate
    CPUs, to allow concurrent execution, even when they aren't using
    (or it isn't worth providing) 100% of each CPU they are on.

 4) Continuing on item (1), I think that CKRM is going to have to
    deal with varying, detailed placement constraints, such as is
    presently implemented using a variety of settings of cpus_allowed
    and mems_allowed.  So too will schedulers and allocators.  We can
    setup a few, high level domains, that correspond to entire cpuset
    subtrees, that have closer to the exclusive properties that
    you want (stronger than the current cpuset exclusive flag ensures).
    But within any of those domains, we need a mix of exclusive
    and non-exclusive placement.

The CKRM controlled shares style of mechanism is appropriate when
one CPU cycle is as good as another, and one just needs to manage
what share of the total capacity a given class of users receive.

There are other applications, such as OpenMP and MPI applications with
closely coupled parallel threads, that require placement, including in
setups where that application doesn't get a totally isolated exclusive
'soft' partition of its own.  If an OpenMP or MPI job doesn't have each
separate thread placed on a distinct CPU, it runs like crud.  This is
so whether the job has its own dedicated cpuset, or it is sharing CPUs.

And there are important system management products, such as OpenPBS and
LSF, which rely on placement of jobs in named sets of CPUs and Memory
Nodes, both for jobs that are closely coupled parallel and jobs that are
not, both for jobs that have exclusive use of the CPUs and Memory Nodes
assigned to them and not.

CKRM cannot make these other usage patterns and requirements go away,
and even if it could force cpusets to only come in the totally isolated
flavor, CKRM would still have to deal with the placement that occurs
on a thread-by-thread basis that is essential to the performance of
tightly coupled thread applications and essential to the basic function
of certain per-cpu kernel threads.

-- 
                          I won't rest till it's the best ...
                          Programmer, Linux Scalability
                          Paul Jackson <pj@sgi.com> 1.650.933.1373

^ permalink raw reply	[flat|nested] 233+ messages in thread

* Re: [Lse-tech] [PATCH] cpusets - big numa cpu and memory placement
  2004-10-06  0:28                                             ` Paul Jackson
@ 2004-10-06  1:16                                               ` Martin J. Bligh
  2004-10-06  2:08                                                 ` Paul Jackson
  0 siblings, 1 reply; 233+ messages in thread
From: Martin J. Bligh @ 2004-10-06  1:16 UTC (permalink / raw)
  To: Paul Jackson
  Cc: Simon.Derr, pwil3058, frankeh, dipankar, akpm, ckrm-tech, efocht,
	lse-tech, hch, steiner, jbarnes, sylvain.jeaugey, djh,
	linux-kernel, colpatch, ak, sivanich

>  1) Are you going to prevent sched_setaffinity calls as well?

Outside of the exclusive domain they're bound into, yes.

>     What about the per-cpu kernel threads?

Those are set up before the userspace domains, so will fall into
whatever domain they're bound to.

<cut lots of other stuff ...>

I think we're now getting down into really obscure requirements for
particular types of weird MP jobs. Whether Linux wants to support that
or not is open to debate, but personally, given the complexity involved,
I'd be against it.

I agree with the basic partitioning stuff - and see a need for that. The
non-exclusive stuff I think is fairly obscure, and unnecessary complexity
at this point, as 90% of it is covered by CKRM. It's Andrew and Linus's 
decision, but that's my input.

We'll never be able to provide every single feature everyone wants without
overloading the kernel with reams of complexity. It's also an evolutionary
process of putting in the most important stuff first, and seeing how it
goes. I see that as the exclusive domain stuff (when we find a better
implementation than cpus_allowed) + the CKRM scheduling resource control.
I know you have other opinions.

M.


^ permalink raw reply	[flat|nested] 233+ messages in thread

* Re: [Lse-tech] [PATCH] cpusets - big numa cpu and memory placement
  2004-10-06  1:16                                               ` Martin J. Bligh
@ 2004-10-06  2:08                                                 ` Paul Jackson
  2004-10-06 22:59                                                   ` Matthew Dobson
  0 siblings, 1 reply; 233+ messages in thread
From: Paul Jackson @ 2004-10-06  2:08 UTC (permalink / raw)
  To: Martin J. Bligh
  Cc: Simon.Derr, pwil3058, frankeh, dipankar, akpm, ckrm-tech, efocht,
	lse-tech, hch, steiner, jbarnes, sylvain.jeaugey, djh,
	linux-kernel, colpatch, ak, sivanich

Martin writes:
> I agree with the basic partitioning stuff - and see a need for that. The
> non-exclusive stuff I think is fairly obscure, and unnecessary complexity
> at this point, as 90% of it is covered by CKRM. It's Andrew and Linus's 
> decision, but that's my input.

Now you're trying to marginalize non-exclusive cpusets as a fringe
requirement.  Thanks a bunch ;).

Instead of requiring complete exclusion for all cpusets, and pointing to
the current 'exclusive' flag as the wrong flag at the wrong place at the
wrong time (sorry - my radio is turned to the V.P. debate in the
background) how about being clear about what sort of exclusion the
schedulers, the allocators and here the resource manager (CKRM) require.

I can envision dividing a machine into a few large, quite separate,
'soft' partitions, where each such partition is represented by a subtree
of the cpuset hierarchy, and where there is no overlap of CPUs, Memory
Nodes or tasks between the 'soft' partitions, even though there is a
possibly richly nested cpuset (cpu and memory affinity) structure within
any given 'soft' partition.

Nothing would cross 'soft' partition boundaries.  So far as CPUs, Memory
Nodes, Tasks and their Affinity, the 'soft' partitions would be
separate, isolated, and non-overlapping.

Each such 'soft' partition could host a separate instance (domain) of
the scheduler, allocator, and resource manager.  Any such domain would
know what set of CPUs, Memory Nodes and Tasks it was managing, and would
have complete and sole control of the scheduling, allocation or resource
sharing of those entities.

But also within a 'soft' partition, there would be finer grain placement,
finer grain CPU and Memory affinity, whether by the current tasks
cpus_allowed and mems_allowed, or by some improved mechanism that the
schedulers, allocators and resource managers could better deal with.

There _has_ to be.  Even if cpusets, sched_setaffinity, mbind, and
set_mempolicy all disappeared tomorrow, you still have the per-cpu
kernel threads that have to be placed to a tighter specification than
the whole of such a 'soft' partition.

Could you or some appropriate CKRM guru please try to tell me what
isolation you actually need for CKRM.  Matthew or Peter please do the
same for the schedulers.

In particular, do you need to prohibit any finer grained placement
within a particular domain, or not.  I believe not.  Is it not the case
that what you really need is that the cpusets that correspond to one of
your domains (my 'soft' partitions, above) be isolated from any other
such 'soft' partition?  Is it not the case that further, finer grained
placement within such an isolated 'soft' partition is acceptable?  Sure
better be.  Indeed, that's pretty much what we have now, with what
amounts to a single domain covering the entire system.

Instead of throwing out half of cpusets on claims that it conflicts
with the requirements of the schedulers, resource managers or (not yet
raised) the allocators, please be more clear as to what the actual
requirements are.

-- 
                          I won't rest till it's the best ...
                          Programmer, Linux Scalability
                          Paul Jackson <pj@sgi.com> 1.650.933.1373

^ permalink raw reply	[flat|nested] 233+ messages in thread

* Re: [Lse-tech] [PATCH] cpusets - big numa cpu and memory placement
  2004-10-05 22:19                               ` Matthew Dobson
@ 2004-10-06  2:39                                 ` Paul Jackson
  2004-10-06 23:21                                   ` Matthew Dobson
  2004-10-06  2:47                                 ` Paul Jackson
                                                   ` (2 subsequent siblings)
  3 siblings, 1 reply; 233+ messages in thread
From: Paul Jackson @ 2004-10-06  2:39 UTC (permalink / raw)
  To: colpatch
  Cc: mbligh, pwil3058, frankeh, dipankar, akpm, ckrm-tech, efocht,
	lse-tech, hch, steiner, jbarnes, sylvain.jeaugey, djh,
	linux-kernel, Simon.Derr, ak, sivanich

Matthew  wrote:
> 
> I feel that the actual implementation, however, is taking
> a wrong approach, because it attempts to use the cpus_allowed mask to
> override the scheduler in the general case.  cpus_allowed, in my
> estimation, is meant to be used as the exception, not the rule.

I agree that big chunks of a large system that are marching to the beat
of two distinctly different drummers would better have their schedulers
organized along the domains that you describe, than by brute force abuse
of the cpus_allowed mask.

I look forward to your RFC, Matthew.  Though not being a scheduler guru,
I will mostly have to rely on the textual commentary in order to
understand what it means.

Finer grain placement of CPUs (sched_setaffinity) and Memory (mbind,
set_mempolicy) already exists, and is required by the parallel threaded
applications that OpenMP and MPI are commonly used to develop.
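
(As a minimal user space illustration of that finer grain CPU placement,
using the glibc cpu_set_t flavor of sched_setaffinity(); the CPU number
chosen is only an example:)

#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>

/*
 * Minimal illustration: restrict the calling thread to one CPU, the way
 * an OpenMP or MPI runtime places each worker thread on its own CPU.
 */
static int pin_self_to_cpu(int cpu)
{
	cpu_set_t mask;

	CPU_ZERO(&mask);
	CPU_SET(cpu, &mask);
	/* pid 0 means "the calling thread" */
	return sched_setaffinity(0, sizeof(mask), &mask);
}

int main(void)
{
	if (pin_self_to_cpu(0) != 0) {
		perror("sched_setaffinity");
		return 1;
	}
	printf("now restricted to CPU 0\n");
	return 0;
}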

The finer grain use of non-exclusive cpusets, in order to support
such workload managers as PBS and LSF in managing this finer grained
placement on a system (domain) wide basis, should not place any
significant additional load on the schedulers or resource managers.

The top level cpusets must provide additional isolation properties so
that separate scheduler and resource manager domains can work in
relative isolation.  I've tried hard to speculate what these additional
isolation properties might be.  I look forward to hearing from the CKRM
and scheduler folks on this.  I agree that simple unconstrained (ab)use
of the cpus_allowed and mems_allowed masks, at that scale, places an
undue burden on the schedulers, allocators and resource managers.

-- 
                          I won't rest till it's the best ...
                          Programmer, Linux Scalability
                          Paul Jackson <pj@sgi.com> 1.650.933.1373

^ permalink raw reply	[flat|nested] 233+ messages in thread

* Re: [Lse-tech] [PATCH] cpusets - big numa cpu and memory placement
  2004-10-05 22:19                               ` Matthew Dobson
  2004-10-06  2:39                                 ` Paul Jackson
@ 2004-10-06  2:47                                 ` Paul Jackson
  2004-10-06  9:43                                   ` Simon Derr
  2004-10-06  8:02                                 ` Simon Derr
  2005-02-07 23:59                                 ` Matthew Dobson
  3 siblings, 1 reply; 233+ messages in thread
From: Paul Jackson @ 2004-10-06  2:47 UTC (permalink / raw)
  To: colpatch
  Cc: mbligh, pwil3058, frankeh, dipankar, akpm, ckrm-tech, efocht,
	lse-tech, hch, steiner, jbarnes, sylvain.jeaugey, djh,
	linux-kernel, Simon.Derr, ak, sivanich

Matthew wrote:
> 
> By adding locking and reference counting, and simplifying the way in which
> sched_domains are created, linked, unlinked and eventually destroyed we
> can use sched_domains as the implementation of cpusets.

I'd be inclined to turn this sideways from what you say.

Rather, add another couple of properties to cpusets:

 1) An isolated flag, that guarantees whatever isolation properties
    we agree that schedulers, allocators and resource allocators
    require between domains, and

 2) For those cpusets which are so isolated, the option to add
    links of some form, between that cpuset, and distinct scheduler,
    allocator and/or resource domains.

-- 
                          I won't rest till it's the best ...
                          Programmer, Linux Scalability
                          Paul Jackson <pj@sgi.com> 1.650.933.1373

^ permalink raw reply	[flat|nested] 233+ messages in thread

* Re: [Lse-tech] [PATCH] cpusets - big numa cpu and memory placement
  2004-10-05 22:33                                           ` Matthew Dobson
@ 2004-10-06  3:01                                             ` Paul Jackson
  2004-10-06 23:12                                               ` Matthew Dobson
  0 siblings, 1 reply; 233+ messages in thread
From: Paul Jackson @ 2004-10-06  3:01 UTC (permalink / raw)
  To: colpatch
  Cc: Simon.Derr, mbligh, pwil3058, frankeh, dipankar, akpm, ckrm-tech,
	efocht, lse-tech, hch, steiner, jbarnes, sylvain.jeaugey, djh,
	linux-kernel, ak, sivanich

Matthew writes:
> 
> If that's all 'exclusive' means then 'exclusive' is a poor choice of
> terminology.  'Exclusive' sounds like it would exclude all tasks it is
> possible to exclude from running there (ie: with the exception of
> certain necessary kernel threads).

I suspect that my aggressive pushing of mechanism _out_ of the
kernel has obscured what's going on here.

The real 'exclusive' use of some set of CPUs and Memory Nodes
is provided by the workload managers, PBS and LSF.  They fabricate
this out of the kernel cpuset 'exclusive' property, plus other
optional user level stuff.

For instance, one doesn't have to follow Simon's example, and leave the
classic Unix daemon load running in a cpuset that shares resources with
all other cpusets.  Instead, one can corral this classic Unix load into a
bootcpuset, administratively, at system boot.  All the kernel mechanisms
required to support this exist in my current cpuset patch in Andrew's
tree.

The kernel cpuset 'mems_exclusive' and 'cpus_exclusive' flags are like
vitamin precursors.  They are elements out of which the real nutritive
compound is constructed.  Occasionally, as in Simon's configuration,
they are actually sufficient in their current state.  Usually, more
processing is required.  This processing just isn't visible to the
kernel code.

Perhaps these flags should be called:
	mems_exclusive_precursor
	cpus_exclusive_precursor
;).

And I also agree that there is some other, stronger, set of conditions
that the scheduler, allocator and resource manager domains need in order
to obtain sufficient isolation to stay sane.

-- 
                          I won't rest till it's the best ...
                          Programmer, Linux Scalability
                          Paul Jackson <pj@sgi.com> 1.650.933.1373

^ permalink raw reply	[flat|nested] 233+ messages in thread

* Re: [Lse-tech] [PATCH] cpusets - big numa cpu and memory placement
  2004-10-05 22:19                               ` Matthew Dobson
  2004-10-06  2:39                                 ` Paul Jackson
  2004-10-06  2:47                                 ` Paul Jackson
@ 2004-10-06  8:02                                 ` Simon Derr
  2005-02-07 23:59                                 ` Matthew Dobson
  3 siblings, 0 replies; 233+ messages in thread
From: Simon Derr @ 2004-10-06  8:02 UTC (permalink / raw)
  To: Matthew Dobson
  Cc: Martin J. Bligh, Paul Jackson, pwil3058, frankeh, dipankar,
	Andrew Morton, ckrm-tech, efocht, LSE Tech, hch, steiner,
	Jesse Barnes, sylvain.jeaugey, djh, LKML, Simon.Derr, Andi Kleen,
	sivanich

On Tue, 5 Oct 2004, Matthew Dobson wrote:

> On Sun, 2004-10-03 at 16:53, Martin J. Bligh wrote:
> > > Martin wrote:
> > >> Matt had proposed having a separate sched_domain tree for each cpuset, which
> > >> made a lot of sense, but seemed harder to do in practice because "exclusive"
> > >> in cpusets doesn't really mean exclusive at all.
> > > 
> > > See my comments on this from yesterday on this thread.
> > > 
> > > I suspect we don't want a distinct sched_domain for each cpuset, but
> > > rather a sched_domain for each of several entire subtrees of the cpuset
> > > hierarchy, such that every CPU is in exactly one such sched domain, even
> > > though it be in several cpusets in that sched_domain.
> > 
> > Mmmm. The fundamental problem I think we ran across (just whilst pondering,
> > not in code) was that some things (eg ... init) are bound to ALL cpus (or
> > no cpus, depending how you word it); i.e. they're created before the cpusets
> > are, and are a member of the grand-top-level-uber-master-thingummy.
> > 
> > How do you service such processes? That's what I meant by the exclusive
> > domains aren't really exclusive. 
> > 
> > Perhaps Matt can recall the problems better. I really liked his idea, aside
> > from the small problem that it didn't seem to work ;-)
> 
> Well that doesn't seem like a fair statement.  It's potentially true,
> but it's really hard to say without an implementation! ;)
> 
> I think that the idea behind cpusets is really good, essentially
> creating isolated areas of CPUs and memory for tasks to run
> undisturbed.  I feel that the actual implementation, however, is taking
> a wrong approach, because it attempts to use the cpus_allowed mask to
> override the scheduler in the general case.  cpus_allowed, in my
> estimation, is meant to be used as the exception, not the rule.  If we
> wish to change that, we need to make the scheduler more aware of it, so
> it can do the right thing(tm) in the presence of numerous tasks with
> varying cpus_allowed masks.  The other option is to implement cpusets in
> a way that doesn't use cpus_allowed.  That is the option that I am
> pursuing.  

I like this idea. 

The current implementation uses cpus_allowed because it is non-intrusive, 
as it does not touch the scheduler at all, and also maybe because it was 
easy to do this way since the cpuset development team seems to lack 
scheduler gurus.

The 'non intrusive' part was also important as long as the cpusets were 
mostly 'on their own', but if now it appears that more cooperation with 
other functions such as CKRM is needed, I suppose a deeper impact on the 
scheduler code might be OK. Especially if we intend to enforce 'real 
exclusive' cpusets or things like that.

So I'm really interested in any design/bits of code that would go in that 
direction.

	Simon.


^ permalink raw reply	[flat|nested] 233+ messages in thread

* Re: [Lse-tech] [PATCH] cpusets - big numa cpu and memory placement
  2004-10-06  2:47                                 ` Paul Jackson
@ 2004-10-06  9:43                                   ` Simon Derr
  2004-10-06 13:27                                     ` Paul Jackson
  2004-10-06 21:55                                     ` Peter Williams
  0 siblings, 2 replies; 233+ messages in thread
From: Simon Derr @ 2004-10-06  9:43 UTC (permalink / raw)
  To: Paul Jackson
  Cc: colpatch, mbligh, pwil3058, frankeh, dipankar, akpm, ckrm-tech,
	efocht, lse-tech, hch, steiner, jbarnes, sylvain.jeaugey, djh,
	linux-kernel, Simon.Derr, ak, sivanich

On Tue, 5 Oct 2004, Paul Jackson wrote:

> Matthew wrote:
> > 
> > By adding locking and reference counting, and simplifying the way in which
> > sched_domains are created, linked, unlinked and eventually destroyed we
> > can use sched_domains as the implementation of cpusets.
> 
> I'd be inclined to turn this sideways from what you say.
> 
> Rather, add another couple of properties to cpusets:
> 
>  1) An isolated flag, that guarantees whatever isolation properties
>     we agree that schedulers, allocators and resource allocators
>     require between domains, and
> 
>  2) For those cpusets which are so isolated, the option to add
>     links of some form, between that cpuset, and distinct scheduler,
>     allocator and/or resource domains.
> 

Just to make sure we speak the same language:

That would lead to three kinds of cpusets:

1-'isolated' cpusets, with maybe a distinct scheduler, allocator and/or 
resource domains.

2-'exclusive' cpusets (maybe with a better name?), that just don't overlap 
with other cpusets who have the same parent.

3-'non-exclusive, non isolated' cpusets, with no restriction of any kind.

I suppose it would still be possible to create cpusets of type 2 or 3 
inside a type-1 cpuset. They would be managed by the scheduler of the 
parent 'isolated' cpuset.

I was thinking that the top cpuset is a particular case of type-1, but 
actually no.

'isolated' cpusets should probably be at the same level as the top cpuset 
(who should lose this name, then).

How should 'isolated' cpusets be created? Should the top_cpuset be shrunk
to free some CPUs so we have room to create a new 'isolated' cpuset?

Or should 'isolated' cpusets stay inside the top cpuset, which would have
to schedule its processes outside the 'isolated' cpusets? Should it then
be forbidden to cover the whole system with 'isolated' cpusets?

That's a lot of question marks...

^ permalink raw reply	[flat|nested] 233+ messages in thread

* Re: [Lse-tech] [PATCH] cpusets - big numa cpu and memory placement
  2004-10-06  9:43                                   ` Simon Derr
@ 2004-10-06 13:27                                     ` Paul Jackson
  2004-10-06 21:55                                     ` Peter Williams
  1 sibling, 0 replies; 233+ messages in thread
From: Paul Jackson @ 2004-10-06 13:27 UTC (permalink / raw)
  To: Simon Derr
  Cc: colpatch, mbligh, pwil3058, frankeh, dipankar, akpm, ckrm-tech,
	efocht, lse-tech, hch, steiner, jbarnes, sylvain.jeaugey, djh,
	linux-kernel, Simon.Derr, ak, sivanich

Simon wrote:
> Just to make sure we speak the same language:

Approximately.  We already have two cpuset properties of cpus_exclusive
and mems_exclusive, which if set, assure that cpus and mems respectively
don't overlap with siblings or cousins.

I am imagining adding one more cpuset property: isolated.

Just what it would guarantee if set isn't clear yet: it would have to
provide whatever we agreed the scheduler, allocator and resource manager
folks needed in order to sanely support a separate domain in that
isolated cpuset.  I'm currently expecting this to be something along the
lines of the following:
  a. mems_exclusive == 1
  b. cpus_exclusive == 1
  c. no isolated ancestor or descendent
  d. no task attached to any ancestor that is not either entirely within,
       or entirely without, both the cpus and mems of the isolated cpuset.

Attempts later on to change the cpus or mems allowed of any task so as
to create a violation of [d.] would fail.  As would any other action
that would violate the above.
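
(A rough sketch of the kind of test [a.] through [d.] imply when setting
such a flag; there is no 'isolated' flag in the current patch, and the
cpuset fields and helpers used below are assumptions:)

/*
 * Illustrative only: what validating a hypothetical 'isolated' flag
 * might involve.  The cpuset fields (parent, isolated, *_exclusive)
 * and the task scan below are assumptions, not code from the patch.
 */
static int may_mark_isolated(struct cpuset *cs)
{
	struct cpuset *anc;
	struct task_struct *p;

	/* [a.] and [b.]: must already be exclusive in both dimensions */
	if (!cs->cpus_exclusive || !cs->mems_exclusive)
		return -EINVAL;

	/* [c.]: no isolated ancestor (a similar walk would be needed
	 * over descendents) */
	for (anc = cs->parent; anc; anc = anc->parent)
		if (anc->isolated)
			return -EBUSY;

	/* [d.]: every task's cpus_allowed must lie entirely inside or
	 * entirely outside this cpuset (mems_allowed likewise) */
	read_lock(&tasklist_lock);
	for_each_process(p) {
		if (cpus_intersects(p->cpus_allowed, cs->cpus_allowed) &&
		    !cpus_subset(p->cpus_allowed, cs->cpus_allowed)) {
			read_unlock(&tasklist_lock);
			return -EBUSY;
		}
	}
	read_unlock(&tasklist_lock);

	return 0;
}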

I'm still unsure of just what is needed.  I'm beginning to suspect that
there is a reasonable meeting point with the scheduler folks, but that
the CKRM folks may want something constructed from unobtainium.  The
allocator folks are easy so far, as they haven't formed an organized
resistance ;).

There would be five flavors of cpusets.  The four flavors obtained by
each combination of cpus_exclusive 0 or 1, and mems_exclusive 0 or 1,
but with isolated == 0.  And the fifth flavor, with each of the
exclusive flags set 1, plus the isolated flag set 1.

The root node would start out isolated, but the first time you went to
mark a direct child of it isolated, if that effort succeeded, then the
root would lose its isolation (isolated goes to '0'), in accordance with
property [c.]  You would have to be using a bootcpuset for this to have
any chance of working, with all the tasks having cpus_allowed ==
CPU_MASK_ALL, or mems_allowed == NODE_MASK_ALL, already confined to the
bootcpuset.  The top level default scheduler, allocator and resource
manager would have to be able to work in a domain that was not isolated
and with some of its tasks, cpus and memory perhaps being managed by a
scheduler, allocator and/or resource manager in an isolated subordinate
domain.


> 'isolated' cpusets should probably be at the same level as the top cpuset 
> (who should lose this name, then).

I don't think so.  The top remains the one and only, all encompassing, top.


> Or should 'isolated' cpusets stay inside the top cpuset, that whould have 
> to schedule its processes outside the 'isolated' cpusets 

Yes - isolated cpusets stay beneath the top cpuset.  Any given task in
the top cpuset would lie either entirely within, or without, of any
isolated descendent.  If within and if that isolated descendent has a
scheduler, it owns the scheduling of that task.  Similarly for the
allocator and resource manager.


> Should it then 
> be forbidden to cover the whole system with 'isolated' cpusets ?

No need for this that I am aware of, yet anyway.


> That's a lot of question marks...

Yes - lots of question marks.

But the basic objectives are not too much up to question at this point:
 1) An isolated cpuset must not overlap any other isolated cpuset, not in
    mems, not in cpus, and (the tricky part) not in the affinity masks (or
    whatever becomes of cpus_allowed and mems_allowed) of any task in the
    system.
 2) For any cpus_allowed or mems_allowed of any task or cpuset in the
    entire system, it is either entirely contained within some isolated
    cpuset, or entirely outside all of them.
 3) Necessarily from the above, the isolated cpusets form a partial,
    non-overlapping covering of the entire systems cpus, memory nodes,
    and (via the per-task affinity bitmaps) tasks.

The final result being that for any scheduler, allocator or resource
manager:
 * it knows exactly what is its domain of cpus, memory nodes or tasks
 * it is the sole and exclusive owner of all in its domain, and 
 * it has no bearing on anything outside its domain.

It may well be that task->cpus_allowed and task->mems_allowed remain as
they are now, but that for major top level 'soft' partitionings of the system,
we use these isolated cpusets, and attach additional properties friendly to
the needs of schedulers, allocators and resource managers to such isolated
cpusets.  This would put the *_allowed bitmaps back closer to being what they
should be - small scale exceptions rather than large scale abuses.

An isolated cpuset might well not have its own dedicated domains for
all three of schedulers, allocators and resource managers.  It might
have say just its own scheduler, but continue to rely on the global
allocator and resource manager.

===

First however - I am still eager to hear what the CKRM folks think of
set_affinity, mbind and set_mempolicy, as well as what they think of the
current existing per-cpu kernel threads.  It would seem that, regardless
of their take on cpusets, the CKRM folks might not be too happy with any
of these other means of setting the *_allowed bitmaps to anything other
than CPU_MASK_ALL.  My best guess from what I've seen so far is that
they are trying to ignore these other issues with varied *_allowed
bitmap settings as being 'beneath the radar', but trying to use the same
issues to transform cpusets into being pretty much _only_ the flat space
of isolated cpusets from above, minus its hierarchical nesting and
non-exclusive options.

And in any case, I've yet to see that OpenMP and MPI jobs, with their
tight threading, fit well in the CKRM world.  Such jobs require having
each thread on a separate CPU, or their performance sucks big time. They
can share CPUs and Nodes with other work and not suffer _too_ bad
(especially if something like gang scheduling is available), but they
must be placed one thread per distinct CPU.  This is absolutely a
placement matter, not a fair share percentage of overall resources
matter.  From all I can see, the CKRM folks just wish such jobs would go
away, or at least they wish that the main Linux kernel would accept a
CKRM patch that is inhospitable to such jobs.

My hope is that CKRM, like the schedulers, is tolerant of smaller scale
exceptions to the allowed placement.

-- 
                          I won't rest till it's the best ...
                          Programmer, Linux Scalability
                          Paul Jackson <pj@sgi.com> 1.650.933.1373

^ permalink raw reply	[flat|nested] 233+ messages in thread

* Re: [Lse-tech] [PATCH] cpusets - big numa cpu and memory placement
  2004-10-06  9:43                                   ` Simon Derr
  2004-10-06 13:27                                     ` Paul Jackson
@ 2004-10-06 21:55                                     ` Peter Williams
  2004-10-06 22:49                                       ` Paul Jackson
  1 sibling, 1 reply; 233+ messages in thread
From: Peter Williams @ 2004-10-06 21:55 UTC (permalink / raw)
  To: Simon Derr
  Cc: Paul Jackson, colpatch, mbligh, frankeh, dipankar, akpm,
	ckrm-tech, efocht, lse-tech, hch, steiner, jbarnes,
	sylvain.jeaugey, djh, linux-kernel, ak, sivanich

Simon Derr wrote:
> On Tue, 5 Oct 2004, Paul Jackson wrote:
> 
> 
>>Matthew wrote:
>>
>>>By adding locking and reference counting, and simplifying the way in which
>>>sched_domains are created, linked, unlinked and eventually destroyed we
>>>can use sched_domains as the implementation of cpusets.
>>
>>I'd be inclined to turn this sideways from what you say.
>>
>>Rather, add another couple of properties to cpusets:
>>
>> 1) An isolated flag, that guarantees whatever isolation properties
>>    we agree that schedulers, allocators and resource allocators
>>    require between domains, and
>>
>> 2) For those cpusets which are so isolated, the option to add
>>    links of some form, between that cpuset, and distinct scheduler,
>>    allocator and/or resource domains.
>>
> 
> 
> Just to make sure we speak the same language:
> 
> That would lead to three kinds of cpusets:
> 
> 1-'isolated' cpusets, with maybe a distinct scheduler, allocator and/or 
> resource domains.
> 
> 2-'exclusive' cpusets (maybe with a better name?), that just don't overlap 
> with other cpusets who have the same parent.
> 
> 3-'non-exclusive, non isolated' cpusets, with no restriction of any kind.
> 
> I suppose it would still be possible to create cpusets of type 2 or 3 
> inside a type-1 cpuset. They would be managed by the scheduler of the 
> parent 'isolated' cpuset.
> 
> I was thinking that the top cpuset is a particular case of type-1, but 
> actually no.
> 
> 'isolated' cpusets should probably be at the same level as the top cpuset 
> (who should lose this name, then).
> 
> How should 'isolated' cpusets be created? Should the top_cpuset be shrunk
> to free some CPUs so we have room to create a new 'isolated' cpuset?
>
> Or should 'isolated' cpusets stay inside the top cpuset, which would have
> to schedule its processes outside the 'isolated' cpusets? Should it then
> be forbidden to cover the whole system with 'isolated' cpusets?
> 
> That's a lot of question marks...
> 

I think that this is becoming overly complicated.  I think that you need 
(at most) two types of cpuset: 1. the top level non overlapping type and 
2. possibly overlapping sets within the top level ones. I think that the 
term cpuset should be reserved for the top level ones and some other 
term be coined for the others.  The type 2 ones are really just the 
equivalent of the current affinity mask but with the added constraint 
that it be a (non empty) proper subset of the containing cpuset.

The three types that you've described are then just examples of 
configurations that could be achieved using this model.

Peter
-- 
Peter Williams                                   pwil3058@bigpond.net.au

"Learning, n. The kind of ignorance distinguishing the studious."
  -- Ambrose Bierce

^ permalink raw reply	[flat|nested] 233+ messages in thread

* Re: [Lse-tech] [PATCH] cpusets - big numa cpu and memory placement
  2004-10-06 21:55                                     ` Peter Williams
@ 2004-10-06 22:49                                       ` Paul Jackson
  0 siblings, 0 replies; 233+ messages in thread
From: Paul Jackson @ 2004-10-06 22:49 UTC (permalink / raw)
  To: Peter Williams
  Cc: Simon.Derr, colpatch, mbligh, frankeh, dipankar, akpm, ckrm-tech,
	efocht, lse-tech, hch, steiner, jbarnes, sylvain.jeaugey, djh,
	linux-kernel, ak, sivanich

Peter protests:
> I think that this is becoming overly complicated.

My brainstorming ways to accomodate the isolation that the scheduler,
allocator and resource manager domains require is getting ahead of
itself.

First I need to hear from the CKRM folks what degree of isolation they
really need, the essential minimum, and how they intend to accomodate
not just cpusets, but also the other placement API's sched_setaffinity,
mbind and set_mempolicy, as well as the per-cpu kernel threads.

Then it makes sense to revisit the implementation.

-- 
                          I won't rest till it's the best ...
                          Programmer, Linux Scalability
                          Paul Jackson <pj@sgi.com> 1.650.933.1373

^ permalink raw reply	[flat|nested] 233+ messages in thread

* Re: [Lse-tech] [PATCH] cpusets - big numa cpu and memory placement
  2004-10-06  2:08                                                 ` Paul Jackson
@ 2004-10-06 22:59                                                   ` Matthew Dobson
  2004-10-06 23:23                                                     ` Peter Williams
  2004-10-07  8:51                                                     ` Paul Jackson
  0 siblings, 2 replies; 233+ messages in thread
From: Matthew Dobson @ 2004-10-06 22:59 UTC (permalink / raw)
  To: Paul Jackson
  Cc: Martin J. Bligh, Simon.Derr, pwil3058, frankeh, dipankar,
	Andrew Morton, ckrm-tech, efocht, LSE Tech, hch, steiner,
	Jesse Barnes, sylvain.jeaugey, djh, LKML, Andi Kleen, sivanich

On Tue, 2004-10-05 at 19:08, Paul Jackson wrote:
> Martin writes:
> > I agree with the basic partitioning stuff - and see a need for that. The
> > non-exclusive stuff I think is fairly obscure, and unnecessary complexity
> > at this point, as 90% of it is covered by CKRM. It's Andrew and Linus's 
> > decision, but that's my input.
> 
> Now you're trying to marginalize non-exclusive cpusets as a fringe
> requirement.  Thanks a bunch ;).
> 
> Instead of requiring complete exclusion for all cpusets, and pointing to
> the current 'exclusive' flag as the wrong flag at the wrong place at the
> wrong time (sorry - my radio is turned to the V.P. debate in the
> background) how about being clear about what sort of exclusion the
> schedulers, the allocators and here the resource manager (CKRM) require.

I think what Martin is trying to say, in his oh so eloquent way, is that
the difference between 'non-exclusive' cpusets and, say, CKRM
taskclasses isn't very clear.  It seems to me that non-exclusive cpusets
are little more than a convenient way to group tasks.  Now, I'm not
saying that I don't think that is a useful functionality, but I am
saying that cpusets seem like the wrong way to go about it.


> I can envision dividing a machine into a few large, quite separate,
> 'soft' partitions, where each such partition is represented by a subtree
> of the cpuset hierarchy, and where there is no overlap of CPUs, Memory
> Nodes or tasks between the 'soft' partitions, even though there is a
> possibly richly nested cpuset (cpu and memory affinity) structure within
> any given 'soft' partition.
> 
> Nothing would cross 'soft' partition boundaries.  So far as CPUs, Memory
> Nodes, Tasks and their Affinity, the 'soft' partitions would be
> separate, isolated, and non-overlapping.

Ok.  These imaginary 'soft' partitions sound much like what I expected
'exclusive' cpusets to be based on the terminology.  They also sound
exactly like what I am trying to implement through my sched_domains
work.


> Each such 'soft' partition could host a separate instance (domain) of
> the scheduler, allocator, and resource manager.  Any such domain would
> know what set of CPUs, Memory Nodes and Tasks it was managing, and would
> have complete and sole control of the scheduling, allocation or resource
> sharing of those entities.

I don't know whether these partitions would necessarily need their own
scheduler, allocator and resource manager, or whether we would just make the
current scheduler, allocator and resource manager aware of these
boundaries.  In either case, that is an implementation detail not to be
agonized over now.


> But also within a 'soft' partition, there would be finer grain placement,
> finer grain CPU and Memory affinity, whether by the current tasks
> cpus_allowed and mems_allowed, or by some improved mechanism that the
> schedulers, allocators and resource managers could better deal with.
> 
> There _has_ to be.  Even if cpusets, sched_setaffinity, mbind, and
> set_mempolicy all disappeared tomorrow, you still have the per-cpu
> kernel threads that have to be placed to a tighter specification than
> the whole of such a 'soft' partition.

Agreed.  I'm not proposing that we rip out sched_set/getaffinity, mbind,
etc.  What I'm saying is that tasks should not *default* to using these
mechanisms because, at least in their current incarnations, our
scheduler and allocator are written in such a way that these mechanisms
are secondary.  The assumption is that the scheduler/allocator can
schedule/allocate wherever they choose.  The scheduler does look at
these bindings and if they contradict the decision made we deal with
that after the fact.  The allocator has longer code paths and more logic
to deal with if there are bindings in place.  So our options are to
either:
1) find a way to not have to rely on these mechanisms for most/all tasks
in the system, or 
2) rewrite the scheduler/allocator to deal with these bindings up front,
and take them into consideration early in the scheduling/allocating
process.


> Could you or some appropriate CKRM guru please try to tell me what
> isolation you actually need for CKRM.  Matthew or Peter please do the
> same for the schedulers.
> 
> In particular, do you need to prohibit any finer grained placement
> within a particular domain, or not.  I believe not.  Is it not the case
> that what you really need is that the cpusets that correspond to one of
> your domains (my 'soft' partitions, above) be isolated from any other
> such 'soft' partition?  Is it not the case that further, finer grained
> placement within such an isolated 'soft' partition is acceptable?  Sure
> better be.  Indeed, that's pretty much what we have now, with what
> amounts to a single domain covering the entire system.

I must also plead ignorance to the gritty details of CKRM.  It would
seem to me, from discussions on this thread, that CKRM could be made to
deal with 'isolated' domains, 'soft' partitions, or 'exclusive' cpusets
without TOO much headache.  Basically just telling CKRM that the tasks
in this group are sharing CPU time from a pool of 4 CPUs, rather than
all 16 CPUs in the system.  Hubertus?  As far as supporting fine grained
binding inside domains, that should definitely be supported in any
solution worthy of acceptance.  CKRM, to the best of my knowledge,
currently deals with cpus_allowed, and there's no reason to think that
it wouldn't be able to deal with cpus_allowed in the multiple domain
case.


> Instead of throwing out half of cpusets on claims that it conflicts
> with the requirements of the schedulers, resource managers or (not yet
> raised) the allocators, please be more clear as to what the actual
> requirements are.

That's not really the reason that I was arguing against half of
cpusets.  My argument is not related to CKRM's requirements, as I really
don't know what those are! :)  My argument is that I don't see what
non-exclusive cpusets buys us.  If all we're looking for is basic
task-grouping functionality, I'm quite certain that we can implement
that in a much more light-weight way that doesn't conflict with the
scheduler's decision making process.  In fact, for non-exclusive
cpusets, I'd say that we can probably implement that type of
task-grouping in a non-intrusive way that will complement the scheduler
and possibly even improve performance by giving the scheduler a hint
about which tasks should be scheduled together.  Using cpus_allowed is
not that way.  cpus_allowed should be reserved for what it was
originally meant for: specifying a *strict* subset of CPUs that a task
is restricted to running on.

-Matt


^ permalink raw reply	[flat|nested] 233+ messages in thread

* Re: [Lse-tech] [PATCH] cpusets - big numa cpu and memory placement
  2004-10-06  3:01                                             ` Paul Jackson
@ 2004-10-06 23:12                                               ` Matthew Dobson
  2004-10-07  8:59                                                 ` [ckrm-tech] " Paul Jackson
  0 siblings, 1 reply; 233+ messages in thread
From: Matthew Dobson @ 2004-10-06 23:12 UTC (permalink / raw)
  To: Paul Jackson
  Cc: Simon.Derr, Martin J. Bligh, pwil3058, frankeh, dipankar,
	Andrew Morton, ckrm-tech, efocht, LSE Tech, hch, steiner,
	Jesse Barnes, sylvain.jeaugey, djh, LKML, Andi Kleen, sivanich

On Tue, 2004-10-05 at 20:01, Paul Jackson wrote:
> Matthew writes:
> > 
> > If that's all 'exclusive' means then 'exclusive' is a poor choice of
> > terminology.  'Exclusive' sounds like it would exclude all tasks it is
> > possible to exclude from running there (ie: with the exception of
> > certain necessary kernel threads).
> 
> I suspect that my aggressive pushing of mechanism _out_ of the
> kernel has obscured what's going on here.
> 
> The real 'exclusive' use of some set of CPUs and Memory Nodes
> is provided by the workload managers, PBS and LSF.  They fabricate
> this out of the kernel cpuset 'exclusive' property, plus other
> optional user level stuff.
> 
> For instance, one doesn't have to follow Simon's example, and leave the
> classic Unix daemon load running in a cpuset that shares resources with
> all other cpusets.  Instead, one can corral this classic Unix load into a
> bootcpuset, administratively, at system boot.  All the kernel mechanisms
> required to support this exist in my current cpuset patch in Andrew's
> tree.
> 
> The kernel cpuset 'mems_exclusive' and 'cpus_exclusive' flags are like
> vitamin precursors.  They are elements out of which the real nutritive
> compound is constructed.  Occasionally, as in Simon's configuration,
> they are actually sufficient in their current state.  Usually, more
> processing is required.  This processing just isn't visible to the
> kernel code.
> 
> Perhaps these flags should be called:
> 	mems_exclusive_precursor
> 	cpus_exclusive_precursor
> ;).

Ok...  So if we could offer the 'real' exclusion that the PBS and LSF
workload managers offer directly, would that suffice?  Meaning, could we
make PBS and LSF work on top of in-kernel mechanisms that offer 'real'
exclusion.  'Real' exclusion defined as isolated groups of CPUs and
memory that the kernel can guarantee will not run other processes?  That
way we can get the job done without having to rely on these external
workload managers, and be able to offer this dynamic partitioning to all
users.  Thoughts?

-Matt


^ permalink raw reply	[flat|nested] 233+ messages in thread

* Re: [Lse-tech] [PATCH] cpusets - big numa cpu and memory placement
  2004-10-06  2:39                                 ` Paul Jackson
@ 2004-10-06 23:21                                   ` Matthew Dobson
  2004-10-07  9:41                                     ` [ckrm-tech] " Paul Jackson
  0 siblings, 1 reply; 233+ messages in thread
From: Matthew Dobson @ 2004-10-06 23:21 UTC (permalink / raw)
  To: Paul Jackson
  Cc: Martin J. Bligh, pwil3058, frankeh, dipankar, Andrew Morton,
	ckrm-tech, efocht, LSE Tech, hch, steiner, Jesse Barnes,
	sylvain.jeaugey, djh, LKML, Simon.Derr, Andi Kleen, sivanich

On Tue, 2004-10-05 at 19:39, Paul Jackson wrote:
> Matthew  wrote:
> > 
> > I feel that the actual implementation, however, is taking
> > a wrong approach, because it attempts to use the cpus_allowed mask to
> > override the scheduler in the general case.  cpus_allowed, in my
> > estimation, is meant to be used as the exception, not the rule.
> 
> I agree that big chunks of a large system that are marching to the beat
> of two distinctly different drummers would better have their schedulers
> organized along the domains that you describe, than by brute force abuse
> of the cpus_allowed mask.

Wonderful news! :)


> I look forward to your RFC, Matthew.  Though not being a scheduler guru,
> I will mostly have to rely on the textual commentary in order to
> understand what it means.

Wow, building a fan base already.  I'll need all the cheerleaders I can
get! ;)


> Finer grain placement of CPUs (sched_setaffinity) and Memory (mbind,
> set_mempolicy) already exists, and is required by the parallel threaded
> applications that OpenMP and MPI are commonly used to develop.

Absolutely.  I have no intention of removing or modifying those
mechanisms.  My only goal is to see that using them remains the
exceptional case, and not the default behavior of most tasks.


> The finer grain use of non-exclusive cpusets, in order to support
> such workload managers as PBS and LSF in managing this finer grained
> placement on a system (domain) wide basis, should not place any
> significant additional load on the schedulers or resource managers.
> 
> The top level cpusets must provide additional isolation properties so
> that separate scheduler and resource manager domains can work in
> relative isolation.  I've tried hard to speculate what these additional
> isolation properties might be.  I look forward to hearing from the CKRM
> and scheduler folks on this.  I agree that simple unconstrained (ab)use
> of the cpus_allowed and mems_allowed masks, at that scale, places an
> undue burden on the schedulers, allocators and resource managers.

I'm really glad to hear that, Paul.  That unconstrained (ab)use was my
only real concern with the cpusets patches.  I look forward to massaging
our two approaches into something that will satisfy all interested
parties.

-Matt


^ permalink raw reply	[flat|nested] 233+ messages in thread

* Re: [Lse-tech] [PATCH] cpusets - big numa cpu and memory placement
  2004-10-06 22:59                                                   ` Matthew Dobson
@ 2004-10-06 23:23                                                     ` Peter Williams
  2004-10-07  0:16                                                       ` Rick Lindsley
  2004-10-07  8:51                                                     ` Paul Jackson
  1 sibling, 1 reply; 233+ messages in thread
From: Peter Williams @ 2004-10-06 23:23 UTC (permalink / raw)
  To: colpatch
  Cc: Paul Jackson, Martin J. Bligh, Simon.Derr, frankeh, dipankar,
	Andrew Morton, ckrm-tech, efocht, LSE Tech, hch, steiner,
	Jesse Barnes, sylvain.jeaugey, djh, LKML, Andi Kleen, sivanich

Matthew Dobson wrote:
> On Tue, 2004-10-05 at 19:08, Paul Jackson wrote:
> 
> I don't know that these partitions would necessarily need their own
> scheduler, allocator and resource manager, or if we would just make the
> current scheduler, allocator and resource manager aware of these
> boundaries.  In either case, that is an implementation detail not to be
> agonized over now.

It's not so much whether they NEED their own scheduler, etc. as whether 
it should be possible for them to have their own scheduler, etc.  With a 
configurable scheduler (such as ZAPHOD) this could just be a matter of 
having separate configuration variables for each cpuset (e.g. if a 
cpuset has been created to contain a bunch of servers there's no need 
to try and provide good interactive response for its tasks (as none of 
them will be interactive) so the interactive response mechanism can be 
turned off in that cpuset leading to better server response and throughput).

Peter
-- 
Peter Williams                                   pwil3058@bigpond.net.au

"Learning, n. The kind of ignorance distinguishing the studious."
  -- Ambrose Bierce

^ permalink raw reply	[flat|nested] 233+ messages in thread

* Re: [Lse-tech] [PATCH] cpusets - big numa cpu and memory placement
  2004-10-06 23:23                                                     ` Peter Williams
@ 2004-10-07  0:16                                                       ` Rick Lindsley
  2004-10-07 18:27                                                         ` Paul Jackson
  0 siblings, 1 reply; 233+ messages in thread
From: Rick Lindsley @ 2004-10-07  0:16 UTC (permalink / raw)
  To: Peter Williams
  Cc: colpatch, Paul Jackson, Martin J. Bligh, Simon.Derr, frankeh,
	dipankar, Andrew Morton, ckrm-tech, efocht, LSE Tech, hch,
	steiner, Jesse Barnes, sylvain.jeaugey, djh, LKML, Andi Kleen,
	sivanich

    It's not so much whether they NEED their own scheduler, etc. as whether 
    it should be possible for them to have their own scheduler, etc.  With a 
    configurable scheduler (such as ZAPHOD) this could just be a matter of 
    having separate configuration variables for each cpuset (e.g. if a 
    cpuset has been created to contain a bunch of servers there's no need 
    to try and provide good interactive response for its tasks (as none of 
    them will be interactive) so the interactive response mechanism can be 
    turned off in that cpuset leading to better server response and throughput).

Providing configurable schedulers is a feature/bug/argument completely
separate from cpusets.  Let's stay focused on that for now.

Two concrete examples for cpusets stick in my mind:

    * the department that has been given 16 cpus of a 128 cpu machine,
      is free to do what they want with them, and doesn't much care
      specifically how they're laid out. Think general timeshare.

    * the department that has been given 16 cpus of a 128 cpu machine
      to run a finely tuned application which expects and needs everybody
      to stay off those cpus. Think compute-intensive.

Correct me if I'm wrong, but CKRM can handle the first, but cannot
currently handle the second.  And the mechanism(s) for creating either
situation are suboptimal at best and non-existent at worst.

Rick

^ permalink raw reply	[flat|nested] 233+ messages in thread

* Re: [Lse-tech] [PATCH] cpusets - big numa cpu and memory placement
  2004-10-06 22:59                                                   ` Matthew Dobson
  2004-10-06 23:23                                                     ` Peter Williams
@ 2004-10-07  8:51                                                     ` Paul Jackson
  2004-10-07 10:53                                                       ` Rick Lindsley
                                                                         ` (2 more replies)
  1 sibling, 3 replies; 233+ messages in thread
From: Paul Jackson @ 2004-10-07  8:51 UTC (permalink / raw)
  To: colpatch
  Cc: mbligh, Simon.Derr, pwil3058, frankeh, dipankar, akpm, ckrm-tech,
	efocht, lse-tech, hch, steiner, jbarnes, sylvain.jeaugey, djh,
	linux-kernel, ak, sivanich

> I don't see what non-exclusive cpusets buys us.

One can nest them, overlap them, and duplicate them ;)

For example, we could do the following:

 * Carve off CPUs 128-255 of a 256 CPU system in which
   to run various HPC jobs, requiring numbers of CPUs.
   This is named /dev/cpuset/hpcarena, and it is the really
   really exclusive and isolated sort of cpuset which can and
   does have its own scheduler domain, for a scheduler configuration
   that is tuned for running a mix of HPC jobs.  In this hpcarena
   also runs the per-cpu kernel threads that are pinned on CPUs
   128-255 (for _all_ tasks running on an exclusive cpuset
   must be in that cpuset or below).

 * The testing group gets half of this cpuset each weekend, in
   order to run a battery of tests: /dev/cpuset/hpcarena/testing.
   In this testing cpuset runs the following batch manager.

 * They run a home brew batch manager, which takes an input
   stream of test cases, carves off a small cpuset of the
   requested size, and runs that test case in that cpuset.
   This results in cpusets with names like:
   /dev/cpuset/hpcarena/testing/test123.  Our test123 is
   running in this cpuset.

 * Test123 here happens to be a test of the integrity of cpusets,
   so sets up a couple of cpusets to run two independent jobs,
   each a 2 CPU MPI job.  This results in the cpusets:
   /dev/cpuset/hpcarena/testing/test123/a and
   /dev/cpuset/hpcarena/testing/test123/b.  Our little
   MPI jobs 'a' and 'b' are running in these two cpusets.

We now have several nested cpusets, each overlapping its ancestors,
with tasks in each cpuset.

But only the top hpcarena cpuset has the exclusive ownership
with no form of overlap of everything in its subtree that
something like a distinct scheduler domain wants.

Hopefully the above is not what you meant by "little more than a
convenient way to group tasks."
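
(For concreteness, a small user space sketch of how the test harness above
might create and populate one of those child cpusets through the
/dev/cpuset file system; the control file names 'cpus', 'mems' and 'tasks',
the use of the list format for the cpus file, and the node number are
assumptions here:)

#include <stdio.h>
#include <unistd.h>
#include <sys/stat.h>
#include <sys/types.h>

/* Write a small string to a cpuset control file; error handling omitted. */
static void write_str(const char *path, const char *val)
{
	FILE *f = fopen(path, "w");

	if (f) {
		fprintf(f, "%s", val);
		fclose(f);
	}
}

int main(void)
{
	const char *cs = "/dev/cpuset/hpcarena/testing/test123/a";
	char path[128], pid[32];

	mkdir(cs, 0755);			/* a new directory is a new child cpuset */

	snprintf(path, sizeof(path), "%s/cpus", cs);
	write_str(path, "128-129\n");		/* two CPUs, in list format */

	snprintf(path, sizeof(path), "%s/mems", cs);
	write_str(path, "8\n");			/* one memory node */

	snprintf(path, sizeof(path), "%s/tasks", cs);
	snprintf(pid, sizeof(pid), "%d\n", getpid());
	write_str(path, pid);			/* move this process into the cpuset */

	return 0;
}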


> 2) rewrite the scheduler/allocator to deal with these bindings up front,
> and take them into consideration early in the scheduling/allocating
> process.

The allocator is less stressed here by varied mems_allowed settings
than is the scheduler.  For in 99+% of the cases, the allocator is
dealing with a zonelist that has the local (currently executing) node
first on the zonelist, and is dealing with a mems_allowed that allows
allocation on the local node.  So the allocator almost always succeeds
the first time it goes to see if the candidate page it has in hand
comes from a node allowed in current->mems_allowed.
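
Sketched in 2.6-era kernel terms (illustrative only; the helper name
and the exact structure layout here are assumptions, not necessarily
what the cpuset patch itself does), that check amounts to something
like:

/*
 * Illustrative sketch: walk the zonelist and skip zones whose node
 * is not set in current->mems_allowed.  Because the local node is
 * normally first in the zonelist and normally allowed, the very
 * first iteration almost always succeeds.
 */
static struct page *alloc_from_allowed_nodes(struct zonelist *zonelist,
                                             unsigned int gfp_mask,
                                             unsigned int order)
{
        struct zone **z;
        struct page *page;

        for (z = zonelist->zones; *z != NULL; z++) {
                int nid = (*z)->zone_pgdat->node_id;

                if (!node_isset(nid, current->mems_allowed))
                        continue;       /* node outside this task's cpuset */

                page = buffered_rmqueue(*z, order, gfp_mask);
                if (page)
                        return page;
        }
        return NULL;
}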

-- 
                          I won't rest till it's the best ...
                          Programmer, Linux Scalability
                          Paul Jackson <pj@sgi.com> 1.650.933.1373

^ permalink raw reply	[flat|nested] 233+ messages in thread

* Re: [ckrm-tech] Re: [Lse-tech] [PATCH] cpusets - big numa cpu and memory placement
  2004-10-06 23:12                                               ` Matthew Dobson
@ 2004-10-07  8:59                                                 ` Paul Jackson
  0 siblings, 0 replies; 233+ messages in thread
From: Paul Jackson @ 2004-10-07  8:59 UTC (permalink / raw)
  To: colpatch
  Cc: Simon.Derr, mbligh, pwil3058, frankeh, dipankar, akpm, ckrm-tech,
	efocht, lse-tech, hch, steiner, jbarnes, sylvain.jeaugey, djh,
	linux-kernel, ak, sivanich

Matthew wrote:
> > Perhaps these flags should be called:
> > 	mems_exclusive_precursor
> > 	cpus_exclusive_precursor
> > ;).
> 
> Ok...  So if we could offer the 'real' exclusion that the PBS and LSF
> workload managers offer directly, would that suffice?  Meaning, could we
> make PBS and LSF work on top of in-kernel mechanisms that offer 'real'
> exclusion.  'Real' exclusion defined as isolated groups of CPUs and
> memory that the kernel can guarantee will not run other processes?  That
> way we can get the job done without having to rely on these external
> workload managers, and be able to offer this dynamic partitioning to all
> users.  Thoughts?


I agree entirely.  Before when I was being a penny pincher about
how much went in the kernel, it might have made sense to have
the mems_exclusive and cpus_exclusive precursor flags.

But now that we have demonstrated a bona fide need for a really
really exclusive cpuset, it was silly of me to consider offering:

> > 	mems_exclusive_precursor
> > 	cpus_exclusive_precursor
> >     really_really_exclusive

These multiple flavors just confuse and annoy.

You're right.  Just one flag option, for the really exclusive cpuset,
is required here.

A different scheduler domain (whether same scheduler with awareness of
the boundaries, or something more substantially distinct) may only be
attached to a cpuset if it is exclusive.

-- 
                          I won't rest till it's the best ...
                          Programmer, Linux Scalability
                          Paul Jackson <pj@sgi.com> 1.650.933.1373

^ permalink raw reply	[flat|nested] 233+ messages in thread

* Re: [ckrm-tech] Re: [Lse-tech] [PATCH] cpusets - big numa cpu and memory placement
  2004-10-06 23:21                                   ` Matthew Dobson
@ 2004-10-07  9:41                                     ` Paul Jackson
  0 siblings, 0 replies; 233+ messages in thread
From: Paul Jackson @ 2004-10-07  9:41 UTC (permalink / raw)
  To: colpatch
  Cc: mbligh, pwil3058, frankeh, dipankar, akpm, ckrm-tech, efocht,
	lse-tech, hch, steiner, jbarnes, sylvain.jeaugey, djh,
	linux-kernel, Simon.Derr, ak, sivanich

Matt wrote:
> I'm really glad to hear that, Paul.  That unconstrained (ab)use was my
> only real concern with the cpusets patches.  I look forward to massaging
> our two approaches into something that will satisfy all interested
> parties.

Sounds good.

-- 
                          I won't rest till it's the best ...
                          Programmer, Linux Scalability
                          Paul Jackson <pj@sgi.com> 1.650.933.1373

^ permalink raw reply	[flat|nested] 233+ messages in thread

* Re: [Lse-tech] [PATCH] cpusets - big numa cpu and memory placement
  2004-10-07  8:51                                                     ` Paul Jackson
@ 2004-10-07 10:53                                                       ` Rick Lindsley
  2004-10-07 14:41                                                         ` Martin J. Bligh
       [not found]                                                         ` <20041007072842.2bafc320.pj@sgi.com>
  2004-10-07 12:47                                                       ` [Lse-tech] " Simon Derr
  2004-10-08 23:48                                                       ` Matthew Dobson
  2 siblings, 2 replies; 233+ messages in thread
From: Rick Lindsley @ 2004-10-07 10:53 UTC (permalink / raw)
  To: Paul Jackson
  Cc: colpatch, mbligh, Simon.Derr, pwil3058, frankeh, dipankar, akpm,
	ckrm-tech, efocht, lse-tech, hch, steiner, jbarnes,
	sylvain.jeaugey, djh, linux-kernel, ak, sivanich

    > I don't see what non-exclusive cpusets buys us.
    
    One can nest them, overlap them, and duplicate them ;)
    
    For example, we could do the following:

Once you have the exclusive set in your example, wouldn't the existing
functionality of CKRM provide you all the functionality the other
non-exclusive sets require?

Seems to me, we need a way to *restrict use* of certain resources
(exclusive) and a way to *share use* of certain resources (non-exclusive).
CKRM does the latter right now, I believe, but not the former. (Does
CKRM support sharing hierarchies as in the dept/group/individual example
you used?)

What about this model:

    * All exclusive sets exist at the "top level" (non-overlapping,
      non-hierarchical) and each is represented by a separate sched_domain
      hierarchy suitable for the hardware used to create the cpuset.
      I can't imagine anything more than an academic use for nested
      exclusive sets.

    * All non-exclusive sets are rooted at the "top level" but may
      subdivide their range as needed in a tree fashion (multiple levels
      if desired).  Right now I believe this functionality could be
      provided by CKRM.

Observations:

    * There is no current mechanism to create exclusive sets; cpus_allowed
      alone won't cut it.  A combination of Matt's patch plus Paul's
      code could probably resolve this.

    * There is no clear policy on how to amiably create an exclusive set.
      The main problem is what to do with the tasks already there.
      I'd suggest they get forcibly moved.  If their current cpus_allowed
      mask does not allow them to move, then if they are a user process
      they are killed.  If they are a system process and cannot be
      moved, they stay and gain squatter's rights in the newly created
      exclusive set.

    * Interrupts are not under consideration right now. They land where
      they land, and this may affect exclusive sets.  If this is a
      problem, for now, you simply lay out your hardware and exclusive
      sets more intelligently.

    * Memory allocation has a tendency and preference, but no hard policy
      with regards to where it comes from.  A task which starts on one
      part of the system but moves to another may have all its memory
      allocated relatively far away.  In unusual cases, it may acquire
      remote memory because that's all that's left.  A memory allocation
      policy similar to cpus_allowed might be needed. (Martin?)

    * If we provide a means for creating exclusive sets, I haven't heard
      a good reason why CKRM can't manage this.

Rick

^ permalink raw reply	[flat|nested] 233+ messages in thread

* Re: [Lse-tech] [PATCH] cpusets - big numa cpu and memory placement
  2004-10-07  8:51                                                     ` Paul Jackson
  2004-10-07 10:53                                                       ` Rick Lindsley
@ 2004-10-07 12:47                                                       ` Simon Derr
  2004-10-07 14:49                                                         ` Martin J. Bligh
  2004-10-08 23:48                                                       ` Matthew Dobson
  2 siblings, 1 reply; 233+ messages in thread
From: Simon Derr @ 2004-10-07 12:47 UTC (permalink / raw)
  To: Paul Jackson
  Cc: colpatch, mbligh, Simon.Derr, pwil3058, frankeh, dipankar, akpm,
	ckrm-tech, efocht, lse-tech, hch, steiner, jbarnes,
	sylvain.jeaugey, djh, linux-kernel, ak, sivanich

On Thu, 7 Oct 2004, Paul Jackson wrote:

> > I don't see what non-exclusive cpusets buys us.
> 
> One can nest them, overlap them, and duplicate them ;)

I would also add, if the decision is made to have 'really exclusive'
cpusets, my previous example as a use for non-exclusive cpusets:

we are running jobs that need to be 'mostly' isolated on some part of
the system, and run in a specific location.  We use cpusets for that.
But we can't afford to dedicate a part of the system to administrative
tasks (daemons, init..).  These tasks should not be put inside one of
the 'exclusive' cpusets, even temporarily: they do not belong there.
They should just be allowed to steal a few cpu cycles from time to
time: non-exclusive cpusets are the way to go.

^ permalink raw reply	[flat|nested] 233+ messages in thread

* Re: [Lse-tech] [PATCH] cpusets - big numa cpu and memory placement
  2004-10-07 10:53                                                       ` Rick Lindsley
@ 2004-10-07 14:41                                                         ` Martin J. Bligh
       [not found]                                                         ` <20041007072842.2bafc320.pj@sgi.com>
  1 sibling, 0 replies; 233+ messages in thread
From: Martin J. Bligh @ 2004-10-07 14:41 UTC (permalink / raw)
  To: Rick Lindsley, Paul Jackson
  Cc: colpatch, Simon.Derr, pwil3058, frankeh, dipankar, akpm,
	ckrm-tech, efocht, lse-tech, hch, steiner, jbarnes,
	sylvain.jeaugey, djh, linux-kernel, ak, sivanich

>     * Interrupts are not under consideration right now. They land where
>       they land, and this may affect exclusive sets.  If this is a
>       problem, for now, you simply lay out your hardware and exclusive
>       sets more intelligently.

They're easy to fix, just poke the values in /proc appropriately (same
as cpus_allowed, exactly).
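
For instance (a user-space sketch, assuming the usual
/proc/irq/<N>/smp_affinity hex-mask files; the IRQ number 42 and the
mask 0xf are made up for illustration):

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

/* Confine IRQ 42 to CPUs 0-3 by writing the hex mask 0xf. */
int main(void)
{
        const char *mask = "f\n";
        int fd = open("/proc/irq/42/smp_affinity", O_WRONLY);

        if (fd < 0) {
                perror("open /proc/irq/42/smp_affinity");
                return 1;
        }
        if (write(fd, mask, strlen(mask)) < 0)
                perror("write");
        close(fd);
        return 0;
}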
 
>     * Memory allocation has a tendency and preference, but no hard policy
>       with regards to where it comes from.  A task which starts on one
>       part of the system but moves to another may have all its memory
>       allocated relatively far away.  In unusual cases, it may acquire
>       remote memory because that's all that's left.  A memory allocation
>       policy similar to cpus_allowed might be needed. (Martin?)

The membind API already does this.
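
(The "membind API" being the NUMA memory policy interface -
mbind/set_mempolicy - usually reached through libnuma.  A minimal
sketch, assuming libnuma is installed and the program is linked with
-lnuma; node 1 and the 16 MB size are arbitrary:)

#include <numa.h>
#include <stdio.h>

int main(void)
{
        size_t len = 16 * 1024 * 1024;
        void *buf;

        if (numa_available() < 0) {
                fprintf(stderr, "no NUMA support\n");
                return 1;
        }

        /* Allocate 16 MB whose pages must come from node 1. */
        buf = numa_alloc_onnode(len, 1);
        if (!buf) {
                fprintf(stderr, "allocation on node 1 failed\n");
                return 1;
        }

        /* ... use buf ... */

        numa_free(buf, len);
        return 0;
}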

M.

^ permalink raw reply	[flat|nested] 233+ messages in thread

* Re: [Lse-tech] [PATCH] cpusets - big numa cpu and memory placement
  2004-10-07 12:47                                                       ` [Lse-tech] " Simon Derr
@ 2004-10-07 14:49                                                         ` Martin J. Bligh
  2004-10-07 17:54                                                           ` Paul Jackson
  2004-10-10  5:12                                                           ` [ckrm-tech] " Paul Jackson
  0 siblings, 2 replies; 233+ messages in thread
From: Martin J. Bligh @ 2004-10-07 14:49 UTC (permalink / raw)
  To: Simon Derr, Paul Jackson
  Cc: colpatch, pwil3058, frankeh, dipankar, akpm, ckrm-tech, efocht,
	lse-tech, hch, steiner, jbarnes, sylvain.jeaugey, djh,
	linux-kernel, ak, sivanich

> On Thu, 7 Oct 2004, Paul Jackson wrote:
> 
>> > I don't see what non-exclusive cpusets buys us.
>> 
>> One can nest them, overlap them, and duplicate them ;)
> 
> I would also add, if the decision is made to have 'really exclusive'
> cpusets, my previous example as a use for non-exclusive cpusets:
> 
> we are running jobs that need to be 'mostly' isolated on some part of
> the system, and run in a specific location.  We use cpusets for that.
> But we can't afford to dedicate a part of the system to administrative
> tasks (daemons, init..).  These tasks should not be put inside one of
> the 'exclusive' cpusets, even temporarily: they do not belong there.
> They should just be allowed to steal a few cpu cycles from time to
> time: non-exclusive cpusets are the way to go.

That makes no sense to me whatsoever, I'm afraid. Why if they were allowed
"to steal a few cycles" are they so fervently banned from being in there?
You can keep them out of your userspace management part if you want.

So we have the purely exclusive stuff, which needs kernel support in the form
of sched_domains alterations. The rest of cpusets is just poking and prodding
at cpus_allowed, the membind API, and the irq binding stuff. All of which
you could do from userspace, without any further kernel support, right?
Or am I missing something?

M.



^ permalink raw reply	[flat|nested] 233+ messages in thread

* Re: [Lse-tech] [PATCH] cpusets - big numa cpu and memory placement
  2004-10-07 14:49                                                         ` Martin J. Bligh
@ 2004-10-07 17:54                                                           ` Paul Jackson
  2004-10-07 18:13                                                             ` Martin J. Bligh
                                                                               ` (2 more replies)
  2004-10-10  5:12                                                           ` [ckrm-tech] " Paul Jackson
  1 sibling, 3 replies; 233+ messages in thread
From: Paul Jackson @ 2004-10-07 17:54 UTC (permalink / raw)
  To: Martin J. Bligh
  Cc: Simon.Derr, colpatch, pwil3058, frankeh, dipankar, akpm,
	ckrm-tech, efocht, lse-tech, hch, steiner, jbarnes,
	sylvain.jeaugey, djh, linux-kernel, ak, sivanich

Martin wrote:
> 
> So we have the purely exclusive stuff, which needs kernel support in the form
> of sched_domains alterations. The rest of cpusets is just poking and prodding
> at cpus_allowed, the membind API, and the irq binding stuff. All of which
> you could do from userspace, without any further kernel support, right?
> Or am I missing something?

Well ... we're gaining.  A couple of days ago you were suggesting
that cpusets could be replaced with some exclusive domains plus
CKRM.

Now it's some exclusive domains plus poking the affinity masks.

Yes - you're still missing something.

But I must keep in mind that I had concluded, perhaps three years ago,
just what you conclude now: that cpusets is just poking some affinity
masks, and that I could do most of it from user land.  The result ended
up missing some important capabilities.  User level code could not
manage collections of hardware nodes (sets of CPUs and Memory Nodes) in
a co-ordinated and controlled manner.

The users of cpusets need to have system wide names for them, with
permissions for viewing, modifying and attaching to them, and with the
ability to list both what hardware (CPUs and Memory) is in a cpuset, and
what tasks are attached to a cpuset.  As is usual in such operating
systems, the kernel manages such system wide synchronized controlled
access views.

As I quote below, I've been saying this repeatedly.  Could you
tell me, Martin, whether the disconnect is:
 1) that you didn't yet realize that cpusets provided this model (names,
    permissions, ...) or
 2) you don't think such a model is useful, or
 3) you think that such a model can be provided sensibly from user space?

If I knew this, I could focus my response better.

The rest of this message is just quotes from this last week - many
can stop reading here.

===

Date: Fri, 1 Oct 2004 23:06:44 -0700
From: Paul Jackson <pj@sgi.com>

Even the flat model (no hierarchy) uses require some way to
name and control access to cpusets, with distinct permissions
for examining, attaching to, and changing them, that can be
used and managed on a system wide basis.

===

Date: Sat, 2 Oct 2004 12:14:30 -0700
From: Paul Jackson <pj@sgi.com>

And our customers _do_ want to manage these logically isolated
chunks as named "virtual computers" with system managed permissions
and integrity (such as the system-wide attribute of "Exclusive"
ownership of a CPU or Memory by one cpuset, and a robust ability
to list all tasks currently in a cpuset).

===

Date: Sat, 2 Oct 2004 19:26:03 -0700
From: Paul Jackson <pj@sgi.com>

Consider the following use case scenario, which emphasizes this
isolation aspect (and ignores other requirements, such as the need for
system admins to manage cpusets by name [some handle valid across
process contexts], with a system wide imposed permission model and
exclusive use guarantees, and with a well defined system supported
notion of which tasks are "in" which cpuset at any point in time).

===

Date: Sun, 3 Oct 2004 18:41:24 -0700
From: Paul Jackson <pj@sgi.com>

SGI makes heavy and critical use of the cpuset facilities on both Irix
and Linux that have been developed since pset.  These facilities handle
both cpu and memory placement, and provide the essential kernel support
(names and permissions and operations to query, modify and attach) for a
system wide administrative interface for managing the resulting sets of
CPUs and Memory Nodes.

===

Date: Tue, 5 Oct 2004 02:17:36 -0700
From: Paul Jackson <pj@sgi.com>
To: "Martin J. Bligh" <mbligh@aracnet.com>

The /dev/cpuset pseudo file system api was chosen because it was
convenient for small scale work, learning and experimentation, because
it was a natural for the hierarchical name space with permissions that I
required, and because it was convenient to leverage existing vfs
structure in the kernel.

-- 
                          I won't rest till it's the best ...
                          Programmer, Linux Scalability
                          Paul Jackson <pj@sgi.com> 1.650.933.1373

^ permalink raw reply	[flat|nested] 233+ messages in thread

* Re: [Lse-tech] [PATCH] cpusets - big numa cpu and memory placement
  2004-10-07 17:54                                                           ` Paul Jackson
@ 2004-10-07 18:13                                                             ` Martin J. Bligh
  2004-10-08  9:23                                                               ` Erich Focht
  2004-10-14 10:35                                                               ` Eric W. Biederman
  2004-10-07 18:25                                                             ` Andrew Morton
  2004-10-07 19:16                                                             ` Rick Lindsley
  2 siblings, 2 replies; 233+ messages in thread
From: Martin J. Bligh @ 2004-10-07 18:13 UTC (permalink / raw)
  To: Paul Jackson
  Cc: Simon.Derr, colpatch, pwil3058, frankeh, dipankar, akpm,
	ckrm-tech, efocht, lse-tech, hch, steiner, jbarnes,
	sylvain.jeaugey, djh, linux-kernel, ak, sivanich

>> So we have the purely exclusive stuff, which needs kernel support in the form
>> of sched_domains alterations. The rest of cpusets is just poking and prodding
>> at cpus_allowed, the membind API, and the irq binding stuff. All of which
>> you could do from userspace, without any further kernel support, right?
>> Or am I missing something?
> 
> Well ... we're gaining.  A couple of days ago you were suggesting
> that cpusets could be replaced with some exclusive domains plus
> CKRM.
> 
> Now it's some exclusive domains plus poking the affinity masks.
> 
> Yes - you're still missing something.
> 
> But I must keep in mind that I had concluded, perhaps three years ago,
> just what you conclude now: that cpusets is just poking some affinity
> masks, and that I could do most of it from user land.  The result ended
> up missing some important capabilities.  User level code could not
> manage collections of hardware nodes (sets of CPUs and Memory Nodes) in
> a co-ordinated and controlled manner.
> 
> The users of cpusets need to have system wide names for them, with
> permissions for viewing, modifying and attaching to them, and with the
> ability to list both what hardware (CPUs and Memory) is in a cpuset, and
> what tasks are attached to a cpuset.  As is usual in such operating
> systems, the kernel manages such system wide synchronized controlled
> access views.
> 
> As I quote below, I've been saying this repeatedly.  Could you
> tell me, Martin, whether the disconnect is:
>  1) that you didn't yet realize that cpusets provided this model (names,
>     permissions, ...) or
>  2) you don't think such a model is useful, or
>  3) you think that such a model can be provided sensibly from user space?
> 
> If I knew this, I could focus my response better.
> 
> The rest of this message is just quotes from this last week - many
> can stop reading here.

My main problem is that I don't think we want lots of overlapping complex 
interfaces in the kernel. Plus I think some of the stuff proposed is fairly 
klunky as an interface (physical binding where it's mostly not needed, and
yes I sort of see your point about keeping jobs on separate CPUs, though I
still think it's tenuous), and makes heavy use of stuff that doesn't work 
well (e.g. cpus_allowed). So I'm searching for various ways to address that.

The purely exclusive parts of cpusets can be implemented in a much nicer
manner inside the kernel, by messing with sched_domains, instead of just
using cpus_allowed as a mechanism ... so that seems like much less of a
problem.

The non-exclusive bits seem to overlap heavily with both CKRM and what
could be done in userspace. I still think the physical stuff is rather
obscure, and binding stuff to specific CPUs is an ugly way to say "I want
these two threads to not run on the same CPU". But if we can find some
other way (eg userspace) to allow you to do that should you utterly insist
on doing so, that'd be a convenient way out.

As for the names and permissions issue, both would be *doable* from 
userspace, though maybe not as easily as in-kernel. Names would probably 
be less hassle than permissions, but neither would be impossible, it seems.

It all just seems like a lot of complexity for a fairly obscure set of
requirements for a very limited group of users, to be honest. Some bits
(eg partitioning system resources hard in exclusive sets) would seem likely
to be used by a much broader audience, and thus are rather more attractive.
But they could probably be done with a much simpler interface than the whole
cpusets (BTW, did that still sit on top of PAGG as well, or is that long
gone?)

M.


^ permalink raw reply	[flat|nested] 233+ messages in thread

* Re: [Lse-tech] [PATCH] cpusets - big numa cpu and memory placement
  2004-10-07 17:54                                                           ` Paul Jackson
  2004-10-07 18:13                                                             ` Martin J. Bligh
@ 2004-10-07 18:25                                                             ` Andrew Morton
  2004-10-07 19:52                                                               ` Paul Jackson
  2004-10-10  3:22                                                               ` Paul Jackson
  2004-10-07 19:16                                                             ` Rick Lindsley
  2 siblings, 2 replies; 233+ messages in thread
From: Andrew Morton @ 2004-10-07 18:25 UTC (permalink / raw)
  To: Paul Jackson
  Cc: mbligh, Simon.Derr, colpatch, pwil3058, frankeh, dipankar,
	ckrm-tech, efocht, lse-tech, hch, steiner, jbarnes,
	sylvain.jeaugey, djh, linux-kernel, ak, sivanich

Paul Jackson <pj@sgi.com> wrote:
>
>  3) you think that such a model can be provided sensibly from user space?

As you say, it's a matter of coordinated poking at cpus_allowed.  I'd be
interested to know why this all cannot be done by a userspace daemon/server
thing.

^ permalink raw reply	[flat|nested] 233+ messages in thread

* Re: [Lse-tech] [PATCH] cpusets - big numa cpu and memory placement
  2004-10-07  0:16                                                       ` Rick Lindsley
@ 2004-10-07 18:27                                                         ` Paul Jackson
  0 siblings, 0 replies; 233+ messages in thread
From: Paul Jackson @ 2004-10-07 18:27 UTC (permalink / raw)
  To: Rick Lindsley
  Cc: pwil3058, colpatch, mbligh, Simon.Derr, frankeh, dipankar, akpm,
	ckrm-tech, efocht, lse-tech, hch, steiner, jbarnes,
	sylvain.jeaugey, djh, linux-kernel, ak, sivanich

Rick wrote:
> 
> Two concrete examples for cpusets stick in my mind:
> 
>     * the department that has been given 16 cpus of a 128 cpu machine,
>       is free to do what they want with them, and doesn't much care
>       specifically how they're laid out. Think general timeshare.
> 
>     * the department that has been given 16 cpus of a 128 cpu machine
>       to run a finely tuned application which expects and needs everybody
>       to stay off those cpus. Think compute-intensive.
> 
> Correct me if I'm wrong, but CKRM can handle the first, but cannot
> currently handle the second.

Even the first scenario is not well handled by CKRM, in my view, for
most workloads.  On a 128 cpu system, if you want 16 cpus of compute
power, you are much better off having that power on 16 specific cpus,
rather than getting 12.5% of each of the 128 cpus, unless your workload
has a very low cache footprint.

I think of it like this.  Long ago, I learned to consider performance
for many of the applications I wrote in terms of how many disk accesses
I needed, for the disk was a thousand times slower than the processor
and dominated performance across a broad scale.

The gap between the speed of interior cpu cycles and external ram
access across a bus or three is approaching the processor to disk
gap of old.  A complex hierarchy of caches has grown up, within and
surrounding each processor, in an effort to ameliorate this gap.

The dreaded disk seek of old is now the cache line miss of today.

Look at the advertisements for compute power for hire in the magazines.
I can rent a decent small computer, with web access and offsite backup,
in an air conditioned room with UPS and 24/7 administration for under
$100/month. These advertisements never sell me 12.5% of the cycles on
each of the 128 cpus in a large server.  They show pictures of some nice
little rack machine -- that can be all mine, for just $79/month.  Sign
up now with our online web server and be using your system in minutes.

[ hmmm ... wonder how many spam filters I hit on that last paragraph ... ]

-- 
                          I won't rest till it's the best ...
                          Programmer, Linux Scalability
                          Paul Jackson <pj@sgi.com> 1.650.933.1373

^ permalink raw reply	[flat|nested] 233+ messages in thread

* Re: [Lse-tech] [PATCH] cpusets - big numa cpu and memory placement
       [not found]                                                         ` <20041007072842.2bafc320.pj@sgi.com>
@ 2004-10-07 19:05                                                           ` Rick Lindsley
  2004-10-10  2:15                                                             ` [ckrm-tech] " Paul Jackson
  2004-10-10  2:28                                                             ` Paul Jackson
  2004-10-09  0:06                                                           ` Matthew Dobson
       [not found]                                                           ` <4165A31E.4070905@watson.ibm.com>
  2 siblings, 2 replies; 233+ messages in thread
From: Rick Lindsley @ 2004-10-07 19:05 UTC (permalink / raw)
  To: Paul Jackson
  Cc: colpatch, mbligh, Simon.Derr, pwil3058, frankeh, dipankar, akpm,
	ckrm-tech, efocht, lse-tech, hch, steiner, jbarnes,
	sylvain.jeaugey, djh, linux-kernel, ak, sivanich

    > Once you have the exclusive set in your example, wouldn't the existing
    > functionality of CKRM provide you all the functionality the other
    > non-exclusive sets require?
    > 
    > Seems to me, we need a way to *restrict use* of certain resources
    > (exclusive) and a way to *share use* of certain resources (non-exclusive.)
    > CKRM does the latter right now, I believe, but not the former.
    
    
    I'm losing you right at the top here, Rick.  Sorry.

    I'm no CKRM wizard, so tell me if I'm wrong.

    But doesn't CKRM provide a way to control what percentage of the
    compute cycles are available from a pool of cycles?

    And don't cpusets provide a way to control which physical CPUs a
    task can or cannot use?

Right.

And what I'm hearing is that if you're a job running in a set of shared
resources (i.e., non-exclusive) then by definition you are *not* a job
who cares about which processor you run on.  I can't think of a situation
where I'd care about the physical locality, and the proximity of memory
and other nodes, but NOT care that other tasks might steal my cycles.

    For parallel threaded apps with rapid synchronization between the
    threads, as one gets with say OpenMP or MPI, there's a world of
    difference. Giving both threads in a 2-way application of this kind
    50% of the cycles on each of 2 processors can be an order of magnitude
    slower than giving each thread 100% of one processor.  Similarly, the
    variability of runtimes for such threads pinned on distinct processors
    can be an order of magnitude less than for floating threads.

Ah, so you want processor affinity for the tasks, then, not cpusets.

    For shared resource environments where one is purchasing time
    on your own computer, there's also world of difference. In many
    cases one has paid (whether in real money to another company, or in
    inter-departmental funny money - doesn't matter a whole lot here)
    money for certain processor power, and darn well expects those
    processors to sit idle if you don't use them.

One does?  No, in my world, there's constant auditing going on and if
you can get away with having a machine idle, power to ya, but chances
are somebody's going to come and take away at least the cycles and maybe
the whole machine for somebody yammering louder than you about their
budget cuts.  You get first cut, but if you're not using it, you don't
get to sit fat and happy.

    And the vendor (whether your ISP or your MIS department) of these
    resources can't hide the difference. Your work runs faster and with
    dramatically more consistent runtimes if the entire processor/memory
    units are yours, all yours, whether you use them or not.

When I'm not using them, my work doesn't run faster.  It just doesn't run.

    There is a fundamental difference between controlling which physical
    processors on an SMP or NUMA system one may use, and adding delays
    to the tasks of select users to ensure they don't use too much.

    In the experience of SGI, and I hear tell of other companies,
    workload management by fair share techniques (add delays to tasks
    exceeding their allotment) has been found to be dramatically less
    useful to customers,

Less useful than ... what?  As a substitute for exclusive access to
one or more cpus, which currently is not possible?  I can believe that.
But you're saying these companies didn't size their tasks properly to
the cpus they had allocated and yet didn't require exclusivity? How
would non-exclusive sets address this human failing?  You have 30 cpus'
worth of tasks to run on 24 cpus.  Somebody will take a hit, right,
whether CKRM or cpusets are managing those 24 cpus?

    >     * There is no clear policy on how to amiably create an exclusive set.
    >       The main problem is what to do with the tasks already there.

    There is a policy, that works well, and those of us in this
    business have been using for years.  When the system boots,
    you put everything that doesn't need to be pinned elsewhere in
    a bootcpuset, and leave the rest of the system dark.  You then,
    whether by manual administrative techniques or a batch scheduler,
    hand out dedicated sets of CPU and Memory to jobs, which get exclusive
    use of those compute resources (or controlled sharing with only what
    you intentionally let share).

This presumes you know, at boot time, how you want things divided.
All of your examples so far have seemed to indicate that policy changes
may well be made *after* boot time.  So I'll rephrase: any time you
create an exclusive set after boot time, you may find tasks already
running there.  I suggested one policy for dealing with them.

    The difference between cpusets and CKRM is not about restricting
    versus sharing.  Rather cpusets is about controlled allocation of big,
    named chunks of a computer - certain numbered CPUs and Memory Nodes
    allocated by number.  CKRM is about enforcing the rate of usage of
    anonymous, fungible resources such as cpu cycles and memory pages.

    Unfortunately for CKRM, on modern system architectures of two or more
    CPUs, cycles are not interchangeable and fungible, due to the caching.
    On NUMA systems, which is the norm for all vendors above 10 or 20 CPUs
    (due to our inability to make a backplane fast enough to handle more)
    memory pages are not interchangeable and fungible either.

CKRM is not going to merrily move tasks around just because it can,
either, and it will still adhere to common scheduling principles regarding
cache warmth and processor affinity.

You use the example of a two car family, and preferring one over the other.
I'd turn that around and say it's really two exclusive sets of one
car each, rather than a shared set of two cars.  In that example, do you
ask your wife before you take "her" car, or do just take it because it's
a shared resource?  I know how it works in *my* family :)

You've given a convincing argument for the exclusive side of things.
But my point is that on the non-exclusive side the features you claim
to need seem in confict: if the cpu/memory linkage is important to job
predictability, how can you then claim it's ok to share it with anybody,
even a "friendly" task?  If it's ok to share, then you've just thrown
predictability out the window.  The cpu/memory linkage is interesting,
but it won't drive the job performance anymore.

I'm trying to nail down requirements.  I think we've nailed down the
exclusive one.  It's real, and it's currently unmet.  The code you've
written looks to provide a good base upon which to meet that requirement.
On the non-exclusive side, I keep hearing conflicting information
about how layout is important for performance but it's ok to share with
arbitrary jobs -- like sharing won't affect performance?

Rick

^ permalink raw reply	[flat|nested] 233+ messages in thread

* Re: [Lse-tech] [PATCH] cpusets - big numa cpu and memory placement
  2004-10-07 17:54                                                           ` Paul Jackson
  2004-10-07 18:13                                                             ` Martin J. Bligh
  2004-10-07 18:25                                                             ` Andrew Morton
@ 2004-10-07 19:16                                                             ` Rick Lindsley
  2004-10-10  2:35                                                               ` Paul Jackson
  2 siblings, 1 reply; 233+ messages in thread
From: Rick Lindsley @ 2004-10-07 19:16 UTC (permalink / raw)
  To: Paul Jackson
  Cc: Martin J. Bligh, Simon.Derr, colpatch, pwil3058, frankeh,
	dipankar, akpm, ckrm-tech, efocht, lse-tech, hch, steiner,
	jbarnes, sylvain.jeaugey, djh, linux-kernel, ak, sivanich

    The users of cpusets need to have system wide names for them, with
    permissions for viewing, modifying and attaching to them, and with the
    ability to list both what hardware (CPUs and Memory) is in a cpuset, and
    what tasks are attached to a cpuset.  As is usual in such operating
    systems, the kernel manages such system wide synchronized controlled
    access views.

Well, you are *asserting* the kernel will manage this.  But doesn't
CKRM offer this capability?  The only thing it *can't* do is assure
exclusivity, today .. correct?

Rick

^ permalink raw reply	[flat|nested] 233+ messages in thread

* Re: [Lse-tech] [PATCH] cpusets - big numa cpu and memory placement
  2004-10-07 18:25                                                             ` Andrew Morton
@ 2004-10-07 19:52                                                               ` Paul Jackson
  2004-10-07 21:04                                                                 ` [ckrm-tech] " Matthew Helsley
  2004-10-10  3:22                                                               ` Paul Jackson
  1 sibling, 1 reply; 233+ messages in thread
From: Paul Jackson @ 2004-10-07 19:52 UTC (permalink / raw)
  To: Andrew Morton
  Cc: mbligh, Simon.Derr, colpatch, pwil3058, frankeh, dipankar,
	ckrm-tech, efocht, lse-tech, hch, steiner, jbarnes,
	sylvain.jeaugey, djh, linux-kernel, ak, sivanich

Andrew wrote:
> I'd be interested to know why this all cannot be done by a
> userspace daemon/server thing.

The biggest stumbling block was the binding of task to cpuset, the
task->cpuset pointer.  I doubt you would accept a patch to the kernel
that called out to my daemon on every fork and exit, to update this
binding.  We require a robust answer to the question of which tasks are
in a cpuset.  And the loop to read this back out, which scans each task
to see if it points to a particular cpuset, would be significantly less
atomic than it is now, if it had to be done, one task at a time, from
user space.
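
A rough sketch of the kind of binding being described, with
illustrative names (cpuset_fork/cpuset_exit, the struct layout, and
the assumed task_struct->cpuset member are mine for this example, not
necessarily what the actual patch uses):

/* Each task carries a pointer to the cpuset it is attached to. */
struct cpuset {
        atomic_t        count;          /* attached tasks and children */
        cpumask_t       cpus_allowed;
        nodemask_t      mems_allowed;
        /* ... name, flags, parent, ... */
};

/* Assumed: task_struct gains a 'struct cpuset *cpuset' member. */

/* Called from do_fork(): the child starts out in its parent's cpuset. */
void cpuset_fork(struct task_struct *child)
{
        child->cpuset = current->cpuset;
        atomic_inc(&child->cpuset->count);
}

/* Called from do_exit(): drop this task's reference. */
void cpuset_exit(struct task_struct *tsk)
{
        struct cpuset *cs = tsk->cpuset;

        tsk->cpuset = NULL;
        atomic_dec(&cs->count); /* real code would also handle final release */
}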

A second stumbling block, which perhaps you can recommend some way to
deal with, is permissions.  What's the recommended way for this daemon
to verify the authority of the requesting process?

Also the other means to poke the affinity masks, sched_setaffinity,
mbind and set_mempolicy, need to be constrained to respect cpuset
boundaries and honor exclusion.  I doubt you want them calling out to a
user daemon either.
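
In kernel terms that constraint is roughly a masking operation; a
sketch (illustrative only, assuming the task->cpuset pointer sketched
above):

/* Clamp a requested CPU affinity mask to the task's cpuset. */
static int clamp_to_cpuset(struct task_struct *p, cpumask_t *requested)
{
        cpumask_t result;

        cpus_and(result, *requested, p->cpuset->cpus_allowed);
        if (cpus_empty(result))
                return -EINVAL;         /* nothing left to run on */
        *requested = result;
        return 0;
}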

And the memory affinity mask, mems_allowed, seems to require updating
within the current task context.  Perhaps someone else is smart enough
to see an alternative, but I could not find a safe way to update this
from outside the current context.  So it's updated on the path going
into __alloc_pages().  I doubt you want a patch that calls out to my
daemon on each call into __alloc_pages().

We also need to begin correct placement earlier in the boot process
than when a user daemon could start.  It's important to get init
and the early shared libraries placed.  This part has reasons of
its own to be pre-init.  I am able to do this in user space today,
because the kernel has cpuset support, but I'd have to fold at
least this much back into the kernel otherwise.

And of course the hooks I added to __alloc_pages, to only allow
allocations from nodes in the task's mems_allowed, would still be needed,
in some form, just as the scheduler's already existing check for
cpus_allowed is needed, in some form (perhaps less blunt).

The hook in the sched code to offline a cpu needs to know what else is
allowed in a task's cpuset so it can honor the cpuset boundary, if
possible, when migrating the task off the departing cpu.  Would you want
this code calling out to a user daemon to determine what cpu to use
next?

The cpuset file system seems like an excellent way to present a system
wide hierarchical name space.  I guess that this could be done as a
mount handled by my user space daemon, but using vfs for this sure
seemed sweet at the time.

There's a Linus quote I'm trying to remember ... something about while
kernels have an important role in providing hardware access, their
biggest job is in providing a coherent view of system wide resources. 
Does this ring a bell?  I haven't been able to recall enough of the
actual wording to google it.

-- 
                          I won't rest till it's the best ...
                          Programmer, Linux Scalability
                          Paul Jackson <pj@sgi.com> 1.650.933.1373

^ permalink raw reply	[flat|nested] 233+ messages in thread

* Re: [ckrm-tech] Re: [Lse-tech] [PATCH] cpusets - big numa cpu and memory placement
  2004-10-07 19:52                                                               ` Paul Jackson
@ 2004-10-07 21:04                                                                 ` Matthew Helsley
  0 siblings, 0 replies; 233+ messages in thread
From: Matthew Helsley @ 2004-10-07 21:04 UTC (permalink / raw)
  To: Paul Jackson; +Cc: CKRM-Tech

On Thu, 2004-10-07 at 12:52, Paul Jackson wrote:
<snip>
> Also the other means to poke the affinity masks, sched_setaffinity,
> mbind and set_mempolicy, need to be constrained to respect cpuset
> boundaries and honor exclusion.  I doubt you want them calling out to a
> user daemon either.
> 
> And the memory affinity mask, mems_allowed, seems to require updating
> within the current task context.  Perhaps someone else is smart enough
> to see an alternative, but I could not find a safe way to update this
> from outside the current context.  So it's updated on the path going
> into __alloc_pages().  I doubt you want a patch that calls out to my
> daemon on each call into __alloc_pages().
<snip>

	Just a thought: could a system-wide ld preload of some form be useful
here? You could use preload to add wrappers around the necessary calls
(you'd probably want to do this in /etc/ld.so.preload). Then have those
wrappers communicate with a daemon or open some /etc config files that
describe the topology you wish to enforce.
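
A minimal sketch of such a shim, wrapping sched_setaffinity() and
clamping the requested mask before handing it on.  The cpu_set_t
prototype is the one assumed here, and the hard-coded CPUs 0-3 stand
in for whatever policy a daemon or config file would actually supply:

#define _GNU_SOURCE
#include <dlfcn.h>
#include <sched.h>

/* Remove any CPUs outside the (illustrative) allowed set 0-3. */
static void clamp_to_policy(cpu_set_t *mask)
{
        cpu_set_t allowed;
        int cpu;

        CPU_ZERO(&allowed);
        for (cpu = 0; cpu < 4; cpu++)
                CPU_SET(cpu, &allowed);

        for (cpu = 0; cpu < CPU_SETSIZE; cpu++)
                if (CPU_ISSET(cpu, mask) && !CPU_ISSET(cpu, &allowed))
                        CPU_CLR(cpu, mask);
}

int sched_setaffinity(pid_t pid, size_t len, const cpu_set_t *mask)
{
        static int (*real_setaffinity)(pid_t, size_t, const cpu_set_t *);
        cpu_set_t clamped = *mask;

        if (!real_setaffinity)
                real_setaffinity = (int (*)(pid_t, size_t, const cpu_set_t *))
                        dlsym(RTLD_NEXT, "sched_setaffinity");

        clamp_to_policy(&clamped);
        return real_setaffinity(pid, len, &clamped);
}

Built with something like "gcc -shared -fPIC -o cpuset-shim.so shim.c -ldl"
and listed in /etc/ld.so.preload.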

Cheers,
	-Matt Helsley


^ permalink raw reply	[flat|nested] 233+ messages in thread

* Re: [Lse-tech] [PATCH] cpusets - big numa cpu and memory placement
  2004-10-07 18:13                                                             ` Martin J. Bligh
@ 2004-10-08  9:23                                                               ` Erich Focht
  2004-10-08  9:50                                                                 ` Andrew Morton
                                                                                   ` (2 more replies)
  2004-10-14 10:35                                                               ` Eric W. Biederman
  1 sibling, 3 replies; 233+ messages in thread
From: Erich Focht @ 2004-10-08  9:23 UTC (permalink / raw)
  To: Martin J. Bligh
  Cc: Paul Jackson, Simon.Derr, colpatch, pwil3058, frankeh, dipankar,
	akpm, ckrm-tech, lse-tech, hch, steiner, jbarnes,
	sylvain.jeaugey, djh, linux-kernel, ak, sivanich

On Thursday 07 October 2004 20:13, Martin J. Bligh wrote:
> It all just seems like a lot of complexity for a fairly obscure set of
> requirements for a very limited group of users, to be honest. Some bits
> (eg partitioning system resources hard in exclusive sets) would seem likely
> to be used by a much broader audience, and thus are rather more attractive.

May I translate the first sentence to: the requirements and usage
models described by Paul (SGI), Simon (Bull) and myself (NEC) are
"fairly obscure" and the group of users addressed (those mainly
running high performance computing (AKA HPC) applications) is "very
limited"? If this is what you want to say then it's you whose view is
very limited. Maybe I'm wrong with what you really wanted to say but I
remember similar arguing from your side when discussing benchmark
results in the context of the node affine scheduler.

This "very limited group of users" (small part of them listed in
www.top500.org) is who drives computer technology, processor design,
network interconnect technology forward since the 1950s. Their
requirements on the operating system are rather limited and that might
be the reason why kernel developers tend to ignore them. All that
counts for HPC is measured in GigaFLOPS or TeraFLOPS, not in elapsed
seconds for a kernel compile, AIM-7, Spec-SDET or Javabench. The way
of using these machines IS different from what YOU experience in day
by day work and Linux is not yet where it should be (though getting
close). Paul's endurance in this thread is certainly influenced by the
perspective of having to support soon a 20x512 CPU NUMA cluster at
NASA...

As a side note: put in the right context your statement on fairly
obscure requirements for a very limited group of users is a marketing
argument ... against IBM.

Thanks ;-)
Erich

--
Core Technology Group
NEC High Performance Computing Europe GmbH, EHPCTC


^ permalink raw reply	[flat|nested] 233+ messages in thread

* Re: [Lse-tech] [PATCH] cpusets - big numa cpu and memory placement
  2004-10-08  9:23                                                               ` Erich Focht
@ 2004-10-08  9:50                                                                 ` Andrew Morton
  2004-10-08 10:40                                                                   ` Erich Focht
  2004-10-08  9:53                                                                 ` Nick Piggin
  2004-10-08 14:24                                                                 ` Martin J. Bligh
  2 siblings, 1 reply; 233+ messages in thread
From: Andrew Morton @ 2004-10-08  9:50 UTC (permalink / raw)
  To: Erich Focht
  Cc: mbligh, pj, Simon.Derr, colpatch, pwil3058, frankeh, dipankar,
	ckrm-tech, lse-tech, hch, steiner, jbarnes, sylvain.jeaugey, djh,
	linux-kernel, ak, sivanich

Erich Focht <efocht@hpce.nec.com> wrote:
>
>  May I translate the first sentence to: the requirements and usage
>  models described by Paul (SGI), Simon (Bull) and myself (NEC) are
>  "fairly obscure" and the group of users addressed (those mainly
>  running high performance computing (AKA HPC) applications) is "very
>  limited"? If this is what you want to say then it's you whose view is
>  very limited.

Martin makes a legitimate point.  We're talking here about a few tens or
hundreds of machines world-wide, yes?  And those machines are very
high-value so it is a relatively small cost for their kernel providers to
add such a highly specialised patch as cpusets.

These are strong arguments for leaving cpusets as an out-of-kernel.org
patch, for those who need it.

On the other hand, the impact is small:

 25-akpm/fs/proc/base.c            |   19 
 25-akpm/include/linux/cpuset.h    |   63 +
 25-akpm/include/linux/sched.h     |    7 
 25-akpm/init/Kconfig              |   10 
 25-akpm/init/main.c               |    5 
 25-akpm/kernel/Makefile           |    1 
 25-akpm/kernel/cpuset.c           | 1550 ++++++++++++++++++++++++++++++++++++++
 25-akpm/kernel/exit.c             |    2 
 25-akpm/kernel/fork.c             |    3 
 25-akpm/kernel/sched.c            |    8 
 25-akpm/mm/mempolicy.c            |   13 
 25-akpm/mm/page_alloc.c           |   13 
 25-akpm/mm/vmscan.c               |   19 

So it's a quite cheap patch for the kernel.org people to carry.

So I'm (just) OK with it from that point of view.  My main concern is that
the CKRM framework ought to be able to accommodate the cpuset function,
dammit.  I don't want to see us growing two orthogonal resource management
systems partly because their respective backers have no incentive to make
their code work together.

I realise there are technical/architectural problems too, but I do fear
that there's a risk of we-don't-have-a-business-case happening here too.

I don't think there are any architectural concerns around cpusets - the
major design question here is "is CKRM up to doing this and if not, why
not?".  From what Hubertus has been saying CKRM _is_ up to the task, but
the cpuset team may decide that the amount of rework involved isn't
worthwhile and they're better off carrying an offstream patch.

But we're not there yet - we're still waiting for the design dust to
settle.


^ permalink raw reply	[flat|nested] 233+ messages in thread

* Re: [Lse-tech] [PATCH] cpusets - big numa cpu and memory placement
  2004-10-08  9:23                                                               ` Erich Focht
  2004-10-08  9:50                                                                 ` Andrew Morton
@ 2004-10-08  9:53                                                                 ` Nick Piggin
  2004-10-08 11:40                                                                   ` Erich Focht
  2004-10-08 14:24                                                                 ` Martin J. Bligh
  2 siblings, 1 reply; 233+ messages in thread
From: Nick Piggin @ 2004-10-08  9:53 UTC (permalink / raw)
  To: Erich Focht
  Cc: Martin J. Bligh, Paul Jackson, Simon.Derr, colpatch, pwil3058,
	frankeh, dipankar, akpm, ckrm-tech, lse-tech, hch, steiner,
	jbarnes, sylvain.jeaugey, djh, linux-kernel, ak, sivanich

Erich Focht wrote:
> On Thursday 07 October 2004 20:13, Martin J. Bligh wrote:
> 
>>It all just seems like a lot of complexity for a fairly obscure set of
>>requirements for a very limited group of users, to be honest. Some bits
>>(eg partitioning system resources hard in exclusive sets) would seem likely
>>to be used by a much broader audience, and thus are rather more attractive.
> 
> 
> May I translate the first sentence to: the requirements and usage
> models described by Paul (SGI), Simon (Bull) and myself (NEC) are
> "fairly obscure" and the group of users addressed (those mainly
> running high performance computing (AKA HPC) applications) is "very
> limited"? If this is what you want to say then it's you whose view is
> very limited. Maybe I'm wrong with what you really wanted to say but I
> remember similar arguing from your side when discussing benchmark
> results in the context of the node affine scheduler.
> 
> This "very limited group of users" (small part of them listed in
> www.top500.org) is who drives computer technology, processor design,
> network interconnect technology forward since the 1950s. Their
> requirements on the operating system are rather limited and that might
> be the reason why kernel developers tend to ignore them. All that
> counts for HPC is measured in GigaFLOPS or TeraFLOPS, not in elapsed
> seconds for a kernel compile, AIM-7, Spec-SDET or Javabench. The way
> of using these machines IS different from what YOU experience in day
> by day work and Linux is not yet where it should be (though getting
> close). Paul's endurance in this thread is certainly influenced by the
> perspective of having to support soon a 20x512 CPU NUMA cluster at
> NASA...
> 
> As a side note: put in the right context your statement on fairly
> obscure requirements for a very limited group of users is a marketing
> argument ... against IBM.
> 
> Thanks ;-)
> Erich
> 

With all due respect, Linux gets driven as much from the bottom up
as it does from the top down I think. Compared to desktop and small
servers, yes you are obscure :)

My view on it is this, we can do *exclusive* dynamic partitioning
today (we're very close to it - it wouldn't add complexity in the
scheduler to support it). You can also hack up a fair bit of other
functionality with cpu affinity masks.
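
For instance, a small launcher like the following (a sketch; the CPU
range 4-7 is arbitrary) pins itself and then execs the job, giving a
poor man's cpuset:

#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>
#include <unistd.h>

/* Pin ourselves to CPUs 4-7, then exec the real job. */
int main(int argc, char *argv[])
{
        cpu_set_t mask;
        int cpu;

        if (argc < 2) {
                fprintf(stderr, "usage: %s command [args...]\n", argv[0]);
                return 1;
        }

        CPU_ZERO(&mask);
        for (cpu = 4; cpu <= 7; cpu++)
                CPU_SET(cpu, &mask);

        if (sched_setaffinity(0, sizeof(mask), &mask) < 0) {
                perror("sched_setaffinity");
                return 1;
        }

        execvp(argv[1], &argv[1]);
        perror("execvp");
        return 1;
}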

So with any luck, that will hold you over until everyone working on
this can agree and produce a nice implementation that doesn't add
complexity to the normal case (or can be configured out), and then
pull it into the kernel.

^ permalink raw reply	[flat|nested] 233+ messages in thread

* Re: [Lse-tech] [PATCH] cpusets - big numa cpu and memory placement
  2004-10-08  9:50                                                                 ` Andrew Morton
@ 2004-10-08 10:40                                                                   ` Erich Focht
  2004-10-08 14:26                                                                     ` Martin J. Bligh
  0 siblings, 1 reply; 233+ messages in thread
From: Erich Focht @ 2004-10-08 10:40 UTC (permalink / raw)
  To: Andrew Morton
  Cc: mbligh, pj, Simon.Derr, colpatch, pwil3058, frankeh, dipankar,
	ckrm-tech, lse-tech, hch, steiner, jbarnes, sylvain.jeaugey, djh,
	linux-kernel, ak, sivanich

On Friday 08 October 2004 11:50, Andrew Morton wrote:
> So it's a quite cheap patch for the kernel.org people to carry.
> 
> So I'm (just) OK with it from that point of view.  My main concern is that
> the CKRM framework ought to be able to accommodate the cpuset function,
> dammit.  I don't want to see us growing two orthogonal resource management
> systems partly because their respective backers have no incentive to make
> their code work together.

I don't think cpusets needs to grow beyond what it contains now. The
discussion started as an API discussion. Cpusets requirements, current
API and usage models were clearly shown. According to Hubertus CKRM
will be able to deal with these and implement them in its own API. It
can't do so today. So why not wait for that? Having two APIs for the same
thing isn't unusual. Whether we switch from affinity to sched_domains
underneath isn't really the question.

> I realise there are technical/architectural problems too, but I do fear
> that there's a risk of we-don't-have-a-business-case happening here too.

ISVs are already using the current cpusets API. I think of resource
management systems like PBS (Altair), LSF (Platform Computing) plus
several providers of industrial simulation codes in the area of CAE
(computer aided engineering). I know examples from static and dynamic
mechanical stress analysis, fluid dynamics and electromagnetics
simulations. Financial simulation software could benefit from such
stuff, too, but I don't know of any example. Anyhow, I'd say we
already have a business case here. And instead of pushing ISVs to
support the SGI way of doing this, the Bull way and the NEC way, it
makes more sense to ask them to support the LINUX way.

> But we're not there yet - we're still waiting for the design dust to
> settle.

:-)

Regards,
Erich



^ permalink raw reply	[flat|nested] 233+ messages in thread

* Re: [Lse-tech] [PATCH] cpusets - big numa cpu and memory placement
  2004-10-08  9:53                                                                 ` Nick Piggin
@ 2004-10-08 11:40                                                                   ` Erich Focht
  0 siblings, 0 replies; 233+ messages in thread
From: Erich Focht @ 2004-10-08 11:40 UTC (permalink / raw)
  To: Nick Piggin
  Cc: Martin J. Bligh, Paul Jackson, Simon.Derr, colpatch, pwil3058,
	frankeh, dipankar, akpm, ckrm-tech, lse-tech, hch, steiner,
	jbarnes, sylvain.jeaugey, djh, linux-kernel, ak, sivanich

On Friday 08 October 2004 11:53, Nick Piggin wrote:
> Erich Focht wrote:
> > On Thursday 07 October 2004 20:13, Martin J. Bligh wrote:
> > 
> >>It all just seems like a lot of complexity for a fairly obscure set of
> >>requirements for a very limited group of users, to be honest. Some bits
> >>(eg partitioning system resources hard in exclusive sets) would seem likely
> >>to be used by a much broader audience, and thus are rather more attractive.
> > 
> > May I translate the first sentence to: the requirements and usage
> > models described by Paul (SGI), Simon (Bull) and myself (NEC) are
> > "fairly obscure" and the group of users addressed (those mainly
> > running high performance computing (AKA HPC) applications) is "very
> > limited"? If this is what you want to say then it's you whose view is
> > very limited. Maybe I'm wrong with what you really wanted to say but I
> > remember similar arguing from your side when discussing benchmark
> > results in the context of the node affine scheduler.
> > 
> > This "very limited group of users" (small part of them listed in
> > www.top500.org) is who drives computer technology, processor design,
> > network interconnect technology forward since the 1950s.

> With all due respect, Linux gets driven as much from the bottom up
> as it does from the top down I think. Compared to desktop and small
> servers, yes you are obscure :)

I wasn't speaking of driving the Linux development, I was speaking of
driving the computer technology development. Just look at where the
DOD, DARPA, DOE money goes. I actually acknowledged that HPC doesn't
really have a foothold in the kernel developer community.

> My view on it is this, we can do *exclusive* dynamic partitioning
> today (we're very close to it - it wouldn't add complexity in the
> scheduler to support it).

Right, but that's an implementation question. The question 
  cpusets {AND, OR, XOR} CKRM ?
was basically a user space API question. I'm sure nobody will object
to changing the guts of cpusets to use sched_domains on exclusive sets
when this possibility will be there and ... simple.

> You can also hack up a fair bit of other functionality with cpu
> affinity masks.

I'm doing that for a subset of cpusets functionality in a module
(i.e. without touching the task structure and without hooking
fork/exec) but that's ugly and in the long term insufficient.

Regards,
Erich


^ permalink raw reply	[flat|nested] 233+ messages in thread

* Re: [PATCH] cpusets - big numa cpu and memory placement
       [not found]                                                           ` <4165A31E.4070905@watson.ibm.com>
@ 2004-10-08 13:14                                                             ` Paul Jackson
  2004-10-08 15:42                                                               ` Hubertus Franke
  2004-10-09  0:51                                                               ` Matthew Dobson
  2004-10-09  0:22                                                             ` Matthew Dobson
  1 sibling, 2 replies; 233+ messages in thread
From: Paul Jackson @ 2004-10-08 13:14 UTC (permalink / raw)
  To: Hubertus Franke
  Cc: ricklind, colpatch, mbligh, Simon.Derr, pwil3058, dipankar, akpm,
	ckrm-tech, efocht, lse-tech, hch, steiner, jbarnes,
	sylvain.jeaugey, djh, linux-kernel, ak, sivanich

First, thank-you, Hubertus, for comparing me to a puppy, rather
than a kitten.  I am definitely a dog person, not a cat person,
and I appreciate your considerate choice of analog.

I gather from the tone of your post yesterday that there is
a disconnect between us - you speak with the frustration of
someone who has been shouting into the wind and not being
heard.

I suspect that the disconnect, if such be, is not where you
think it is:

Hubertus wrote:
> 
> The disconnect is that you do not want to recognize that CKRM does NOT 
> have to be systemwide. Once you open your mind to the fact that CKRM can 
> be deployed within a subset of disconnected resources (cpu domains)
> and manage shares independently within that domain, I truly don't see
> what the problem is.

I have recognized for months that eventually we'd want to allow
for cpuset-relative CKRM domains, and I'm pretty sure I've
dropped comments to that effect one time or another here on lkml.

I suspect instead that "CKRM" is one layer more abstract than
I am normally comfortable with.

As best as I can tell, CKRM has evolved from its origins as a
fair share scheduler, into a framework (*) for things called by
such names as classes and controllers.  As you may recall from
an inconclusive thread between us on the ckrm-tech email list two
months ago, I find those terms uncomfortably vague and abstract.

In general, frameworks are high risk business.  What they
gain in generality, covering a wider range of situations in
a uniform pattern, they lose in down to earth concreteness,
leaving their users less confident of what works, and less able
to rely on their intuitions.  The risk of serious design flaws,
shrouded for a long time in the fog of abstraction, is higher.

The more successful frameworks, such as vfs for example,
typically have deep roots in prior art, and a sizable population
of journeyman and master practitioners.

CKRM is young, its roots more shallow, and the population of
its practitioners small.

 (*) P.S. - It's more like CKRM is now the combination of
     a virtual resource manager framework and a particular
     instance of such (the fair share controllers that have
     their conceptual origins in IBM's WLM, I suspect).  If
     numa placement controllers (aka cpusets) are going to
     exist as well, then CKRM needs to split into (1) a
     virtual resource manager framework (vrm), and (2) the
     fair share stuff.  The vrm framework should be neutral
     of either fair share or numa placement bias.

===

So here I am with this new cpuset design (Simon Derr, primary
architect, both Simon and I feel a strong sense of ownership)
for numa placement, perhaps the 4th or 5th in SGI's history,
and the 2nd in mine.  I am finding that it deliciously and
elegantly reflects the needs of its anticipated users (Sylvain
might demur, noting a couple of things I removed).

I am now being asked to morph it into a CKRM controller.

Further I deduce from the efforts over the last few days to talk
me down from meeting all the requirements satisfied by my current
cpuset patch that something of cpusets will be lost in the translation.

But I haven't figured out exactly what will be lost.  And I lack the
mastery of CKRM that would enable me to engage in a constructive dialog
on the various tradeoffs that come into play here.

I look at the CKRM patch, and see something that looks an order
of magnitude larger than my cpuset patch.  With its increased
number of hooks in the kernel, and its more abstract style
(it is a framework after all), I also see something with a
higher risk of performance impact, especially on the large NUMA
configurations that I care about.

And I am looking at trading what I thought had hope of being a
Sept or Oct date for acceptance into Linus's kernel, into some
unknown schedule that is definitely further out.

I've got the bacon sizzling on the skillet, I can smell it, my
mouth is watering, and just as I go to lift it off the burner,
Andrew asks me to consider trading it for a pig in a poke.
Thanks a bunch, Andrew - you da man ;).

Putting aside for a moment my personal frustrations (which
are after all my problem - and my dogs) I am simply unable to
make sense yet of how deep would be the hit on the capabilities
of cpusets, if so morphed, and I am painfully aware of the
undetermined schedule delays and increased risks to product
performance and even ultimate success that attend such a change.

From what my field engineers tell me, whom I've been polling
furiously on this matter the last few days, at least in the
markets that SGI frequents, there is very little overlap between
system configurations which benefit from fair share resource
management and those which benefit from numa placement resource
management.  So, if that experience is generally applicable, we
are at risk of marrying a helicopter and a boat, just because
both have a motor and a hull, to the detriment of both.

Merging projects always has risks.  The payoff for synergies
gained is not always greater than the cost of the inefficiencies
and compromises introduced, and the less immediate involvement
of the participants in the end result.

I cannot in good conscience recommend such a change.

Keep talking.

-- 
                          I won't rest till it's the best ...
                          Programmer, Linux Scalability
                          Paul Jackson <pj@sgi.com> 1.650.933.1373

^ permalink raw reply	[flat|nested] 233+ messages in thread

* Re: [Lse-tech] [PATCH] cpusets - big numa cpu and memory placement
  2004-10-08  9:23                                                               ` Erich Focht
  2004-10-08  9:50                                                                 ` Andrew Morton
  2004-10-08  9:53                                                                 ` Nick Piggin
@ 2004-10-08 14:24                                                                 ` Martin J. Bligh
  2004-10-08 22:37                                                                   ` Erich Focht
  2 siblings, 1 reply; 233+ messages in thread
From: Martin J. Bligh @ 2004-10-08 14:24 UTC (permalink / raw)
  To: Erich Focht
  Cc: Paul Jackson, Simon.Derr, colpatch, pwil3058, frankeh, dipankar,
	akpm, ckrm-tech, lse-tech, hch, steiner, jbarnes,
	sylvain.jeaugey, djh, linux-kernel, ak, sivanich

> On Thursday 07 October 2004 20:13, Martin J. Bligh wrote:
>> It all just seems like a lot of complexity for a fairly obscure set of
>> requirements for a very limited group of users, to be honest. Some bits
>> (eg partitioning system resources hard in exclusive sets) would seem likely
>> to be used by a much broader audience, and thus are rather more attractive.
> 
> May I translate the first sentence to: the requirements and usage
> models described by Paul (SGI), Simon (Bull) and myself (NEC) are
> "fairly obscure" and the group of users addressed (those mainly
> running high performance computing (AKA HPC) applications) is "very
> limited"? If this is what you want to say then it's you whose view is
> very limited. Maybe I'm wrong with what you really wanted to say but I
> remember similar arguing from your side when discussing benchmark
> results in the context of the node affine scheduler.

No, I was talking about the non-exclusive part of cpusets that wouldn't
fit inside another mechanism. The basic partitioning I have no problem
with, and that seemed to cover most of the requirements, AFAICS.

As I've said before, the exclusive stuff makes sense, and is useful to
a wider audience, I think. Having non-exclusive stuff whilst still 
requiring physical partitioning is what I think is obscure, won't work
well (cpus_allowed is problematic) and could be done in userspace anyway.

M.

^ permalink raw reply	[flat|nested] 233+ messages in thread

* Re: [Lse-tech] [PATCH] cpusets - big numa cpu and memory placement
  2004-10-08 10:40                                                                   ` Erich Focht
@ 2004-10-08 14:26                                                                     ` Martin J. Bligh
  0 siblings, 0 replies; 233+ messages in thread
From: Martin J. Bligh @ 2004-10-08 14:26 UTC (permalink / raw)
  To: Erich Focht, Andrew Morton
  Cc: pj, Simon.Derr, colpatch, pwil3058, frankeh, dipankar, ckrm-tech,
	lse-tech, hch, steiner, jbarnes, sylvain.jeaugey, djh,
	linux-kernel, ak, sivanich

> Anyhow, I'd say we
> already have a business case here. And instead of pushing ISVs to
> support the SGI way of doing this, the Bull way and the NEC way, it
> makes more sense to ask them to support the LINUX way.

Right. But we're trying to work out what the Linux way *is* ;-)

M.


^ permalink raw reply	[flat|nested] 233+ messages in thread

* Re: [PATCH] cpusets - big numa cpu and memory placement
  2004-10-08 13:14                                                             ` Paul Jackson
@ 2004-10-08 15:42                                                               ` Hubertus Franke
  2004-10-08 18:23                                                                 ` Paul Jackson
  2004-10-09  0:51                                                               ` Matthew Dobson
  1 sibling, 1 reply; 233+ messages in thread
From: Hubertus Franke @ 2004-10-08 15:42 UTC (permalink / raw)
  To: Paul Jackson
  Cc: ricklind, colpatch, mbligh, Simon.Derr, pwil3058, dipankar, akpm,
	ckrm-tech, efocht, lse-tech, hch, steiner, jbarnes,
	sylvain.jeaugey, djh, linux-kernel, ak, sivanich



Paul Jackson wrote:
> First, thank-you, Hubertus, for comparing me to a puppy, rather
> than a kitten.  I am definitely a dog person, not a cat person,
> and I appreciate your considerate choice of analog.

Heeeh .. where did I compare you to a puppy? I was talking about *MY* 
puppy. And the moral of the story was that you can teach them new 
tricks. That's all. So in case you took this in any other way my sincere 
apologies.

> 
> I gather from the tone of your post yesterday that there is
> a disconnect between us - you speak with the frustration of
> someone who has been shouting into the wind and not being
> heard.

Frustration ... only in the sense that there seems to me to
be a pretty clear path to melt your functionality into CKRM.

Andrew's initial request was the challenge to see whether CKRM, with an
additional controller, suffices as an API to provide the
functionality.

> 
> I suspect that the disconnect, if such be, is not where you
> think it is:
> 
> Hubertus wrote:
> 
>>The disconnect is that you do not want to recognize that CKRM does NOT 
>>have to be systemwide. Once you open your mind to the fact that CKRM can 
>>be deployed within a subset of disconnected resources (cpu domains)
>>and manage shares independently within that domain, I truly don't see
>>what the problem is.
> 
> 
> I have recognized for months that eventually we'd want to allow
> for cpuset-relative CKRM domains, and I'm pretty sure I've
> dropped comments to that effect one time or another here on lkml.
> 
> I suspect instead that "CKRM" is one layer more abstract than
> I am normally comfortable with.
> 
> As best as I can tell, CKRM has evolved from its origins as a
> fair share scheduler, into a framework (*) for things called by
> such names as classes and controllers.  As you may recall from
> an inconclusive thread between us on the ckrm-tech email list two
> months ago, I find those terms uncomfortably vague and abstract.
> 
> In general, frameworks are high risk business.  What they
> gain in generality, covering a wider range of situations in
> a uniform pattern, they lose in down to earth concreteness,
> leaving their users less confident of what works, and less able
> to rely on their intuitions.  The risk of serious design flaws,
> shrouded for a long time in the fog of abstraction, is higher.
> 
> The more successful frameworks, such as vfs for example,
> typically have deep roots in prior art, and a sizable population
> of journeyman and master practitioners.
> 
> CKRM is young, its roots more shallow, and the population of
> its practitioners small.
> 
>  (*) P.S. - It's more like CKRM is now the combination of
>      a virtual resource manager framework and a particular
> >      instance of such (the fair share controllers that have
>      their conceptual origins in IBM's WLM, I suspect).  If
>      numa placement controllers (aka cpusets) are going to
>      exist as well, then CKRM needs to split into (1) a
>      virtual resource manager framework (vrm), and (2) the
>      fair share stuff.  The vrm framework should be neutral
>      of either fair share or numa placement bias.

As indicated in many notes, so is the usage of cpusets.
Very few people have the #cpus to even worry about this.
As Andrew said, it's quite possible that the installations can maintain
their own kernel, although
> 
> ===
> 
> 
> Putting aside for a moment my personal frustrations (which
> are after all my problem - and my dogs) I am simply unable to
> make sense yet of how deep would be the hit on the capabilities
> of cpusets, if so morphed, and I am painfully aware of the
> undetermined schedule delays and increased risks to product
> performance and even ultimate success that attend such a change.
> 
> From what my field engineers tell me, whom I've been polling
> furiously on this matter the last few days, at least in the
> markets that SGI frequents, there is very little overlap between
> system configurations which benefit from fair share resource
> management and those which benefit from numa placement resource
> management.  So, if that experience is generally applicable, we
> are at risk of marrying a helicopter and a boat, just because
> both have a motor and a hull, to the detriment of both.

I learned my lesson, no more analogies with you....

Bottom line: I believe the cpusets should first be morphed into
sched_domains. The problem with large systems is the load balancing,
which is highly unscalable. You can twist and turn, but at the end of
the day you set cpu_affinity masks. The load balancing of the system
needs to be aware of the structure of the system.
sched_domains are to me the right approach for that, not setting some
affinity masks underneath.

Assuming that will be resolved at some point and given Andrew's 
hypothetical assumption that CKRM makes it into his kernel, then
I don't see the obstacle to adopting an existing API to serve as the
API for cpusets/sched_domains.

> 
> Merging projects always has risks.  The payoff for synergies
> gained is not always greater than the cost of the inefficiencies
> and compromises introduced, and the less immediate involvement
> of the participants in the end result.
> 
> I cannot in good conscience recommend such a change.
But by self admission, you are driven by timing constraints as
your bacon is sizzling.
> 
> Keep talking.

To whom ?   :-)

-- Hubertus


^ permalink raw reply	[flat|nested] 233+ messages in thread

* Re: [PATCH] cpusets - big numa cpu and memory placement
  2004-10-08 15:42                                                               ` Hubertus Franke
@ 2004-10-08 18:23                                                                 ` Paul Jackson
  2004-10-09  1:00                                                                   ` Matthew Dobson
  0 siblings, 1 reply; 233+ messages in thread
From: Paul Jackson @ 2004-10-08 18:23 UTC (permalink / raw)
  To: Hubertus Franke
  Cc: ricklind, colpatch, mbligh, Simon.Derr, pwil3058, dipankar, akpm,
	ckrm-tech, efocht, lse-tech, hch, steiner, jbarnes,
	sylvain.jeaugey, djh, linux-kernel, ak, sivanich

Hubertus, responding to Paul:
> >  (*) P.S. - It's more like CKRM is now the combination of
> >      a virtual resource manager framework and a particular
> >      instance of such (the fair share controllers that have
> >      their conceptual origins in IBM's WLM, I suspect).  If
> >      numa placement controllers (aka cpusets) are going to
> >      exist as well, then CKRM needs to split into (1) a
> >      virtual resource manager framework (vrm), and (2) the
> >      fair share stuff.  The vrm framework should be neutral
> >      of either fair share or numa placement bias.
> 
> As indicated in many notes, so is the usage of cpusets.

Cpusets pretends to be nothing more than what it is now.  I am not
recommending to Andrew that cpusets incorporate your fair share
scheduling.

CKRM aspires to be both a general purpose resource management framework
and the embodiment of fair share scheduling.

Let me put the question again, and this time don't try to dodge it by
saying "but cpusets does it too ..."

> >      If numa placement controllers (aka cpusets) are going to
> >      exist as well, then CKRM needs to split into (1) a
> >      virtual resource manager framework (vrm), and (2) the
> >      fair share stuff.  The vrm framework should be neutral
> >      of either fair share or numa placement bias.

Hubertus' second response to the above:
>
> Very few people have the #cpus to even worry about this.

If for whatever reason, you don't think it is worth the effort to morph
the virtual resource manager that is currently embedded within CKRM into
an independent, neutral framework, then don't expect the rest of us to
embrace it.  Do you think Reiser would have gladly used vfs to plug in
his file system if it had been called "ext"?  In my personal opinion, it
would be foolhardy for SGI, NEC, Bull, Platform (LSF) or Altair (PBS) to
rely on critical technology so clearly biased toward and dominated by a
natural competitor.

> But by self admission, you are driven by timing constraints as
> your bacon is sizzling.

You broke your promise - you said no more analogies ;)

Of _course_ I have scheduling pressures.  You don't?

> > Keep talking.
> 
> To whom ?   :-)

A duh ... to us, here.

Just in case there was a communication failure here, let me be explicit.
When I said "Here's where I stand today - keep talking" it meant that my
current position was thus, but that I was still open to further
discussion and analysis.

When someone offers you an open door ("Keep talking"), don't slam it in
their face.  It might not open again.

... keep talking ...

-- 
                          I won't rest till it's the best ...
                          Programmer, Linux Scalability
                          Paul Jackson <pj@sgi.com> 1.650.933.1373

^ permalink raw reply	[flat|nested] 233+ messages in thread

* Re: [Lse-tech] [PATCH] cpusets - big numa cpu and memory placement
  2004-10-08 14:24                                                                 ` Martin J. Bligh
@ 2004-10-08 22:37                                                                   ` Erich Focht
  0 siblings, 0 replies; 233+ messages in thread
From: Erich Focht @ 2004-10-08 22:37 UTC (permalink / raw)
  To: Martin J. Bligh
  Cc: Paul Jackson, Simon.Derr, colpatch, pwil3058, frankeh, dipankar,
	akpm, ckrm-tech, lse-tech, hch, steiner, jbarnes,
	sylvain.jeaugey, djh, linux-kernel, ak, sivanich

On Friday 08 October 2004 16:24, Martin J. Bligh wrote:
> > On Thursday 07 October 2004 20:13, Martin J. Bligh wrote:
> >> It all just seems like a lot of complexity for a fairly obscure set of
> >> requirements for a very limited group of users, to be honest. Some bits
> >> (eg partitioning system resources hard in exclusive sets) would seem likely
> >> to be used by a much broader audience, and thus are rather more attractive.
> > 
> > May I translate the first sentence to: the requirements and usage
> > models described by Paul (SGI), Simon (Bull) and myself (NEC) are
> > "fairly obscure" and the group of users addressed (those mainly
> > running high performance computing (AKA HPC) applications) is "very
> > limited"? If this is what you want to say then it's you whose view is
> > very limited. Maybe I'm wrong with what you really wanted to say but I
> > remember similar arguing from your side when discussing benchmark
> > results in the context of the node affine scheduler.
> 
> No, I was talking about the non-exclusive part of cpusets that wouldn't
> fit inside another mechanism. The basic partitioning I have no problem
> with, and that seemed to cover most of the requirements, AFAICS.

I was hoping that I had misunderstood you ;-)

> As I've said before, the exclusive stuff makes sense, and is useful to
> a wider audience, I think. Having non-exclusive stuff whilst still 
> requiring physical partitioning is what I think is obscure, won't work
> well (cpus_allowed is problematic) and could be done in userspace anyway.

Do you mean non-exclusive or simply overlapping? If you think of the
implementation in terms of sched_domains, you really don't need a 1 to 1
mapping between them and cpusets. IMO one could map the sched_domains
structure from the toplevel cpuset down only as far as the
non-overlapping sets go. Below that you just don't use sched_domains any
more and leave it to the affinity masks. The logical setup would
anyhow have a first (uppermost) level soft-partitioning the machine;
overlaps don't make sense to me there, so sched_domains already buy
you something. If soft-partition 1 allows overlap in the lower levels
(because we want to overcommit the machine here and fear the OpenMP
jobs which pin themselves blindly in their cpuset), just don't
continue mapping sched_domains deeper. If soft-partition 2 does not
allow overlapping subpartitions, go ahead and map them to sched
domains. It doesn't really add complexity this way, just an IF
statement or two.
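
In pseudocode, the idea is roughly this (just a sketch; children_overlap()
and build_domain_for() are invented helper names, not anything in the tree):

	static void map_domains(struct cpuset *cs, struct sched_domain *parent)
	{
		struct cpuset *child;

		/* stop at the first level whose children overlap; below
		 * that, plain cpus_allowed affinity masks take over */
		if (children_overlap(cs))
			return;

		list_for_each_entry(child, &cs->children, sibling) {
			struct sched_domain *sd = build_domain_for(child, parent);

			map_domains(child, sd);
		}
	}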

Regards,
Erich



^ permalink raw reply	[flat|nested] 233+ messages in thread

* Re: [Lse-tech] [PATCH] cpusets - big numa cpu and memory placement
  2004-10-07  8:51                                                     ` Paul Jackson
  2004-10-07 10:53                                                       ` Rick Lindsley
  2004-10-07 12:47                                                       ` [Lse-tech] " Simon Derr
@ 2004-10-08 23:48                                                       ` Matthew Dobson
  2004-10-09  0:18                                                         ` Nick Piggin
  2 siblings, 1 reply; 233+ messages in thread
From: Matthew Dobson @ 2004-10-08 23:48 UTC (permalink / raw)
  To: Paul Jackson
  Cc: Martin J. Bligh, Simon.Derr, pwil3058, frankeh, dipankar,
	Andrew Morton, ckrm-tech, efocht, LSE Tech, hch, steiner,
	Jesse Barnes, sylvain.jeaugey, djh, LKML, Andi Kleen, sivanich

On Thu, 2004-10-07 at 01:51, Paul Jackson wrote:
> > I don't see what non-exclusive cpusets buys us.
> 
> One can nest them, overlap them, and duplicate them ;)

<snip example>

> We now have several nested cpusets, each overlapping its ancestors,
> with tasks in each cpuset.
> 
> But only the top hpcarena cpuset has the exclusive ownership
> with no form of overlap of everything in its subtree that
> something like a distinct scheduler domain wants.
> 
> Hopefully the above is not what you meant by "little more than a
> convenient way to group tasks."

I think this example is easily achievable with the sched_domains
modifications I am proposing.  You can still create your 128 CPU
exclusive domain, called big_domain (due to my lack of naming
creativity), and further divide big_domain into smaller, non-exclusive
sched_domains.  We do this all the time, albeit statically at boot time,
with the current sched_domains code.  When we create a 4-node domain on
IA64, and underneath it we create 4 1-node domains.  We've now
partitioned the system into 4 sched_domains, each containing 4 cpus. 
Balancing between these 4 node-level sched_domains is allowed, but can
be disallowed by not setting the SD_LOAD_BALANCE flag.  Your example
does show that it can be more than just a convenient way to group tasks,
but your example can be done with what I'm proposing.
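
Very roughly, the shape is something like this (a hand-written sketch, not
the real arch setup code, and leaving out the groups and balance intervals
a real sched_domain carries):

	static struct sched_domain big_domain, node_domain0;

	static void setup_example_domains(void)
	{
		cpus_setall(big_domain.span);		/* spans the whole 16-cpu box */
		big_domain.parent = NULL;
		big_domain.flags  = 0;			/* no SD_LOAD_BALANCE here, so the
							 * node-level children stay isolated */

		node_domain0.span   = node_to_cpumask(0);  /* the 4 cpus of node 0 */
		node_domain0.parent = &big_domain;
		node_domain0.flags  = SD_LOAD_BALANCE;	   /* balance freely within the node */
	}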


> > 2) rewrite the scheduler/allocator to deal with these bindings up front,
> > and take them into consideration early in the scheduling/allocating
> > process.
> 
> The allocator is less stressed here by varied mems_allowed settings
> than is the scheduler.  For in 99+% of the cases, the allocator is
> dealing with a zonelist that has the local (currently executing) node
> first on the zonelist, and is dealing with a mems_allowed that allows
> allocation on the local node.  So the allocator almost always succeeds
> the first time it goes to see if the candidate page it has in hand
> comes from a node allowed in current->mems_allowed.

Very true.  The allocator and scheduler are very different beasts, just
as memory and CPUs are.  The allocator does not struggle to cope with
mems_allowed (at least currently) as much as the scheduler struggles to
cope with cpus_allowed.
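
The fast path being described is roughly this (a sketch only, assuming the
mems_allowed nodemask the cpuset patch puts in the task struct):

	/* The first zone tried is almost always local, and the local node
	 * is almost always in mems_allowed, so this nearly always succeeds
	 * on the first pass down the zonelist. */
	static inline int zone_in_mems_allowed(struct zone *z)
	{
		return node_isset(z->zone_pgdat->node_id, current->mems_allowed);
	}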

-Matt


^ permalink raw reply	[flat|nested] 233+ messages in thread

* Re: [Lse-tech] [PATCH] cpusets - big numa cpu and memory placement
       [not found]                                                         ` <20041007072842.2bafc320.pj@sgi.com>
  2004-10-07 19:05                                                           ` Rick Lindsley
@ 2004-10-09  0:06                                                           ` Matthew Dobson
       [not found]                                                           ` <4165A31E.4070905@watson.ibm.com>
  2 siblings, 0 replies; 233+ messages in thread
From: Matthew Dobson @ 2004-10-09  0:06 UTC (permalink / raw)
  To: Paul Jackson
  Cc: Rick Lindsley, Martin J. Bligh, Simon.Derr, pwil3058, frankeh,
	dipankar, Andrew Morton, ckrm-tech, efocht, LSE Tech, hch,
	steiner, Jesse Barnes, sylvain.jeaugey, djh, LKML, Andi Kleen,
	sivanich

On Thu, 2004-10-07 at 07:28, Paul Jackson wrote:
> Rick wrote:
> >     * There is no clear policy on how to amiably create an exclusive set.
> >       The main problem is what to do with the tasks already there.
> 
> There is a policy, that works well, and those of us in this business
> have been using for years.  When the system boots, you put everything
> that doesn't need to be pinned elsewhere in a bootcpuset, and leave
> the rest of the system dark.  You then, whether by manual administrative
> techniques or a batch scheduler, hand out dedicated sets of CPU and
> Memory to jobs, which get exclusive use of those compute resources
> (or controlled sharing with only what you intentionally let share).

No one is trying to take that away.  There is nothing that says you
can't boot with a small, 1-2 CPU 'boot' domain where you stick all those
tasks you typically put in a 'boot' cpuset.

<offtopic> In fact, people have talked before about reducing boot times
by booting only a single CPU, then bringing the rest online later.  This
work could potentially facilitate that.  Boot up just a single 'boot'
CPU.  All 'boot' tasks would necessarily be stuck there.  Create a new
'work' domain and add (hotplug on) CPUs into that domain to your heart's
content. </offtopic>

-Matt


^ permalink raw reply	[flat|nested] 233+ messages in thread

* Re: [Lse-tech] [PATCH] cpusets - big numa cpu and memory placement
  2004-10-08 23:48                                                       ` Matthew Dobson
@ 2004-10-09  0:18                                                         ` Nick Piggin
  2004-10-11 23:00                                                           ` Matthew Dobson
  0 siblings, 1 reply; 233+ messages in thread
From: Nick Piggin @ 2004-10-09  0:18 UTC (permalink / raw)
  To: colpatch
  Cc: Paul Jackson, Martin J. Bligh, Simon.Derr, pwil3058, frankeh,
	dipankar, Andrew Morton, ckrm-tech, efocht, LSE Tech, hch,
	steiner, Jesse Barnes, sylvain.jeaugey, djh, LKML, Andi Kleen,
	sivanich

Matthew Dobson wrote:
> On Thu, 2004-10-07 at 01:51, Paul Jackson wrote:
> 
>>>I don't see what non-exclusive cpusets buys us.
>>
>>One can nest them, overlap them, and duplicate them ;)
> 
> 
> <snip example>
> 
>>We now have several nested cpusets, each overlapping its ancestors,
>>with tasks in each cpuset.
>>
>>But only the top hpcarena cpuset has the exclusive ownership
>>with no form of overlap of everything in its subtree that
>>something like a distinct scheduler domain wants.
>>
>>Hopefully the above is not what you meant by "little more than a
>>convenient way to group tasks."
> 
> 
> I think this example is easily achievable with the sched_domains
> modifications I am proposing.  You can still create your 128 CPU
> exclusive domain, called big_domain (due to my lack of naming
> creativity), and further divide big_domain into smaller, non-exclusive
> sched_domains.  We do this all the time, albeit statically at boot time,
> with the current sched_domains code: on IA64 we create a 4-node domain,
> and underneath it 4 1-node domains.  We've now partitioned the system
> into 4 sched_domains, each containing 4 cpus.
> Balancing between these 4 node-level sched_domains is allowed, but can
> be disallowed by not setting the SD_LOAD_BALANCE flag.  Your example
> does show that it can be more than just a convenient way to group tasks,
> but your example can be done with what I'm proposing.
> 

You wouldn't be able to do this just with sched domains, because
they don't know anything about individual tasks. As soon as you
have some overlap, all your tasks can escape out of your domain.

I don't think there is a really nice way to do overlapping sets.
Those that want them need to just use cpu affinity for now.

^ permalink raw reply	[flat|nested] 233+ messages in thread

* Re: [PATCH] cpusets - big numa cpu and memory placement
       [not found]                                                           ` <4165A31E.4070905@watson.ibm.com>
  2004-10-08 13:14                                                             ` Paul Jackson
@ 2004-10-09  0:22                                                             ` Matthew Dobson
  2004-10-12 22:24                                                               ` [Lse-tech] " Hanna Linder
  1 sibling, 1 reply; 233+ messages in thread
From: Matthew Dobson @ 2004-10-09  0:22 UTC (permalink / raw)
  To: Hubertus Franke
  Cc: Paul Jackson, Rick Lindsley, Martin J. Bligh, Simon.Derr,
	pwil3058, dipankar, Andrew Morton, ckrm-tech, efocht, LSE Tech,
	hch, steiner, Jesse Barnes, sylvain.jeaugey, djh, LKML,
	Andi Kleen, sivanich

On Thu, 2004-10-07 at 13:12, Hubertus Franke wrote:
> The way this is heading is quite promising.
> - sched_domains seems the right answer wrt to partitioning the machine.
>    Given some boot option or dynamic means one can shift cpus from
>    one domain to the next domain.
> - If I understood correctly, there would be only one level of such
>    hard partitioning, i.e. exclusive cpu-set or sched_domain.
> - In each such domain/set we allow shared *use*.

I don't think that there needs to be a requirement that we have only one
level of hard partitioning.  The rest of your points are valid though,
Hubertus.

It'd be really nice if we could all get together with a wall of
whiteboards, some markers, and a few pots of coffee.  I think we'd all
get this pretty much hashed out in an hour or two.  This isn't directed
at you, Hubertus, but at the many communication breakdowns in this
thread.


> First, one needs to understand that sched_domains are nothing else
> but a set of cpus that are considered during load balancing times.
> By constricting the top_domain to the respective exclusive set one
> essentially has accomplished the desired feature of partitioning
> the machines into "isolated" sections (here from cpu perspective).
> So it is quite possible that an entire domain is empty, while
> another exclusive domain would be totally overloaded.

I think that is very well stated, Hubertus.  By having a more or less
passive data structure that is only checked at load balance time, we can
ensure (in a very light-weight way) that no task ever moves *out* of
its area nor moves *into* someone else's area.
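
A tiny sketch of why that stays cheap (not actual scheduler code, just the
idea): the balancer only ever looks at cpus inside the span of the domain it
is balancing, so

	/* a task on cpu 'src' can only be pulled to cpu 'dest' if both lie
	 * in the span of the domain being balanced; with disjoint top-level
	 * spans, cross-partition moves never even come up for consideration */
	static int may_pull(int src, int dest, struct sched_domain *sd)
	{
		return cpu_isset(src, sd->span) && cpu_isset(dest, sd->span);
	}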

-Matt


^ permalink raw reply	[flat|nested] 233+ messages in thread

* Re: [PATCH] cpusets - big numa cpu and memory placement
  2004-10-08 13:14                                                             ` Paul Jackson
  2004-10-08 15:42                                                               ` Hubertus Franke
@ 2004-10-09  0:51                                                               ` Matthew Dobson
  2004-10-10  0:50                                                                 ` [Lse-tech] " Paul Jackson
  2004-10-10  0:59                                                                 ` Paul Jackson
  1 sibling, 2 replies; 233+ messages in thread
From: Matthew Dobson @ 2004-10-09  0:51 UTC (permalink / raw)
  To: Paul Jackson
  Cc: Hubertus Franke, Rick Lindsley, Martin J. Bligh, Simon.Derr,
	pwil3058, dipankar, Andrew Morton, ckrm-tech, efocht, LSE Tech,
	hch, steiner, Jesse Barnes, sylvain.jeaugey, djh, LKML,
	Andi Kleen, sivanich

On Fri, 2004-10-08 at 06:14, Paul Jackson wrote:
> So here I am with this new cpuset design (Simon Derr, primary
> architect, both Simon and I feel a strong sense of ownership)
> for numa placement, perhaps the 4th or 5th in SGI's history,
> and the 2nd in mine.  I am finding that it deliciously and
> elegantly reflects the needs of its anticipated users (Sylvain
> might demur, noting a couple of things I removed).
> 
> I am now being asked to morph it into a CKRM controller.
> 
> Further I deduce from the efforts over the last few days to talk
> me down from meeting all the requirements satisfied by my current
> cpuset patch that something of cpusets will be lost in the translation.
> 
> But I haven't figured out exactly what will be lost.  And I lack the
> mastery of CKRM that would enable me to engage in a constructive dialog
> on the various tradeoffs that come into play here.

I hope that *nothing* will be lost.  We (I) aim to still offer
users/admins named groupings of CPUs and memory.  They may not be called
cpusets, in favor of names like classes or domains, but they will
*still* be named groupings of CPUs and memory.  I further aim to not
change your API significantly.  I really like the filesystem API you
came up with to interact with cpusets from userspace.  I'd like to
incorporate this into CKRM's filesystem API (called rcfs) with minimal
changes.  I really like the exclusive cpusets you describe.  You tried
to implement them with some kernel exclusivity (your vitamin
precursors), while I'd like to see the kernel offer real exclusivity. 
This shouldn't affect your customers because real exclusivity will
*still* be offered.  I also aim to support what seems like a large
portion of your non-exclusive cpusets through nested, hierarchical
sched_domains.  And I hope to do all of this with less overhead.

Now, I'm certainly not saying my patch provides all these things now.  I
am saying that I believe the approach/model I'm using could do all these
things with some further work.


> And I am looking at trading what I thought had hope of being a
> Sept or Oct date for acceptance into Linus's kernel, into some
> unknown schedule that is definitely further out.
> 
> I've got the bacon sizzling on the skillet, I can smell it, my
> mouth is watering, and just as I go to lift it off the burner,
> Andrew asks me to consider trading it for a pig in a poke.
> Thanks a bunch, Andrew - you da man ;).

Andrew is da man.  Sometimes da man works with you, sometimes da man
works against you.  As the French say, c'est la vie.

I think we can figure out how to merge cpusets into CKRM's framework,
with minimal changes to both the cpusets API & functionality without
slipping that *too* much.  End of the year isn't unreasonable....


> Keep talking.

You asked for it!!! ;)

-Matt


^ permalink raw reply	[flat|nested] 233+ messages in thread

* Re: [PATCH] cpusets - big numa cpu and memory placement
  2004-10-08 18:23                                                                 ` Paul Jackson
@ 2004-10-09  1:00                                                                   ` Matthew Dobson
  2004-10-09 20:08                                                                     ` [Lse-tech] " Paul Jackson
  2004-10-10  0:05                                                                     ` Paul Jackson
  0 siblings, 2 replies; 233+ messages in thread
From: Matthew Dobson @ 2004-10-09  1:00 UTC (permalink / raw)
  To: Paul Jackson
  Cc: Hubertus Franke, Rick Lindsley, Martin J. Bligh, Simon.Derr,
	pwil3058, dipankar, Andrew Morton, ckrm-tech, efocht, LSE Tech,
	hch, steiner, Jesse Barnes, sylvain.jeaugey, djh, LKML,
	Andi Kleen, sivanich

On Fri, 2004-10-08 at 11:23, Paul Jackson wrote:
> CKRM aspires to be both a general purpose resource management framework
> and the embodiment of fair share scheduling.

I think you're missing something here.  CKRM, as I understand it, aspires
to be a general purpose resource management framework.  To that point I
will accede.  But the second part, about CKRM being the embodiment of
fair share scheduling, is secondary.  That is simply one of its
potential functions as a general purpose resource management framework. 
It could also be the embodiment of unfair scheduling, if you choose to
implement such a resource controller.  It wouldn't be very useful, but
it could be a fun exercise! ;)


> If for whatever reason, you don't think it is worth the effort to morph
> the virtual resource manager that is currently embedded within CKRM into
> an independent, neutral framework, then don't expect the rest of us to
> embrace it.  Do you think Reiser would have gladly used vfs to plug in
> his file system if it had been called "ext"?  In my personal opinion, it
> would be foolhardy for SGI, NEC, Bull, Platform (LSF) or Altair (PBS) to
> rely on critical technology so clearly biased toward and dominated by a
> natural competitor.

I don't think that is terribly fair.  I can honestly say that I'm not
opposing your implementation because of who you work for.  I couldn't
care less.  I'm opposing it because I think I've got an alternative that can
offer the same functionality with less impact.  I don't work on CKRM,
and when I wrote my code CKRM was not even on my radar.  If
sched_domains will play nicer with CKRM than cpusets, that's just a
bonus.  I certainly didn't design it that way!


> When someone offers you an open door ("Keep talking"), don't slam it in
> their face.  It might not open again.

*More* analogies!?! ;)


> ... keep talking ...

I warned you!

-Matt


^ permalink raw reply	[flat|nested] 233+ messages in thread

* Re: [Lse-tech] Re: [PATCH] cpusets - big numa cpu and memory placement
  2004-10-09  1:00                                                                   ` Matthew Dobson
@ 2004-10-09 20:08                                                                     ` Paul Jackson
  2004-10-11 22:16                                                                       ` Matthew Dobson
  2004-10-10  0:05                                                                     ` Paul Jackson
  1 sibling, 1 reply; 233+ messages in thread
From: Paul Jackson @ 2004-10-09 20:08 UTC (permalink / raw)
  To: colpatch
  Cc: frankeh, ricklind, mbligh, Simon.Derr, pwil3058, dipankar, akpm,
	ckrm-tech, efocht, lse-tech, hch, steiner, jbarnes,
	sylvain.jeaugey, djh, linux-kernel, ak, sivanich

Matthew, responding to Paul:
> > If for whatever reason, you don't think it is worth the effort to morph
> > the virtual resource manager that is currently embedded within CKRM into
> > an independent, neutral framework, then don't expect the rest of us to
> > embrace it.  Do you think Reiser would have gladly used vfs to plug in
> > his file system if it had been called "ext"?  In my personal opinion, it
> > would be foolhardy for SGI, NEC, Bull, Platform (LSF) or Altair (PBS) to
> > rely on critical technology so clearly biased toward and dominated by a
> > natural competitor.
> 
> I don't think that is terribly fair.  I can honestly say that I'm not
> opposing your implementation because of who you work for. 

Good point.  I was painting with too wide a brush (hmmm ... someday I
should see if I can get through an entire post without an analogy ...)

When people show a good ability to work with others on lkml who have a
shared interest and sufficient competence in an area, then it doesn't
much matter what company they work for.  I see such a discussion
happening on another portion of this thread, with yourself, Nick, Peter
and Erich, involving the domain scheduler.  That works well.

So far I have been unable to achieve confidence in my ability to
interact well with the key CKRM folks.  In various ways, I have found
their project, and their demeanor on this list, to be difficult for me
to approach.

Normally this wouldn't matter.  However Andrew and others have proposed
that cpusets have a critical dependency on CKRM.  Now it matters.

If I am to have a critical project dependency on an external group with
whom I lack confidence that we share sufficient common interest and a
healthy ability to communicate, then I prefer a more adversarial style
of relations, with explicit contracts, minimum clearly defined and
verifiable deliverables, and suitable fallback contingencies in place.
I keep a sharp eye out for potential conflicts of interest.

My suggestion to separate the virtual resource management framework
(which I named 'vrm') from CKRM's other elements, such as fair share
scheduling, was an attempt to establish such a minimum verifiable
deliverable.  That suggestion was clearly dead on arrival.

My apologies for implicating everyone whose email ends in "ibm.com" in
my earlier comment.  IBM is a big place, and all manner and variety
of people work there.  It's a pleasure working with yourself, Matthew,
and many others from IBM.

-- 
                          I won't rest till it's the best ...
                          Programmer, Linux Scalability
                          Paul Jackson <pj@sgi.com> 1.650.933.1373

^ permalink raw reply	[flat|nested] 233+ messages in thread

* Re: [Lse-tech] Re: [PATCH] cpusets - big numa cpu and memory placement
  2004-10-09  1:00                                                                   ` Matthew Dobson
  2004-10-09 20:08                                                                     ` [Lse-tech] " Paul Jackson
@ 2004-10-10  0:05                                                                     ` Paul Jackson
  2004-10-11 22:18                                                                       ` Matthew Dobson
  1 sibling, 1 reply; 233+ messages in thread
From: Paul Jackson @ 2004-10-10  0:05 UTC (permalink / raw)
  To: colpatch
  Cc: frankeh, ricklind, mbligh, Simon.Derr, pwil3058, dipankar, akpm,
	ckrm-tech, efocht, lse-tech, hch, steiner, jbarnes,
	sylvain.jeaugey, djh, linux-kernel, ak, sivanich

Matthew writes:
> > CKRM aspires to be both a general purpose resource management framework
> > and the embodiment of fair share scheduling.
> 
> I think you're missing something here.  CKRM, as I understand it, aspires
> to be a general purpose resource management framework.  To that point I
> will accede.  But the second part, about CKRM being the embodiment of
> fair share scheduling, is secondary.

Ok - you may well be right that CKRM does not aspire to be the embodiment
of fair share scheduling.  But doesn't it embody a fair share scheduler
(and no other such policy) as a matter of current implementation fact?

-- 
                          I won't rest till it's the best ...
                          Programmer, Linux Scalability
                          Paul Jackson <pj@sgi.com> 1.650.933.1373

^ permalink raw reply	[flat|nested] 233+ messages in thread

* Re: [Lse-tech] Re: [PATCH] cpusets - big numa cpu and memory placement
  2004-10-09  0:51                                                               ` Matthew Dobson
@ 2004-10-10  0:50                                                                 ` Paul Jackson
  2004-10-10  0:59                                                                 ` Paul Jackson
  1 sibling, 0 replies; 233+ messages in thread
From: Paul Jackson @ 2004-10-10  0:50 UTC (permalink / raw)
  To: colpatch
  Cc: frankeh, ricklind, mbligh, Simon.Derr, pwil3058, dipankar, akpm,
	ckrm-tech, efocht, lse-tech, hch, steiner, jbarnes,
	sylvain.jeaugey, djh, linux-kernel, ak, sivanich

Matthew, responding to Paul:
> > But I haven't figured out exactly what will be lost.  And I lack the
> > mastery of CKRM that would enable me to engage in a constructive dialog
> > on the various tradeoffs that come into play here.
> 
> I hope that *nothing* will be lost.  We (I) aim to still offer
> users/admins named groupings of CPUs and memory.  They may not be called
> cpusets, in favor of names like classes or domains, but they will
> *still* be named groupings of CPUs and memory.  I further aim to not
> change your API significantly. 

This might work.

I've no earthly idea yet how it might work.  But I take you at your word
that there's potential here worth pursuing.

I've gotten behind on my email the last three days - sleeping off a
cold.  Do you have any suggestions for readings, or further explanations
you can provide, that might help me better understand how you intend to
accomplish this minor miracle?  Perhaps there is something in one of the
messages that I haven't digested yet.  I see your work-in-progress patch
of Wed, 06 Oct 2004 17:51:07 is one of the messages still in my input
queue.

-- 
                          I won't rest till it's the best ...
                          Programmer, Linux Scalability
                          Paul Jackson <pj@sgi.com> 1.650.933.1373

^ permalink raw reply	[flat|nested] 233+ messages in thread

* Re: [Lse-tech] Re: [PATCH] cpusets - big numa cpu and memory placement
  2004-10-09  0:51                                                               ` Matthew Dobson
  2004-10-10  0:50                                                                 ` [Lse-tech] " Paul Jackson
@ 2004-10-10  0:59                                                                 ` Paul Jackson
  1 sibling, 0 replies; 233+ messages in thread
From: Paul Jackson @ 2004-10-10  0:59 UTC (permalink / raw)
  To: colpatch
  Cc: frankeh, ricklind, mbligh, Simon.Derr, pwil3058, dipankar, akpm,
	ckrm-tech, efocht, lse-tech, hch, steiner, jbarnes,
	sylvain.jeaugey, djh, linux-kernel, ak, sivanich

Matthew wrote:
> while I'd like to see the kernel offer real exclusivity. 

I agree - once we've identified some reason the kernel needs real
exclusivity, and I think we did, it is best to discard my vitamin
precursors and go with the real thing.

-- 
                          I won't rest till it's the best ...
                          Programmer, Linux Scalability
                          Paul Jackson <pj@sgi.com> 1.650.933.1373

^ permalink raw reply	[flat|nested] 233+ messages in thread

* Re: [ckrm-tech] Re: [Lse-tech] [PATCH] cpusets - big numa cpu and memory placement
  2004-10-07 19:05                                                           ` Rick Lindsley
@ 2004-10-10  2:15                                                             ` Paul Jackson
  2004-10-11 22:06                                                               ` Matthew Dobson
  2004-10-10  2:28                                                             ` Paul Jackson
  1 sibling, 1 reply; 233+ messages in thread
From: Paul Jackson @ 2004-10-10  2:15 UTC (permalink / raw)
  To: Rick Lindsley
  Cc: colpatch, mbligh, Simon.Derr, pwil3058, frankeh, dipankar, akpm,
	ckrm-tech, efocht, lse-tech, hch, steiner, jbarnes,
	sylvain.jeaugey, djh, linux-kernel, ak, sivanich

Rick replying to Paul:
> > But doesn't CKRM provide a way to control what percentage of the
> > compute cycles are available from a pool of cycles?
> > 
> > And don't cpusets provide a way to control which physical CPUs a
> > task can or cannot use?
> 
> Right.

I am learning (see other messages of the last couple days on this
thread) that CKRM is supposed to be a general purpose workload manager
framework, and that fair share scheduling (managing percentage of
compute cycles) just happens to be the first instance of such a manager.

> And what I'm hearing is that if you're a job running in a set of shared
> resources (i.e., non-exclusive) then by definition you are *not* a job
> who cares about which processor you run on.  I can't think of a situation
> where I'd care about the physical locality, and the proximity of memory
> and other nodes, but NOT care that other tasks might steal my cycles.

There are at least these situations:
 1) proximity to special hardware (graphics, networking, storage, ...)
 2) non-dedicated tightly coupled multi-threaded apps (OpenMP, MPI)
 3) batch managers switching resources between jobs

On (2), if say you want to run eight copies of an application, on a
system that only has eight CPUs, where each copy of the app is an
eight-way tightly coupled app, they will go much faster if each app is
placed across all 8 CPUs, one thread per CPU, than if they are placed
willy-nilly.  Or a bit more realistically, if you have a random input
queue of such tightly coupled apps, each with a predetermined number of
threads between one and eight, you will get more work done by pinning
the threads of any given app on different CPUs.  The users submitting
the jobs may well not care which CPUs are used for their job, but an
intermediate batch manager probably will care, as it may be solving the
knapsack problem of how to fit a stream of varying sized jobs onto a
given size of hardware.
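
For example, a launcher for one such eight-way app could pin its threads
along these lines (an illustrative userspace sketch; nothing here is from
the cpuset patch itself):

	#define _GNU_SOURCE
	#include <pthread.h>
	#include <sched.h>

	#define NTHREADS 8

	static void *worker(void *arg)
	{
		long cpu = (long)arg;
		cpu_set_t mask;

		CPU_ZERO(&mask);
		CPU_SET(cpu, &mask);			/* one thread per CPU */
		sched_setaffinity(0, sizeof(mask), &mask);
		/* ... do the tightly coupled work ... */
		return NULL;
	}

	int main(void)
	{
		pthread_t tid[NTHREADS];
		long i;

		for (i = 0; i < NTHREADS; i++)
			pthread_create(&tid[i], NULL, worker, (void *)i);
		for (i = 0; i < NTHREADS; i++)
			pthread_join(tid[i], NULL);
		return 0;
	}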

On (3), a batch manager might say have two small cpusets, and also one
larger cpuset that is the two small ones combined.  It might run one job
in each of the two small cpusets for a while, then suspend these two
jobs, in order to run a third job in the larger cpuset.  The two small
cpusets don't go away while the third job runs -- you don't want to lose
or have to tear down and rebuild the detailed inter-cpuset placement of
the two small jobs while they are suspended.

-- 
                          I won't rest till it's the best ...
                          Programmer, Linux Scalability
                          Paul Jackson <pj@sgi.com> 1.650.933.1373

^ permalink raw reply	[flat|nested] 233+ messages in thread

* Re: [ckrm-tech] Re: [Lse-tech] [PATCH] cpusets - big numa cpu and memory placement
  2004-10-07 19:05                                                           ` Rick Lindsley
  2004-10-10  2:15                                                             ` [ckrm-tech] " Paul Jackson
@ 2004-10-10  2:28                                                             ` Paul Jackson
  1 sibling, 0 replies; 233+ messages in thread
From: Paul Jackson @ 2004-10-10  2:28 UTC (permalink / raw)
  To: Rick Lindsley
  Cc: colpatch, mbligh, Simon.Derr, pwil3058, frankeh, dipankar, akpm,
	ckrm-tech, efocht, lse-tech, hch, steiner, jbarnes,
	sylvain.jeaugey, djh, linux-kernel, ak, sivanich

Rick wrote:
> One does?  No, in my world, there's constant auditing going on and if
> you can get away with having a machine idle, power to ya, but chances
> are somebody's going to come and take away at least the cycles and maybe

I don't doubt that such worlds as yours exist, nor that you live in one.

In some of the worlds my customers live in, they have been hit so many
times with the pains of performance degradation and variation due to
unwanted interaction between applications that they get nervous if a
supposedly unused CPU or Memory looks to be in use.  Just the common use
by Linux of unused memory to keep old pages in cache upsets them.

And, perhaps more to the point, while indeed some other department may
soon show up to make use of those lost cycles, the computer had jolly
well better leave those cycles lost _until_ the customer decides to use
them.

Unlike the computer in my dentist's office, which should "just do it",
maximizing throughput as best it can, not asking any questions, the
computers in some of my customers' high end shops are managed more tightly
(sometimes very tightly) and they expect to control load placement.

-- 
                          I won't rest till it's the best ...
                          Programmer, Linux Scalability
                          Paul Jackson <pj@sgi.com> 1.650.933.1373

^ permalink raw reply	[flat|nested] 233+ messages in thread

* Re: [Lse-tech] [PATCH] cpusets - big numa cpu and memory placement
  2004-10-07 19:16                                                             ` Rick Lindsley
@ 2004-10-10  2:35                                                               ` Paul Jackson
  0 siblings, 0 replies; 233+ messages in thread
From: Paul Jackson @ 2004-10-10  2:35 UTC (permalink / raw)
  To: Rick Lindsley
  Cc: mbligh, Simon.Derr, colpatch, pwil3058, frankeh, dipankar, akpm,
	ckrm-tech, efocht, lse-tech, hch, steiner, jbarnes,
	sylvain.jeaugey, djh, linux-kernel, ak, sivanich

> The only thing it *can't* do is assure
> exclusivity, today .. correct?

No.  Could you look back to my other posts of this
last week and let us know if I've answered your query
in more detail already?  Thanks.

-- 
                          I won't rest till it's the best ...
                          Programmer, Linux Scalability
                          Paul Jackson <pj@sgi.com> 1.650.933.1373

^ permalink raw reply	[flat|nested] 233+ messages in thread

* Re: [ckrm-tech] Re: [Lse-tech] [PATCH] cpusets - big numa cpu and memory placement
  2004-10-07 18:25                                                             ` Andrew Morton
  2004-10-07 19:52                                                               ` Paul Jackson
@ 2004-10-10  3:22                                                               ` Paul Jackson
  1 sibling, 0 replies; 233+ messages in thread
From: Paul Jackson @ 2004-10-10  3:22 UTC (permalink / raw)
  To: Andrew Morton
  Cc: mbligh, Simon.Derr, colpatch, pwil3058, frankeh, dipankar,
	ckrm-tech, efocht, lse-tech, hch, steiner, jbarnes,
	sylvain.jeaugey, djh, linux-kernel, ak, sivanich

Andrew wrote:
> As you say, it's a matter of coordinated poking at cpus_allowed.  

No - I said I concluded that three years ago.  And then later learned
the hard way this wasn't enough.

See further my earlier (like 2.5 days and 2 boxes of Kleenex ago) reply
to this same post.

-- 
                          I won't rest till it's the best ...
                          Programmer, Linux Scalability
                          Paul Jackson <pj@sgi.com> 1.650.933.1373

^ permalink raw reply	[flat|nested] 233+ messages in thread

* Re: [ckrm-tech] Re: [Lse-tech] [PATCH] cpusets - big numa cpu and memory placement
  2004-10-07 14:49                                                         ` Martin J. Bligh
  2004-10-07 17:54                                                           ` Paul Jackson
@ 2004-10-10  5:12                                                           ` Paul Jackson
  1 sibling, 0 replies; 233+ messages in thread
From: Paul Jackson @ 2004-10-10  5:12 UTC (permalink / raw)
  To: Martin J. Bligh
  Cc: Simon.Derr, colpatch, pwil3058, frankeh, dipankar, akpm,
	ckrm-tech, efocht, lse-tech, hch, steiner, jbarnes,
	sylvain.jeaugey, djh, linux-kernel, ak, sivanich

> That makes no sense to me whatsoever, I'm afraid. Why if they were allowed
> "to steal a few cycles" are they so fervently banned from being in there?

One substantial advantage of cpusets (as in the kernel patch in *-mm's
tree) over variations that "just poke the affinity masks from user
space" is the task->cpuset pointer.  This tracks to what cpuset a task
is attached.  The fork and exit code duplicates and nukes this pointer,
managing the cpuset reference counter.

It matters to batch schedulers and the like which cpuset a task is in,
and which tasks are in a cpuset, when it comes time to do things like
suspend or migrate the tasks currently in a cpuset.

Just because it's ok to share a little compute time in a cpuset doesn't
mean you don't care to know who is in it.
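
In outline (a simplified sketch, not the literal patch code;
release_cpuset() stands in for whatever actually frees the thing):

	/* every task points at its cpuset; the pointer is copied at fork
	 * and dropped at exit, with a reference count keeping the cpuset
	 * alive as long as anyone is attached to it */
	void cpuset_fork(struct task_struct *child)
	{
		child->cpuset = current->cpuset;
		atomic_inc(&child->cpuset->count);
	}

	void cpuset_exit(struct task_struct *tsk)
	{
		struct cpuset *cs = tsk->cpuset;

		tsk->cpuset = NULL;
		if (atomic_dec_and_test(&cs->count))
			release_cpuset(cs);	/* last user gone */
	}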

-- 
                          I won't rest till it's the best ...
                          Programmer, Linux Scalability
                          Paul Jackson <pj@sgi.com> 1.650.933.1373

^ permalink raw reply	[flat|nested] 233+ messages in thread

* Re: [ckrm-tech] Re: [Lse-tech] [PATCH] cpusets - big numa cpu and memory placement
  2004-10-10  2:15                                                             ` [ckrm-tech] " Paul Jackson
@ 2004-10-11 22:06                                                               ` Matthew Dobson
  2004-10-11 22:58                                                                 ` Paul Jackson
  2004-10-12  8:50                                                                 ` Simon Derr
  0 siblings, 2 replies; 233+ messages in thread
From: Matthew Dobson @ 2004-10-11 22:06 UTC (permalink / raw)
  To: Paul Jackson
  Cc: Rick Lindsley, Martin J. Bligh, Simon.Derr, pwil3058, frankeh,
	dipankar, Andrew Morton, ckrm-tech, efocht, LSE Tech, hch,
	steiner, Jesse Barnes, sylvain.jeaugey, djh, LKML, Andi Kleen,
	sivanich

On Sat, 2004-10-09 at 19:15, Paul Jackson wrote:
> Rick replying to Paul:
> > And what I'm hearing is that if you're a job running in a set of shared
> > resources (i.e., non-exclusive) then by definition you are *not* a job
> > who cares about which processor you run on.  I can't think of a situation
> > where I'd care about the physical locality, and the proximity of memory
> > and other nodes, but NOT care that other tasks might steal my cycles.
> 
> There are at least these situations:
>  1) proximity to special hardware (graphics, networking, storage, ...)
>  2) non-dedicated tightly coupled multi-threaded apps (OpenMP, MPI)
>  3) batch managers switching resources between jobs
> 
> On (2), if say you want to run eight copies of an application, on a
> system that only has eight CPUs, where each copy of the app is an
> eight-way tightly coupled app, they will go much faster if each app is
> placed across all 8 CPUs, one thread per CPU, than if they are placed
> willy-nilly.  Or a bit more realistically, if you have a random input
> queue of such tightly coupled apps, each with a predetermined number of
> threads between one and eight, you will get more work done by pinning
> the threads of any given app on different CPUs.  The users submitting
> the jobs may well not care which CPUs are used for their job, but an
> intermediate batch manager probably will care, as it may be solving the
> knapsack problem of how to fit a stream of varying sized jobs onto a
> given size of hardware.
> 
> On (3), a batch manager might say have two small cpusets, and also one
> larger cpuset that is the two small ones combined.  It might run one job
> in each of the two small cpusets for a while, then suspend these two
> jobs, in order to run a third job in the larger cpuset.  The two small
> cpusets don't go away while the third job runs -- you don't want to lose
> or have to tear down and rebuild the detailed inter-cpuset placement of
> the two small jobs while they are suspended.

I think these situations, particularly the first two, are the times you
*want* to use the cpus_allowed mechanism.  Pinning a specific thread to
a specific processor (cases (1) & (2)) is *exactly* why the cpus_allowed
mechanism was put into the kernel.
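
For reference, that per-thread pinning is only a few lines from user
space.  A minimal sketch using the glibc cpu_set_t interface to
sched_setaffinity():

        #define _GNU_SOURCE
        #include <sched.h>
        #include <stdio.h>

        /* Pin the calling thread to a single CPU; returns 0 on success. */
        static int pin_to_cpu(int cpu)
        {
                cpu_set_t mask;

                CPU_ZERO(&mask);
                CPU_SET(cpu, &mask);
                if (sched_setaffinity(0, sizeof(mask), &mask) < 0) {
                        perror("sched_setaffinity");
                        return -1;
                }
                return 0;
        }

A job launcher calling this once per thread, with a different cpu for
each, covers cases (1) and (2).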

And (3) can pretty easily be achieved by using a combination of
sched_domains and cpus_allowed.  In your example of one 4 CPU cpuset and
two 2 CPU sub cpusets (cpu-subsets? :), one could easily create a 4 CPU
domain for the larger job and two 2 CPU domains for the smaller jobs. 
Those two 2-CPU subdomains can be created & destroyed at will, or they
could be simply tagged as "exclusive" when you don't want tasks moving
back and forth between them, and tagged as "non-exclusive" when you want
tasks to be freely balanced across all 4 CPUs in the larger parent
domain.

One of the cool things about using sched_domains as your partitioning
element is that in reality, tasks run on *CPUs*, not *domains*.  So if
you have threads 'a1' & 'a2' running on CPUs 0 & 1 (small job 'a') and
threads 'b1' & 'b2' running on CPUs 2 & 3 (small job 'b'), you can
suspend threads a1, a2, b1 & b2 and remove the domains they were running
in to allow job A (big job with threads A1, A2, A3, & A4) to run on the
larger 4 CPU domain.  When you then suspend A1-A4 again to allow the
smaller jobs to proceed, you can pretty trivially create the 2 CPU
domains underneath the 4 CPU domain and resume the jobs.  Those jobs (a
& b) have been suspended on the CPUs they were originally running on,
and thus will resume on the same CPUs without any extra effort.  They
will simply run on those CPUs, and at load balance time, the domains
attached to those CPUs will be consulted to determine where the tasks
can be relocated to if there is a heavy load.  The domains will tell the
scheduler that the tasks cannot be relocated outside the 2 CPUs in each
respective domain.  Viola!  (sorta ;)

-Matt


^ permalink raw reply	[flat|nested] 233+ messages in thread

* Re: [Lse-tech] Re: [PATCH] cpusets - big numa cpu and memory placement
  2004-10-09 20:08                                                                     ` [Lse-tech] " Paul Jackson
@ 2004-10-11 22:16                                                                       ` Matthew Dobson
  2004-10-11 22:42                                                                         ` Paul Jackson
  0 siblings, 1 reply; 233+ messages in thread
From: Matthew Dobson @ 2004-10-11 22:16 UTC (permalink / raw)
  To: Paul Jackson
  Cc: frankeh, Rick Lindsley, Martin J. Bligh, Simon.Derr, pwil3058,
	dipankar, Andrew Morton, ckrm-tech, efocht, LSE Tech, hch,
	steiner, Jesse Barnes, sylvain.jeaugey, djh, LKML, Andi Kleen,
	sivanich

On Sat, 2004-10-09 at 13:08, Paul Jackson wrote:
> Matthew, responding to Paul:
> > > If for whatever reason, you don't think it is worth the effort to morph
> > > the virtual resouce manager that is currently embedded within CKRM into
> > > an independent, neutral framework, then don't expect the rest of us to
> > > embrace it.  Do you think Reiser would have gladly used vfs to plug in
> > > his file system if it had been called "ext"?  In my personal opinion, it
> > > would be foolhardy for SGI, NEC, Bull, Platform (LSF) or Altair (PBS) to
> > > rely on critical technology so clearly biased toward and dominated by a
> > > natural competitor.
> > 
> > I don't think that is terribly fair.  I can honestly say that I'm not
> > opposing your implementation because of who you work for. 
> 
> Good point.  I was painting with too wide a brush (hmmm ... someday I
> should see if I can get through an entire post without an analogy ...)

Doubtful.  I've read too many of your posts to think that it's very
likely! ;)


> My suggestion to separate the virtual resource management framework
> (which I named 'vrm') from CKRM's other elements, such as fair share
> scheduling, was an attempt to establish such a minimum verifiable
> deliverable.  That suggestion was clearly dead on arrival.

My (completely uninformed) guess is that the CKRM folks thought it would
be extremely unlikely to be able to get the 'vrm' into the kernel
without something to use it.  Linus, and the rest of the community, has
been understandably reluctant to pick up large chunks of code on the
assurance that "someone, someday will use these hooks".  The fair share
scheduler is thus both a proof of concept that the 'vrm' works and a
user of the 'vrm'.  The 'vrm' and the fair share scheduler, should be
logically separate pieces of code, though.  I should *really* read
through the CKRM code before I continue any further as I am purely
speculating now...


> My apologies for implicating everyone whose email ends in "ibm.com" in
> my earlier comment.  IBM is a big place, and all manner and variety
> of people work there.  It's a pleasure working with yourself, Matthew,
> and many others from IBM.

Apology accepted, Paul.  IBM is a large company, and this thread in
particular has had many @ibm.com posters.  It can seem there is some
large IBM conspiracy to block your efforts, but I can assure you that
isn't the case.  Unless the small, painless chips on the backs of our
necks are working far better than I think they do.... :)

-Matt


^ permalink raw reply	[flat|nested] 233+ messages in thread

* Re: [Lse-tech] Re: [PATCH] cpusets - big numa cpu and memory placement
  2004-10-10  0:05                                                                     ` Paul Jackson
@ 2004-10-11 22:18                                                                       ` Matthew Dobson
  2004-10-11 22:39                                                                         ` Paul Jackson
  0 siblings, 1 reply; 233+ messages in thread
From: Matthew Dobson @ 2004-10-11 22:18 UTC (permalink / raw)
  To: Paul Jackson
  Cc: frankeh, Rick Lindsley, Martin J. Bligh, Simon.Derr, pwil3058,
	dipankar, Andrew Morton, ckrm-tech, efocht, LSE Tech, hch,
	steiner, Jesse Barnes, sylvain.jeaugey, djh, LKML, Andi Kleen,
	sivanich

On Sat, 2004-10-09 at 17:05, Paul Jackson wrote:
> Matthew writes:
> > > CKRM aspires to be both a general purpose resource management framework
> > > and the embodiment of fair share scheduling.
> > 
> > I think your missing something here.  CKRM, as I understand it, aspires
> > to be a general purpose resource management framework.  To that point I
> > will accede.  But the second part, about CKRM being the embodiment of
> > fair share scheduling, is secondary.
> 
> Ok - you may well be right that CKRM does not aspire to be the embodiment
> of fair share scheduling.  But doesn't it embody a fair share sheduler
> (and no other such policy) as a matter of current implementation fact?

Yes.  That is true, but it is by no means meant to be the end-all,
be-all of CKRM.  It is my understanding that the fair share scheduler is
a proof-of-concept and an example of how to write a 'controller' for
others, but not the full extent of CKRM's power.

-Matt


^ permalink raw reply	[flat|nested] 233+ messages in thread

* Re: [Lse-tech] Re: [PATCH] cpusets - big numa cpu and memory placement
  2004-10-11 22:18                                                                       ` Matthew Dobson
@ 2004-10-11 22:39                                                                         ` Paul Jackson
  0 siblings, 0 replies; 233+ messages in thread
From: Paul Jackson @ 2004-10-11 22:39 UTC (permalink / raw)
  To: colpatch
  Cc: frankeh, ricklind, mbligh, Simon.Derr, pwil3058, dipankar, akpm,
	ckrm-tech, efocht, lse-tech, hch, steiner, jbarnes,
	sylvain.jeaugey, djh, linux-kernel, ak, sivanich

> Yes.  That is true, but it is by no means meant to be the end-all,
> be-all of CKRM.  

All well and good.  Except that it has taken me an inordinate amount of
effort to grok what CKRM is, and this mingling of a framework for
resource management with one particular instance of such, a fair share
scheduler, has contributed to my confusions.  My teenage son can no
doubt offer additional explanations for my confusions.

And indeed, while a new kernel framework should come with at least one
good example of something worth so framing, still it's better to keep
the two clearly distinguished.  If these two are well distinguished now,
then I am unaware of that.

Perhaps this effort to add a placement manager to the existing fair
share manager in CKRM's repertoire will result in a clearer separation of
the CKRM framework from that which it frames.

-- 
                          I won't rest till it's the best ...
                          Programmer, Linux Scalability
                          Paul Jackson <pj@sgi.com> 1.650.933.1373

^ permalink raw reply	[flat|nested] 233+ messages in thread

* Re: [Lse-tech] Re: [PATCH] cpusets - big numa cpu and memory placement
  2004-10-11 22:16                                                                       ` Matthew Dobson
@ 2004-10-11 22:42                                                                         ` Paul Jackson
  0 siblings, 0 replies; 233+ messages in thread
From: Paul Jackson @ 2004-10-11 22:42 UTC (permalink / raw)
  To: colpatch
  Cc: frankeh, ricklind, mbligh, Simon.Derr, pwil3058, dipankar, akpm,
	ckrm-tech, efocht, lse-tech, hch, steiner, jbarnes,
	sylvain.jeaugey, djh, linux-kernel, ak, sivanich

Matthew wrote:
> My (completely uninformed) guess is that the CKRM folks thought it would
> be extremely unlikely to be able to get the 'vrm' into the kernel
> without something to use it.

I'd guess the same thing.

> The 'vrm' and the fair share scheduler, should be
> logically separate pieces of code, though. 

I agree - should be.

-- 
                          I won't rest till it's the best ...
                          Programmer, Linux Scalability
                          Paul Jackson <pj@sgi.com> 1.650.933.1373

^ permalink raw reply	[flat|nested] 233+ messages in thread

* Re: [ckrm-tech] Re: [Lse-tech] [PATCH] cpusets - big numa cpu and memory placement
  2004-10-11 22:06                                                               ` Matthew Dobson
@ 2004-10-11 22:58                                                                 ` Paul Jackson
  2004-10-12 21:22                                                                   ` Matthew Dobson
  2004-10-12  8:50                                                                 ` Simon Derr
  1 sibling, 1 reply; 233+ messages in thread
From: Paul Jackson @ 2004-10-11 22:58 UTC (permalink / raw)
  To: colpatch
  Cc: ricklind, mbligh, Simon.Derr, pwil3058, frankeh, dipankar, akpm,
	ckrm-tech, efocht, lse-tech, hch, steiner, jbarnes,
	sylvain.jeaugey, djh, linux-kernel, ak, sivanich

Matthew wrote:
> One of the cool thing about using sched_domains as your partitioning
> element is that in reality, tasks run on *CPUs*, not *domains*. 

Unfortunately, my manager has reminded me of an essential deliverable
that I have for another project, due in two weeks.  I'm going to need
every one of those days.  So I will have to take a two week sabbatical
from this design work.

It might make sense to reconvene this work on a new thread, with a last
message on this monster thread inviting all interested parties to come
on over.  I suspect a few folks will be happy to see this thread wind
down.

I'd guess lse-tech (my preference) or ckrm-tech would be a suitable
forum for this new thread.

Carry on.

-- 
                          I won't rest till it's the best ...
                          Programmer, Linux Scalability
                          Paul Jackson <pj@sgi.com> 1.650.933.1373

^ permalink raw reply	[flat|nested] 233+ messages in thread

* Re: [Lse-tech] [PATCH] cpusets - big numa cpu and memory placement
  2004-10-09  0:18                                                         ` Nick Piggin
@ 2004-10-11 23:00                                                           ` Matthew Dobson
  2004-10-11 23:09                                                             ` Nick Piggin
  0 siblings, 1 reply; 233+ messages in thread
From: Matthew Dobson @ 2004-10-11 23:00 UTC (permalink / raw)
  To: Nick Piggin
  Cc: Paul Jackson, Martin J. Bligh, Simon.Derr, pwil3058, frankeh,
	dipankar, Andrew Morton, ckrm-tech, efocht, LSE Tech, hch,
	steiner, Jesse Barnes, sylvain.jeaugey, djh, LKML, Andi Kleen,
	sivanich

On Fri, 2004-10-08 at 17:18, Nick Piggin wrote:
> Matthew Dobson wrote:
> > I think this example is easily achievable with the sched_domains
> > modifications I am proposing.  You can still create your 128 CPU
> > exclusive domain, called big_domain (due to my lack of naming
> > creativity), and further divide big_domain into smaller, non-exclusive
> > sched_domains.  We do this all the time, albeit statically at boot time,
> > with the current sched_domains code.  When we create a 4-node domain on
> > IA64, and underneath it we create 4 1-node domains.  We've now
> > partitioned the system into 4 sched_domains, each containing 4 cpus. 
> > Balancing between these 4 node-level sched_domains is allowed, but can
> > be disallowed by not setting the SD_LOAD_BALANCE flag.  Your example
> > does show that it can be more than just a convenient way to group tasks,
> > but your example can be done with what I'm proposing.
> 
> You wouldn't be able to do this just with sched domains, because
> it doesn't know anything about individual tasks. As soon as you
> have some overlap, all your tasks can escape out of your domain.
> 
> I don't think there is a really nice way to do overlapping sets.
> Those that want them need to just use cpu affinity for now.

Well, the tasks can escape out of the domain iff you have the 
SD_LOAD_BALANCE flag set on that domain.  If SD_LOAD_BALANCE isn't set,
then when the scheduler tick goes off, and the code looks at the domain,
it will see the lack of the flag and will not attempt to balance the
domain, correct?  This is what we currently do with the 'isolated'
domains, right?
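
Heavily simplified, that per-tick walk over a CPU's domains amounts to
something like the sketch below (a paraphrase, not the exact scheduler
code):

        for_each_domain(this_cpu, sd) {
                /* A domain without SD_LOAD_BALANCE is skipped entirely,
                 * so the balancer never migrates tasks across it. */
                if (!(sd->flags & SD_LOAD_BALANCE))
                        continue;

                /* ... otherwise load_balance() may pull tasks within sd ... */
        }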

You're right that you can get some of the more obscure semantics of the
various flavors of cpusets by leveraging sched_domains AND
cpus_allowed.  I don't have any desire to remove that ability, just keep
it as the exception.

-Matt


^ permalink raw reply	[flat|nested] 233+ messages in thread

* Re: [Lse-tech] [PATCH] cpusets - big numa cpu and memory placement
  2004-10-11 23:00                                                           ` Matthew Dobson
@ 2004-10-11 23:09                                                             ` Nick Piggin
  0 siblings, 0 replies; 233+ messages in thread
From: Nick Piggin @ 2004-10-11 23:09 UTC (permalink / raw)
  To: colpatch
  Cc: Paul Jackson, Martin J. Bligh, Simon.Derr, pwil3058, frankeh,
	dipankar, Andrew Morton, ckrm-tech, efocht, LSE Tech, hch,
	steiner, Jesse Barnes, sylvain.jeaugey, djh, LKML, Andi Kleen,
	sivanich

Matthew Dobson wrote:
> On Fri, 2004-10-08 at 17:18, Nick Piggin wrote:
> 
>>Matthew Dobson wrote:
>>
>>>I think this example is easily achievable with the sched_domains
>>>modifications I am proposing.  You can still create your 128 CPU
>>>exclusive domain, called big_domain (due to my lack of naming
>>>creativity), and further divide big_domain into smaller, non-exclusive
>>>sched_domains.  We do this all the time, albeit statically at boot time,
>>>with the current sched_domains code.  When we create a 4-node domain on
>>>IA64, and underneath it we create 4 1-node domains.  We've now
>>>partitioned the system into 4 sched_domains, each containing 4 cpus. 
>>>Balancing between these 4 node-level sched_domains is allowed, but can
>>>be disallowed by not setting the SD_LOAD_BALANCE flag.  Your example
>>>does show that it can be more than just a convenient way to group tasks,
>>>but your example can be done with what I'm proposing.
>>
>>You wouldn't be able to do this just with sched domains, because
>>it doesn't know anything about individual tasks. As soon as you
>>have some overlap, all your tasks can escape out of your domain.
>>
>>I don't think there is a really nice way to do overlapping sets.
>>Those that want them need to just use cpu affinity for now.
> 
> 
> Well, the tasks can escape out of the domain iff you have the 
> SD_LOAD_BALANCE flag set on that domain.  If SD_LOAD_BALANCE isn't set,
> then when the scheduler tick goes off, and the code looks at the domain,
> it will see the lack of the flag and will not attempt to balance the
> domain, correct?  This is what we currently do with the 'isolated'
> domains, right?
> 

Yeah that's right. Well you have to remove some of the other SD_
flags as well (eg. SD_BALANCE_EXEC, SD_WAKE_BALANCE).

But I don't think there is much point in overlapping sets which
don't do any balancing. They might as well not exist at all.

> You're right that you can get some of the more obscure semantics of the
> various flavors of cpusets by leveraging sched_domains AND
> cpus_allowed.  I don't have any desire to remove that ability, just keep
> it as the exception.
> 

I think at this stage, overlapping cpu sets are the exception. It
is pretty logical that they're going to require some per-task info,
because the balancer can't otherwise differentiate between two tasks
on the same runqueue but in different cpu sets.

sched-domains gives you a nice clean way to do exclusive partitioning,
and I can't imagine it would be too common to want to do overlapping
partitioning.

^ permalink raw reply	[flat|nested] 233+ messages in thread

* Re: [ckrm-tech] Re: [Lse-tech] [PATCH] cpusets - big numa cpu and memory placement
  2004-10-11 22:06                                                               ` Matthew Dobson
  2004-10-11 22:58                                                                 ` Paul Jackson
@ 2004-10-12  8:50                                                                 ` Simon Derr
  2004-10-12 21:25                                                                   ` Matthew Dobson
  1 sibling, 1 reply; 233+ messages in thread
From: Simon Derr @ 2004-10-12  8:50 UTC (permalink / raw)
  To: Matthew Dobson
  Cc: Paul Jackson, Rick Lindsley, Martin J. Bligh, Simon.Derr,
	pwil3058, frankeh, dipankar, Andrew Morton, ckrm-tech, efocht,
	LSE Tech, hch, steiner, Jesse Barnes, sylvain.jeaugey, djh, LKML,
	Andi Kleen, sivanich

> One of the cool thing about using sched_domains as your partitioning
> element is that in reality, tasks run on *CPUs*, not *domains*.  So if
> you have threads 'a1' & 'a2' running on CPUs 0 & 1 (small job 'a') and
> threads 'b1' & 'b2' running on CPUs 2 & 3 (small job 'b'), you can
> suspend threads a1, a2, b1 & b2 and remove the domains they were running
> in to allow job A (big job with threads A1, A2, A3, & A4) to run on the
> larger 4 CPU domain.  When you then suspend A1-A4 again to allow the
> smaller jobs to proceed, you can pretty trivially create the 2 CPU
> domains underneath the 4 CPU domain and resume the jobs.  Those jobs (a
> & b) have been suspended on the CPUs they were originally running on,
> and thus will resume on the same CPUs without any extra effort.  They
> will simply run on those CPUs, and at load balance time, the domains
> attached to those CPUs will be consulted to determine where the tasks
> can be relocated to if there is a heavy load.  The domains will tell the
> scheduler that the tasks cannot be relocated outside the 2 CPUs in each
> respective domain.  Viola!  (sorta ;)
Voilà ;-)

I agree that this looks really smooth from a scheduler point of view.

From a user point of view, the issue of suspending the tasks remains:
 - find which tasks to suspend: how do you know that job 'a' consists
   exactly of 'a1' and 'a2'?
 - suspend them (btw, how do you achieve this? kill -STOP?)


I've been away from my mail and am still trying to catch up; never mind
if the above does not make sense to you.

	Simon.

^ permalink raw reply	[flat|nested] 233+ messages in thread

* Re: [ckrm-tech] Re: [Lse-tech] [PATCH] cpusets - big numa cpu and memory placement
  2004-10-11 22:58                                                                 ` Paul Jackson
@ 2004-10-12 21:22                                                                   ` Matthew Dobson
  0 siblings, 0 replies; 233+ messages in thread
From: Matthew Dobson @ 2004-10-12 21:22 UTC (permalink / raw)
  To: Paul Jackson
  Cc: Rick Lindsley, Martin J. Bligh, Simon.Derr, pwil3058, frankeh,
	dipankar, Andrew Morton, ckrm-tech, efocht, LSE Tech, hch,
	steiner, Jesse Barnes, sylvain.jeaugey, djh, LKML, Andi Kleen,
	sivanich

On Mon, 2004-10-11 at 15:58, Paul Jackson wrote:
> Matthew wrote:
> > One of the cool thing about using sched_domains as your partitioning
> > element is that in reality, tasks run on *CPUs*, not *domains*. 
> 
> Unfortunately, my manager has reminded me of an essential deliverable
> that I have for another project, due in two weeks.  I'm going to need
> every one of those days.  So I will have to take a two week sabbatical
> from this design work.
> 
> It might make sense to reconvene this work on a new thread, with a last
> message on this monster thread inviting all interested parties to come
> on over.  I suspect a few folks will be happy to see this thread wind
> down.
> 
> I'd guess lse-tech (my preference) or ckrm-tech would be a suitable
> forum for this new thread.
> 
> Carry on.

Sounds good, Paul.  I think the discussion on this thread was kind of
winding down anyway.  In two weeks I'll have some more work done on my
code, particularly trying to get the cpusets/CKRM filesystem interface
to play with my sched_domains code.  We'll be able to digest all the
information, requirements, requests, etc. on this thread and start a
fresh discussion on (or at least closer to) the same page.

-Matt


^ permalink raw reply	[flat|nested] 233+ messages in thread

* Re: [ckrm-tech] Re: [Lse-tech] [PATCH] cpusets - big numa cpu and memory placement
  2004-10-12  8:50                                                                 ` Simon Derr
@ 2004-10-12 21:25                                                                   ` Matthew Dobson
  0 siblings, 0 replies; 233+ messages in thread
From: Matthew Dobson @ 2004-10-12 21:25 UTC (permalink / raw)
  To: Simon Derr
  Cc: Paul Jackson, Rick Lindsley, Martin J. Bligh, pwil3058, frankeh,
	dipankar, Andrew Morton, ckrm-tech, efocht, LSE Tech, hch,
	steiner, Jesse Barnes, sylvain.jeaugey, djh, LKML, Andi Kleen,
	sivanich

On Tue, 2004-10-12 at 01:50, Simon Derr wrote:
> > One of the cool thing about using sched_domains as your partitioning
> > element is that in reality, tasks run on *CPUs*, not *domains*.  So if
> > you have threads 'a1' & 'a2' running on CPUs 0 & 1 (small job 'a') and
> > threads 'b1' & 'b2' running on CPUs 2 & 3 (small job 'b'), you can
> > suspend threads a1, a2, b1 & b2 and remove the domains they were running
> > in to allow job A (big job with threads A1, A2, A3, & A4) to run on the
> > larger 4 CPU domain.  When you then suspend A1-A4 again to allow the
> > smaller jobs to proceed, you can pretty trivially create the 2 CPU
> > domains underneath the 4 CPU domain and resume the jobs.  Those jobs (a
> > & b) have been suspended on the CPUs they were originally running on,
> > and thus will resume on the same CPUs without any extra effort.  They
> > will simply run on those CPUs, and at load balance time, the domains
> > attached to those CPUs will be consulted to determine where the tasks
> > can be relocated to if there is a heavy load.  The domains will tell the
> > scheduler that the tasks cannot be relocated outside the 2 CPUs in each
> > respective domain.  Viola!  (sorta ;)
> Voilà ;-)

hehe...  My French spelling obviously isn't quite up to par. ;)


> I agree that this looks really smooth from a scheduler point of view.
> 
> From a user point of view, remains the issue of suspending the tasks:
> -find which tasks to suspend : how do you know that job 'a' consists 
> exactly of 'a1' and 'a2'
> -suspend them (btw, how do you achieve this ? kill -STOP ?)
> 
> 
> I've been away from my mail and still trying to catch up, nevermind if the 
> above does not makes sense to you.
> 
> 	Simon.

Paul didn't go into specifics about how to suspend the job, so neither
did I.  Sending SIGSTOP & SIGCONT should work, as you mention...  Those
are implementation details which really aren't *that* important to the
discussion.  We're still trying to figure out the overall framework and
API to work with, so which method of suspending a thread we'll
eventually use can be tackled down the road.  :)
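
For what it's worth, the signalling half is trivial.  A sketch, assuming
the batch manager already has the job's pid list (which is exactly the
open question above):

        #include <signal.h>
        #include <sys/types.h>

        /* Suspend (run == 0) or resume (run != 0) every task of a job. */
        static void set_job_running(const pid_t *pids, int npids, int run)
        {
                int i;

                for (i = 0; i < npids; i++)
                        kill(pids[i], run ? SIGCONT : SIGSTOP);
        }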

-Matt


^ permalink raw reply	[flat|nested] 233+ messages in thread

* Re: [Lse-tech] Re: [PATCH] cpusets - big numa cpu and memory placement
  2004-10-09  0:22                                                             ` Matthew Dobson
@ 2004-10-12 22:24                                                               ` Hanna Linder
  2004-10-13 20:56                                                                 ` Matthew Dobson
  0 siblings, 1 reply; 233+ messages in thread
From: Hanna Linder @ 2004-10-12 22:24 UTC (permalink / raw)
  To: colpatch, Hubertus Franke
  Cc: Paul Jackson, Rick Lindsley, Martin J. Bligh, Simon.Derr,
	pwil3058, dipankar, Andrew Morton, ckrm-tech, efocht, LSE Tech,
	hch, steiner, Jesse Barnes, sylvain.jeaugey, djh, LKML,
	Andi Kleen, sivanich

--On Friday, October 08, 2004 05:22:10 PM -0700 Matthew Dobson <colpatch@us.ibm.com> wrote:
> 
> It'd be really nice if we could all get together with a wall of
> whiteboards, some markers, and a few pots of coffee.  I think we'd all
> get this pretty much hashed out in an hour or two.  This isn't directed

I can easily set up another conference call if you all want. You can also
use the #lse channel on irc.oftc.net for talking about this.

Let me know.

Hanna 


^ permalink raw reply	[flat|nested] 233+ messages in thread

* Re: [Lse-tech] Re: [PATCH] cpusets - big numa cpu and memory placement
  2004-10-12 22:24                                                               ` [Lse-tech] " Hanna Linder
@ 2004-10-13 20:56                                                                 ` Matthew Dobson
  0 siblings, 0 replies; 233+ messages in thread
From: Matthew Dobson @ 2004-10-13 20:56 UTC (permalink / raw)
  To: Hanna Linder
  Cc: Hubertus Franke, Paul Jackson, Rick Lindsley, Martin J. Bligh,
	Simon.Derr, pwil3058, dipankar, Andrew Morton, ckrm-tech, efocht,
	LSE Tech, hch, steiner, Jesse Barnes, sylvain.jeaugey, djh, LKML,
	Andi Kleen, sivanich

On Tue, 2004-10-12 at 15:24, Hanna Linder wrote:
> --On Friday, October 08, 2004 05:22:10 PM -0700 Matthew Dobson <colpatch@us.ibm.com> wrote:
> > 
> > It'd be really nice if we could all get together with a wall of
> > whiteboards, some markers, and a few pots of coffee.  I think we'd all
> > get this pretty much hashed out in an hour or two.  This isn't directed
> 
> I can easily set up another conference call if you all want. You can also
> use the #lse channel on irc.oftc.net for talking about this.
> 
> Let me know.
> 
> Hanna 

When Paul gets back from his forced sabbatical we may well take you up
on this.

Thanks!

-Matt


^ permalink raw reply	[flat|nested] 233+ messages in thread

* Re: [Lse-tech] [PATCH] cpusets - big numa cpu and memory placement
  2004-10-07 18:13                                                             ` Martin J. Bligh
  2004-10-08  9:23                                                               ` Erich Focht
@ 2004-10-14 10:35                                                               ` Eric W. Biederman
  2004-10-14 11:22                                                                 ` Erich Focht
                                                                                   ` (2 more replies)
  1 sibling, 3 replies; 233+ messages in thread
From: Eric W. Biederman @ 2004-10-14 10:35 UTC (permalink / raw)
  To: Martin J. Bligh
  Cc: Paul Jackson, Simon.Derr, colpatch, pwil3058, frankeh, dipankar,
	akpm, ckrm-tech, efocht, lse-tech, hch, steiner, jbarnes,
	sylvain.jeaugey, djh, linux-kernel, ak, sivanich

"Martin J. Bligh" <mbligh@aracnet.com> writes:

> My main problem is that I don't think we want lots of overlapping complex 
> interfaces in the kernel. Plus I think some of the stuff proposed is fairly 
> klunky as an interface (physical binding where it's mostly not needed, and
> yes I sort of see your point about keeping jobs on separate CPUs, though I
> still think it's tenuous), and makes heavy use of stuff that doesn't work 
> well (e.g. cpus_allowed). So I'm searching for various ways to address that.

Sorry I spotted this thread late.  People seem to be looking at how things
are done on clusters and then applying them to numa machines, which I
agree looks totally backwards.

The actual application requirement (ignoring the sucky batch schedulers)
is for a group of processes (a magic process group?) to all be
simultaneously runnable.  On a cluster that is accomplished by having
an extremely stupid scheduler place one process per machine.   On a
NUMA machine you can do better because you can suspend and migrate
processes.  

The other difference on these large machines is that these compute jobs
that are cpu hogs will often have priority over all of the other
processes in the system.

A batch scheduler should be able to prevent a machine from being
overloaded by simply not putting too many processes on the machine at
a time.  Or, if a higher priority job comes in, suspending all of
the processes of some lower priority job to make room for the
new job.  Being able to swap page tables is likely a desirable feature
in that scenario so that all of the swapped-out job's resources can be
removed from memory.

> It all just seems like a lot of complexity for a fairly obscure set of
> requirements for a very limited group of users, to be honest. 

I think that is correct to some extent.  I think the requirements are
much more reasonable when people stop hanging on to the kludges they
have been using because they cannot migrate jobs, or suspend jobs
sufficiently to get out of the way of other jobs.

Martin, does enhancing the scheduler to deal with a group of processes
that all run in lock-step, usually simultaneously computing or
communicating, sound sane?  Where preempting one is effectively
preempting all of them.

I have been quite confused by this thread in that I have not seen
any mechanism that looks beyond an individual process at a time,
which seems so completely wrong.


Eric

^ permalink raw reply	[flat|nested] 233+ messages in thread

* Re: [Lse-tech] [PATCH] cpusets - big numa cpu and memory placement
  2004-10-14 10:35                                                               ` Eric W. Biederman
@ 2004-10-14 11:22                                                                 ` Erich Focht
  2004-10-14 11:23                                                                 ` Paul Jackson
  2004-10-14 19:39                                                                 ` Paul Jackson
  2 siblings, 0 replies; 233+ messages in thread
From: Erich Focht @ 2004-10-14 11:22 UTC (permalink / raw)
  To: Eric W. Biederman
  Cc: Martin J. Bligh, Paul Jackson, Simon.Derr, colpatch, pwil3058,
	frankeh, dipankar, akpm, ckrm-tech, lse-tech, hch, steiner,
	jbarnes, sylvain.jeaugey, djh, linux-kernel, ak, sivanich

On Thursday 14 October 2004 12:35, Eric W. Biederman wrote:
> Sorry I spotted this thread late. 

The thread was actually d(r)ying out...

> People seem to be looking at how things
> are done on clusters and then apply them to numa machines.  Which I agree
> looks totally backwards.  
> 
> The actual application requirement (ignoring the sucky batch schedulers)
> is for a group of processes (a magic process group?) to all be
> simultaneously runnable.  On a cluster that is accomplished by having
> an extremely stupid scheduler place one process per machine.   On a
> NUMA machine you can do better because you can suspend and migrate
> processes.  

Eric, beyond wanting all processes scheduled at the same time we also
want separation and real isolation (CPU and memory-wise) of processes
belonging to different users. The first emails in the thread describe
the requirements well. They are too complex to be simply handled by
cpus_allowed and mems_allowed masks; basically, a hierarchy is needed
in the cpusets allocation.

> > It all just seems like a lot of complexity for a fairly obscure set of
> > requirements for a very limited group of users, to be honest. 
> 
> I think that is correct to some extent.  I think the requirements are
> much more reasonable when people stop hanging on to the cludges they
> have been using because they cannot migrate jobs, or suspend
> sufficiently jobs to get out of the way of other jobs. 

Cpusets and the like have a long history originating from ccNUMA
machines.  It is not simply a matter of replicating cluster behavior.
Batch schedulers may be an inelegant solution but they are a reality
and have been used since computers were invented (more or less).

> Martin does enhancing the scheduler to deal with a group of processes 
> that all run in lock-step, usually simultaneously computing or
> communicating sound sane?  Where preempting one is effectively preempting
> all of them.
> 
> I have been quite confused by this thread in that I have not seen
> any mechanism that looks beyond an individual processes at a time,
> which seems so completely wrong.

You seem to be suggesting a gang scheduler!!! YES!!! I would love
that! But I remember that 2 years ago there were some emails from
major kernel maintainers (I don't exactly remember whom) saying that a
gang scheduler will never go into Linux. So ... here's something which
somewhat simulates that behavior. Anyhow, cpusets makes sense (for
isolation of resources) anyway, no matter whether we have gang
scheduling or not.

> Eric

Regards,
Erich



^ permalink raw reply	[flat|nested] 233+ messages in thread

* Re: [Lse-tech] [PATCH] cpusets - big numa cpu and memory placement
  2004-10-14 10:35                                                               ` Eric W. Biederman
  2004-10-14 11:22                                                                 ` Erich Focht
@ 2004-10-14 11:23                                                                 ` Paul Jackson
  2004-10-14 19:39                                                                 ` Paul Jackson
  2 siblings, 0 replies; 233+ messages in thread
From: Paul Jackson @ 2004-10-14 11:23 UTC (permalink / raw)
  To: Eric W. Biederman
  Cc: mbligh, Simon.Derr, colpatch, pwil3058, frankeh, dipankar, akpm,
	ckrm-tech, efocht, lse-tech, hch, steiner, jbarnes,
	sylvain.jeaugey, djh, linux-kernel, ak, sivanich

Eric wrote:
> I have been quite confused by this thread in that I have not seen
> any mechanism that looks beyond an individual processes at a time,
> which seems so completely wrong.

In the simplest form, we obtain the equivalent of gang scheduling for
the several threads of a tightly coupled job by arranging to have only
one runnable thread per cpu, each such thread pinned on one cpu, and all
threads in a given job simultaneously runnable.

For compute bound jobs, this is often sufficient.  Time share (to a
coarse granularity of minutes or hours) and overlap of various sized
jobs is handled using suspension and migration in order to obtain the
above invariants of one runnable thread per cpu at any given time, and
of having all threads in a tightly coupled job pinned to distinct cpus
and runnable simultaneously.

For jobs that are not compute bound, where other delays such as i/o
would allow for running more than one such job at a time (both
intermittently runnable on a finer scale of seconds), one needs
something like gang scheduling in order to keep all the threads in a
tightly coupled job running together, while still obtaining maximum
utilization of cpu/memory hardware from jobs with cpu duty cycles of
less than 50%.

The essential purpose of cpusets is to take the placement of individual
threads by the sched_setaffinity and mbind/set_mempolicy calls, and
extend that to manage placing groups of tasks on administratively
designated and controlled groups of cpus/nodes.
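
For concreteness, the per-task memory half of that placement might look
like the sketch below, using the numaif.h wrappers from libnuma (link
with -lnuma); it binds all future allocations of the calling task to
node 0:

        #include <numaif.h>     /* set_mempolicy(), MPOL_BIND */

        static int bind_mem_to_node0(void)
        {
                unsigned long nodemask = 1UL << 0;      /* node 0 only */

                /* maxnode is the number of bits in the mask passed in */
                if (set_mempolicy(MPOL_BIND, &nodemask, sizeof(nodemask) * 8) < 0)
                        return -1;
                return 0;
        }

Cpusets then manage that kind of per-task placement for whole groups of
tasks at once.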

If you see nothing beyond individual processes, then I think you are
missing that.

However, it is correct that we haven't (so far as I recall) considered
the gang scheduling that you describe.  My crystal ball says we might
get to that next year.

Gang scheduling isn't needed for the compute bound jobs, because just
running a single job at a time on a given subset of a system's cpus and
memory obtains the same result.

-- 
                          I won't rest till it's the best ...
                          Programmer, Linux Scalability
                          Paul Jackson <pj@sgi.com> 1.650.933.1373

^ permalink raw reply	[flat|nested] 233+ messages in thread

* Re: [Lse-tech] [PATCH] cpusets - big numa cpu and memory placement
  2004-10-14 10:35                                                               ` Eric W. Biederman
  2004-10-14 11:22                                                                 ` Erich Focht
  2004-10-14 11:23                                                                 ` Paul Jackson
@ 2004-10-14 19:39                                                                 ` Paul Jackson
  2004-10-14 22:38                                                                   ` Hubertus Franke
  2 siblings, 1 reply; 233+ messages in thread
From: Paul Jackson @ 2004-10-14 19:39 UTC (permalink / raw)
  To: Eric W. Biederman
  Cc: mbligh, Simon.Derr, colpatch, pwil3058, frankeh, dipankar, akpm,
	ckrm-tech, efocht, lse-tech, hch, steiner, jbarnes,
	sylvain.jeaugey, djh, linux-kernel, ak, sivanich

Kevin McMahon <n6965@sgi.com> pointed out to me a link to an interesting
article on gang scheduling:

  http://www.linuxjournal.com/article.php?sid=7690
  Issue 127: Improving Application Performance on HPC Systems with Process Synchronization
  Posted on Monday, November 01, 2004 by Paul Terry Amar Shan Pentti Huttunen

It's amazingly current - won't even be posted for another couple of weeks ;).

-- 
                          I won't rest till it's the best ...
                          Programmer, Linux Scalability
                          Paul Jackson <pj@sgi.com> 1.650.933.1373

^ permalink raw reply	[flat|nested] 233+ messages in thread

* Re: [Lse-tech] [PATCH] cpusets - big numa cpu and memory placement
  2004-10-14 19:39                                                                 ` Paul Jackson
@ 2004-10-14 22:38                                                                   ` Hubertus Franke
  2004-10-15  1:26                                                                     ` Paul Jackson
  0 siblings, 1 reply; 233+ messages in thread
From: Hubertus Franke @ 2004-10-14 22:38 UTC (permalink / raw)
  To: Paul Jackson
  Cc: Eric W. Biederman, mbligh, Simon.Derr, colpatch, pwil3058,
	dipankar, akpm, ckrm-tech, efocht, lse-tech, hch, steiner,
	jbarnes, sylvain.jeaugey, djh, linux-kernel, ak, sivanich

Paul, there are also other means for gang scheduling than having
to architect a tightly synchronized global clock into the communication
device.

Particularly, in a batch oriented environment of compute intensive
applications, one does not really need/want to switch frequently.
Often, the communication devices are memory mapped straight into the
application without OS involvement, with limited available channels.

However, as shown in previous work, gang scheduling and other
scheduling tricks (e.g. backfilling) can provide significantly higher
utilization.  So, if a high context switching rate (read interactivity)
is not required, then a network of user space scheduling daemons can be
used.

We have a slew of pubs on this. An example readup can be obtained here:

Y. Zhang, H. Franke, J. Moreira, A. Sivasubramaniam. Improving Parallel 
Job Scheduling by Combining Gang Scheduling and Backfilling Techniques. 
In Proceedings of the International Parallel and Distributed Processing 
Symposium (IPDPS), pages 113-142 May 2000.
http://www.cse.psu.edu/~anand/csl/papers/ipdps00.pdf

Or for a final sum up of that research as a journal.

Y. Zhang, H. Franke, J. Moreira, A. Sivasubramaniam. An Integrated 
Approach to Parallel Scheduling Using Gang-Scheduling, Backfilling and 
Migration. IEEE Transactions on Parallel and Distributed Systems, 
14(3):236-247, March 2003.

This was implemented for the IBM SP2 cluster and ASCI machine at 
Livermore National Lab in the late 90's.

If you are interested in short scheduling cycles, we also discovered
that, depending on the synchrony of the applications, gang scheduling
is not necessarily the best.

Y. Zhang, A. Sivasubramaniam, J. Moreira, H. Franke. A Simulation-based 
Study of Scheduling Mechanisms for a Dynamic Cluster Environment. In 
Proceedings of the ACM International Conference on Supercomputing (ICS), 
pages 100-109, May 2000. http://www.cse.psu.edu/~anand/csl/papers/ics00a.pdf

If I remember correctly, this tight gang scheduling based on slots was
already implemented on IRIX in 95/96 (I read a paper on that).

The moral of the story here is that it's unlikely that Linux will
support gang scheduling in its core anytime soon, or allow network
adapters to drive scheduling strategies.  So likely these are out.
Less frequent gang scheduling can be implemented with user level
daemons, so an adequate solution is available for most instances.

-- Hubertus

Paul Jackson wrote:

> Kevin McMahon <n6965@sgi.com> pointed out to me a link to an interesting
> article on gang scheduling:
> 
>   http://www.linuxjournal.com/article.php?sid=7690
>   Issue 127: Improving Application Performance on HPC Systems with Process Synchronization
>   Posted on Monday, November 01, 2004 by Paul Terry Amar Shan Pentti Huttunen
> 
> It's amazingly current - won't even be posted for another couple of weeks ;).
> 


^ permalink raw reply	[flat|nested] 233+ messages in thread

* Re: [Lse-tech] [PATCH] cpusets - big numa cpu and memory placement
  2004-10-14 22:38                                                                   ` Hubertus Franke
@ 2004-10-15  1:26                                                                     ` Paul Jackson
  0 siblings, 0 replies; 233+ messages in thread
From: Paul Jackson @ 2004-10-15  1:26 UTC (permalink / raw)
  To: Hubertus Franke
  Cc: ebiederm, mbligh, Simon.Derr, colpatch, pwil3058, dipankar, akpm,
	ckrm-tech, efocht, lse-tech, hch, steiner, jbarnes,
	sylvain.jeaugey, djh, linux-kernel, ak, sivanich

Hubertus wrote:
> Paul, there are also other means for gang scheduling then having
> to architect a tightly synchronized global clock into the communication 
> device.

We agree.  

My reply to the post of Eric W. Biederman at the start of this
sub-thread began:

> In the simplest form, we obtain the equivalent of gang scheduling for
> the several threads of a tightly coupled job by arranging to have only
> one runnable thread per cpu, each such thread pinned on one cpu, and all
> threads in a given job simultaneously runnable.
> 
> For compute bound jobs, this is often sufficient. 

You reply adds substantial detail and excellent references.

Thank-you.

-- 
                          I won't rest till it's the best ...
                          Programmer, Linux Scalability
                          Paul Jackson <pj@sgi.com> 1.650.933.1373

^ permalink raw reply	[flat|nested] 233+ messages in thread

* Re: [Lse-tech] [PATCH] cpusets - big numa cpu and memory placement
  2004-10-05 22:19                               ` Matthew Dobson
                                                   ` (2 preceding siblings ...)
  2004-10-06  8:02                                 ` Simon Derr
@ 2005-02-07 23:59                                 ` Matthew Dobson
  2005-02-08  0:20                                   ` Andrew Morton
                                                     ` (2 more replies)
  3 siblings, 3 replies; 233+ messages in thread
From: Matthew Dobson @ 2005-02-07 23:59 UTC (permalink / raw)
  To: Martin J. Bligh, Paul Jackson, pwil3058, frankeh, dipankar,
	Andrew Morton, ckrm-tech, efocht, LSE Tech, hch, steiner,
	Jesse Barnes, sylvain.jeaugey, djh, LKML, Simon.Derr, Andi Kleen,
	sivanich

Matthew Dobson wrote:
> On Sun, 2004-10-03 at 16:53, Martin J. Bligh wrote:
> 
>>>Martin wrote:
>>>
>>>>Matt had proposed having a separate sched_domain tree for each cpuset, which
>>>>made a lot of sense, but seemed harder to do in practice because "exclusive"
>>>>in cpusets doesn't really mean exclusive at all.
>>>
>>>See my comments on this from yesterday on this thread.
>>>
>>>I suspect we don't want a distinct sched_domain for each cpuset, but
>>>rather a sched_domain for each of several entire subtrees of the cpuset
>>>hierarchy, such that every CPU is in exactly one such sched domain, even
>>>though it be in several cpusets in that sched_domain.
>>
>>Mmmm. The fundamental problem I think we ran across (just whilst pondering,
>>not in code) was that some things (eg ... init) are bound to ALL cpus (or
>>no cpus, depending how you word it); i.e. they're created before the cpusets
>>are, and are a member of the grand-top-level-uber-master-thingummy.
>>
>>How do you service such processes? That's what I meant by the exclusive
>>domains aren't really exclusive. 
>>
>>Perhaps Matt can recall the problems better. I really liked his idea, aside
>>from the small problem that it didn't seem to work ;-)
> 
> 
> Well that doesn't seem like a fair statement.  It's potentially true,
> but it's really hard to say without an implementation! ;)
> 
> I think that the idea behind cpusets is really good, essentially
> creating isolated areas of CPUs and memory for tasks to run
> undisturbed.  I feel that the actual implementation, however, is taking
> a wrong approach, because it attempts to use the cpus_allowed mask to
> override the scheduler in the general case.  cpus_allowed, in my
> estimation, is meant to be used as the exception, not the rule.  If we
> wish to change that, we need to make the scheduler more aware of it, so
> it can do the right thing(tm) in the presence of numerous tasks with
> varying cpus_allowed masks.  The other option is to implement cpusets in
> a way that doesn't use cpus_allowed.  That is the option that I am
> pursuing.  
> 
> My idea is to make sched_domains much more flexible and dynamic.  By
> adding locking and reference counting, and simplifying the way in which
> sched_domains are created, linked, unlinked and eventually destroyed we
> can use sched_domains as the implementation of cpusets.  IA64 already
> allows multiple sched_domains trees without a shared top-level domain. 
> My proposal is to make this functionality more generally available. 
> Extending the "isolated domains" concept a little further will buy us
> most (all?) the functionality of "exclusive" cpusets without the need to
> use cpus_allowed at all.
> 
> I've got some code.  I'm in the midst of pushing it forward to rc3-mm2. 
> I'll post an RFC later today or tomorrow when it's cleaned up.
> 
> -Matt

Sorry to reply to a long quiet thread, but I've been trading emails with Paul
Jackson on this subject recently, and I've been unable to convince either him 
or myself that merging CPUSETs and CKRM is as easy as I once believed.  I'm 
still convinced the CPU side is doable, but I haven't managed as much success 
with the memory binding side of CPUSETs.  In light of this, I'd like to remove 
my previous objections to CPUSETs moving forward.  If others still have things 
they want discussed before CPUSETs moves into mainline, that's fine, but it 
seems to me that CPUSETs offer legitimate functionality and that the code has 
certainly "done its time" in -mm to convince me it's stable and usable.

-Matt

^ permalink raw reply	[flat|nested] 233+ messages in thread

* Re: [Lse-tech] [PATCH] cpusets - big numa cpu and memory placement
  2005-02-07 23:59                                 ` Matthew Dobson
@ 2005-02-08  0:20                                   ` Andrew Morton
  2005-02-08  0:34                                     ` Paul Jackson
  2005-02-08  9:54                                   ` Dinakar Guniguntala
  2005-02-08 16:15                                   ` Martin J. Bligh
  2 siblings, 1 reply; 233+ messages in thread
From: Andrew Morton @ 2005-02-08  0:20 UTC (permalink / raw)
  To: Matthew Dobson
  Cc: mbligh, pj, pwil3058, frankeh, dipankar, ckrm-tech, efocht,
	lse-tech, hch, steiner, jbarnes, sylvain.jeaugey, djh,
	linux-kernel, Simon.Derr, ak, sivanich

Matthew Dobson <colpatch@us.ibm.com> wrote:
>
> Sorry to reply a long quiet thread,

Is appreciated, thanks.

> but I've been trading emails with Paul 
> Jackson on this subject recently, and I've been unable to convince either him 
> or myself that merging CPUSETs and CKRM is as easy as I once believed.  I'm 
> still convinced the CPU side is doable, but I haven't managed as much success 
> with the memory binding side of CPUSETs.  In light of this, I'd like to remove 
> my previous objections to CPUSETs moving forward.  If others still have things 
> they want discussed before CPUSETs moves into mainline, that's fine, but it 
> seems to me that CPUSETs offer legitimate functionality and that the code has 
> certainly "done its time" in -mm to convince me it's stable and usable.

OK, I'll add cpusets to the 2.6.12 queue.

going once, going twice...

^ permalink raw reply	[flat|nested] 233+ messages in thread

* Re: [Lse-tech] [PATCH] cpusets - big numa cpu and memory placement
  2005-02-08  0:20                                   ` Andrew Morton
@ 2005-02-08  0:34                                     ` Paul Jackson
  0 siblings, 0 replies; 233+ messages in thread
From: Paul Jackson @ 2005-02-08  0:34 UTC (permalink / raw)
  To: Andrew Morton
  Cc: colpatch, mbligh, pwil3058, frankeh, dipankar, ckrm-tech, efocht,
	lse-tech, hch, steiner, jbarnes, sylvain.jeaugey, djh,
	linux-kernel, Simon.Derr, ak, sivanich

Andrew wrote:
> OK, I'll add cpusets to the 2.6.12 queue.

I'd like that ;).

Thank-you, Matthew, for the work you put into making sense of this.

-- 
                  I won't rest till it's the best ...
                  Programmer, Linux Scalability
                  Paul Jackson <pj@sgi.com> 1.650.933.1373, 1.925.600.0401

^ permalink raw reply	[flat|nested] 233+ messages in thread

* Re: [Lse-tech] [PATCH] cpusets - big numa cpu and memory placement
  2005-02-08  9:54                                   ` Dinakar Guniguntala
@ 2005-02-08  9:49                                     ` Nick Piggin
  2005-02-08 16:13                                       ` Martin J. Bligh
  2005-02-08 19:32                                       ` Matthew Dobson
  2005-02-08 19:00                                     ` Matthew Dobson
  1 sibling, 2 replies; 233+ messages in thread
From: Nick Piggin @ 2005-02-08  9:49 UTC (permalink / raw)
  To: dino
  Cc: Matthew Dobson, Martin J. Bligh, Paul Jackson, pwil3058, frankeh,
	dipankar, Andrew Morton, ckrm-tech, efocht, LSE Tech, hch,
	steiner, Jesse Barnes, sylvain.jeaugey, djh, LKML, Simon.Derr,
	Andi Kleen, sivanich

Dinakar Guniguntala wrote:
> On Mon, Feb 07, 2005 at 03:59:49PM -0800, Matthew Dobson wrote:
> 
> 
>>Sorry to reply a long quiet thread, but I've been trading emails with Paul 
>>Jackson on this subject recently, and I've been unable to convince either 
>>him or myself that merging CPUSETs and CKRM is as easy as I once believed.  
>>I'm still convinced the CPU side is doable, but I haven't managed as much 
>>success with the memory binding side of CPUSETs.  In light of this, I'd 
>>like to remove my previous objections to CPUSETs moving forward.  If others 
>>still have things they want discussed before CPUSETs moves into mainline, 
>>that's fine, but it seems to me that CPUSETs offer legitimate functionality 
>>and that the code has certainly "done its time" in -mm to convince me it's 
>>stable and usable.
>>
>>-Matt
>>
> 
> 
> What about your proposed sched domain changes?
> Cant sched domains be used handle the CPU groupings and the
> existing code in cpusets that handle memory continue as is?
> Weren't sched somains supposed to give the scheduler better knowledge
> of the CPU groupings afterall ?
> 

sched domains can provide non overlapping top level partitions.
It would basically just stop the multiprocessor balancing from
moving tasks between these partitions (they would be manually
moved by setting explicit cpu affinities).

I didn't really follow where that idea went, but I think at least
a few people thought that sort of functionality wasn't nearly
fancy enough! :)


^ permalink raw reply	[flat|nested] 233+ messages in thread

* Re: [Lse-tech] [PATCH] cpusets - big numa cpu and memory placement
  2005-02-07 23:59                                 ` Matthew Dobson
  2005-02-08  0:20                                   ` Andrew Morton
@ 2005-02-08  9:54                                   ` Dinakar Guniguntala
  2005-02-08  9:49                                     ` Nick Piggin
  2005-02-08 19:00                                     ` Matthew Dobson
  2005-02-08 16:15                                   ` Martin J. Bligh
  2 siblings, 2 replies; 233+ messages in thread
From: Dinakar Guniguntala @ 2005-02-08  9:54 UTC (permalink / raw)
  To: Matthew Dobson
  Cc: Martin J. Bligh, Paul Jackson, pwil3058, frankeh, dipankar,
	Andrew Morton, ckrm-tech, efocht, LSE Tech, hch, steiner,
	Jesse Barnes, sylvain.jeaugey, djh, LKML, Simon.Derr, Andi Kleen,
	sivanich

On Mon, Feb 07, 2005 at 03:59:49PM -0800, Matthew Dobson wrote:

> Sorry to reply a long quiet thread, but I've been trading emails with Paul 
> Jackson on this subject recently, and I've been unable to convince either 
> him or myself that merging CPUSETs and CKRM is as easy as I once believed.  
> I'm still convinced the CPU side is doable, but I haven't managed as much 
> success with the memory binding side of CPUSETs.  In light of this, I'd 
> like to remove my previous objections to CPUSETs moving forward.  If others 
> still have things they want discussed before CPUSETs moves into mainline, 
> that's fine, but it seems to me that CPUSETs offer legitimate functionality 
> and that the code has certainly "done its time" in -mm to convince me it's 
> stable and usable.
> 
> -Matt
> 

What about your proposed sched domain changes?
Can't sched domains be used to handle the CPU groupings, while the
existing code in cpusets that handles memory continues as is?
Weren't sched domains supposed to give the scheduler better knowledge
of the CPU groupings after all?

Regards,

Dinakar

^ permalink raw reply	[flat|nested] 233+ messages in thread

* Re: [Lse-tech] [PATCH] cpusets - big numa cpu and memory placement
  2005-02-08  9:49                                     ` Nick Piggin
@ 2005-02-08 16:13                                       ` Martin J. Bligh
  2005-02-08 23:26                                         ` Nick Piggin
  2005-02-08 19:32                                       ` Matthew Dobson
  1 sibling, 1 reply; 233+ messages in thread
From: Martin J. Bligh @ 2005-02-08 16:13 UTC (permalink / raw)
  To: Nick Piggin, dino
  Cc: Matthew Dobson, Paul Jackson, pwil3058, frankeh, dipankar,
	Andrew Morton, ckrm-tech, efocht, LSE Tech, hch, steiner,
	Jesse Barnes, sylvain.jeaugey, djh, LKML, Simon.Derr, Andi Kleen,
	sivanich

>> What about your proposed sched domain changes?
>> Can't sched domains be used to handle the CPU groupings, while the
>> existing code in cpusets that handles memory continues as is?
>> Weren't sched domains supposed to give the scheduler better knowledge
>> of the CPU groupings after all?
>> 
> 
> sched domains can provide non overlapping top level partitions.
> It would basically just stop the multiprocessor balancing from
> moving tasks between these partitions (they would be manually
> moved by setting explicit cpu affinities).
> 
> I didn't really follow where that idea went, but I think at least
> a few people thought that sort of functionality wasn't nearly
> fancy enough! :)

Not fancy seems like a positive thing to me ;-)

M.


^ permalink raw reply	[flat|nested] 233+ messages in thread

* Re: [Lse-tech] [PATCH] cpusets - big numa cpu and memory placement
  2005-02-07 23:59                                 ` Matthew Dobson
  2005-02-08  0:20                                   ` Andrew Morton
  2005-02-08  9:54                                   ` Dinakar Guniguntala
@ 2005-02-08 16:15                                   ` Martin J. Bligh
  2005-02-08 22:17                                     ` Matthew Dobson
  2 siblings, 1 reply; 233+ messages in thread
From: Martin J. Bligh @ 2005-02-08 16:15 UTC (permalink / raw)
  To: Matthew Dobson, Paul Jackson, pwil3058, frankeh, dipankar,
	Andrew Morton, ckrm-tech, efocht, LSE Tech, hch, steiner,
	Jesse Barnes, sylvain.jeaugey, djh, LKML, Simon.Derr, Andi Kleen,
	sivanich

> Sorry to reply a long quiet thread, but I've been trading emails with 
> Paul Jackson on this subject recently, and I've been unable to convince 
> either him or myself that merging CPUSETs and CKRM is as easy as I once 
> believed.  I'm still convinced the CPU side is doable, but I haven't 
> managed as much success with the memory binding side of CPUSETs.  

Can you describe what the difficulty is with the mem binding side?

Thanks,

M.

PS. If you could also make your mailer line-wrap, that'd be splendid ;-)


^ permalink raw reply	[flat|nested] 233+ messages in thread

* Re: [Lse-tech] [PATCH] cpusets - big numa cpu and memory placement
  2005-02-08  9:54                                   ` Dinakar Guniguntala
  2005-02-08  9:49                                     ` Nick Piggin
@ 2005-02-08 19:00                                     ` Matthew Dobson
  2005-02-08 20:42                                       ` Paul Jackson
  1 sibling, 1 reply; 233+ messages in thread
From: Matthew Dobson @ 2005-02-08 19:00 UTC (permalink / raw)
  To: dino
  Cc: Martin J. Bligh, Paul Jackson, pwil3058, frankeh, dipankar,
	Andrew Morton, ckrm-tech, efocht, LSE Tech, hch, steiner,
	Jesse Barnes, sylvain.jeaugey, djh, LKML, Simon.Derr, Andi Kleen,
	sivanich

Dinakar Guniguntala wrote:
> On Mon, Feb 07, 2005 at 03:59:49PM -0800, Matthew Dobson wrote:
> 
> 
>>Sorry to reply a long quiet thread, but I've been trading emails with Paul 
>>Jackson on this subject recently, and I've been unable to convince either 
>>him or myself that merging CPUSETs and CKRM is as easy as I once believed.  
>>I'm still convinced the CPU side is doable, but I haven't managed as much 
>>success with the memory binding side of CPUSETs.  In light of this, I'd 
>>like to remove my previous objections to CPUSETs moving forward.  If others 
>>still have things they want discussed before CPUSETs moves into mainline, 
>>that's fine, but it seems to me that CPUSETs offer legitimate functionality 
>>and that the code has certainly "done its time" in -mm to convince me it's 
>>stable and usable.
>>
>>-Matt
>>
> 
> 
> What about your proposed sched domain changes?
> Can't sched domains be used to handle the CPU groupings, while the
> existing code in cpusets that handles memory continues as is?
> Weren't sched domains supposed to give the scheduler better knowledge
> of the CPU groupings after all?
> 
> Regards,
> 
> Dinakar

Yes.  I still think that there is room for merging on the CPU scheduling side 
between CPUSETs and sched domains, and I will continue to work on that aspect. 
  The reason Paul and I decided that they weren't totally reconcilable is 
because of the memory binding side of the CPUSETs code.

-Matt

^ permalink raw reply	[flat|nested] 233+ messages in thread

* Re: [Lse-tech] [PATCH] cpusets - big numa cpu and memory placement
  2005-02-08  9:49                                     ` Nick Piggin
  2005-02-08 16:13                                       ` Martin J. Bligh
@ 2005-02-08 19:32                                       ` Matthew Dobson
  2005-02-09  2:53                                         ` Nick Piggin
  1 sibling, 1 reply; 233+ messages in thread
From: Matthew Dobson @ 2005-02-08 19:32 UTC (permalink / raw)
  To: Nick Piggin
  Cc: dino, Martin J. Bligh, Paul Jackson, pwil3058, frankeh, dipankar,
	Andrew Morton, ckrm-tech, efocht, LSE Tech, hch, steiner,
	Jesse Barnes, sylvain.jeaugey, djh, LKML, Simon.Derr, Andi Kleen,
	sivanich

Nick Piggin wrote:
> Dinakar Guniguntala wrote:
> 
>> On Mon, Feb 07, 2005 at 03:59:49PM -0800, Matthew Dobson wrote:
>>
>>
>>> Sorry to reply a long quiet thread, but I've been trading emails with 
>>> Paul Jackson on this subject recently, and I've been unable to 
>>> convince either him or myself that merging CPUSETs and CKRM is as 
>>> easy as I once believed.  I'm still convinced the CPU side is doable, 
>>> but I haven't managed as much success with the memory binding side of 
>>> CPUSETs.  In light of this, I'd like to remove my previous objections 
>>> to CPUSETs moving forward.  If others still have things they want 
>>> discussed before CPUSETs moves into mainline, that's fine, but it 
>>> seems to me that CPUSETs offer legitimate functionality and that the 
>>> code has certainly "done its time" in -mm to convince me it's stable 
>>> and usable.
>>>
>>> -Matt
>>>
>>
>>
>> What about your proposed sched domain changes?
>> Can't sched domains be used to handle the CPU groupings, while the
>> existing code in cpusets that handles memory continues as is?
>> Weren't sched domains supposed to give the scheduler better knowledge
>> of the CPU groupings after all?
>>
> 
> sched domains can provide non overlapping top level partitions.
> It would basically just stop the multiprocessor balancing from
> moving tasks between these partitions (they would be manually
> moved by setting explicit cpu affinities).

Yep.  That's the idea! :)


> I didn't really follow where that idea went, but I think at least
> a few people thought that sort of functionality wasn't nearly
> fancy enough! :)

Well, that's about how far the idea was supposed to go. ;)  I think named 
hierarchical sched_domains would offer the same functionality (at least for CPU 
partitioning) as CPUSETs.  I'm not sure who didn't think it was fancy enough, 
but if you or anyone else can describe CPUSETs configurations that couldn't be 
represented by sched_domains trees, I'd be very curious to hear about them.

-Matt

^ permalink raw reply	[flat|nested] 233+ messages in thread

* Re: [Lse-tech] [PATCH] cpusets - big numa cpu and memory placement
  2005-02-08 19:00                                     ` Matthew Dobson
@ 2005-02-08 20:42                                       ` Paul Jackson
  2005-02-08 22:14                                         ` Matthew Dobson
  2005-02-09 17:59                                         ` [ckrm-tech] " Chandra Seetharaman
  0 siblings, 2 replies; 233+ messages in thread
From: Paul Jackson @ 2005-02-08 20:42 UTC (permalink / raw)
  To: Matthew Dobson
  Cc: dino, mbligh, pwil3058, frankeh, dipankar, akpm, ckrm-tech,
	efocht, lse-tech, hch, steiner, jbarnes, sylvain.jeaugey, djh,
	linux-kernel, Simon.Derr, ak, sivanich

Matthew wrote:
>   The reason Paul and I decided that they weren't totally reconcilable is 
> because of the memory binding side of the CPUSETs code.

Speak for yourself, Matthew ;).

I agree with you that the scheduler experts (I'm not one, nor do I aspire to
be one) may well find that it makes sense someday to better integrate
scheduler domains and cpusets.  It seems a little inefficient on the
surface for scheduler domain code to spend time trying to choose the
best task to run on a CPU, only to find out that the chosen task is not
allowed, because that task's cpus_allowed does not allow execution on the
intended CPU.  Since in some systems, cpusets will provide a better
indication of the natural clustering of various cpus_allowed values than
a simple boottime hierarchical partitioning of the system, it makes
sense to me that there might be a way to improve the integration of
cpusets and scheduler domains, at least as an option on systems that are
making heavy use of cpusets.  This might have the downside of making
sched domains more dynamic than they are now, which might cost more
performance than it gained.  Others will have to evaluate those
tradeoffs.
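
(To make that check concrete, here is a tiny userspace model -- a plain
bitmask standing in for cpumask_t, all names invented, certainly not the
scheduler's actual code:)

    #include <stdio.h>

    struct task {
        const char *comm;
        unsigned long cpus_allowed;   /* bit N set => may run on CPU N */
    };

    /* The last-minute test the balancer ends up doing. */
    static int allowed_on(const struct task *t, int cpu)
    {
        return (t->cpus_allowed >> cpu) & 1UL;
    }

    int main(void)
    {
        struct task t = { "worker", 0x3UL };   /* confined to CPUs 0 and 1 */

        if (!allowed_on(&t, 5))
            printf("%s chosen for CPU 5, but cpus_allowed forbids it\n", t.comm);
        return 0;
    }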

But when you write the phrase "they weren't totally reconcilable,"
I presume you mean "cpusets and CKRM weren't totally reconcilable."

I would come close to turning this phrasing around, and state that
they were (nearly) totally unreconcilable <grin>.

I found no useful and significant basis for integration of cpusets and
CKRM either involving CPU or Memory Node management.

As best as I can figure out, CKRM is a fair share scheduler with a
gussied up more modular architecture, so that the components to track
usage, control (throttle) tasks, and classify tasks are separate
plugins.  I can find no significant and useful overlap on any of these
fronts, either the existing plugins or their infrastructure, with what
cpusets has and needs.

There are claims that CKRM has some generalized resource management
architecture that should be able to handle cpusets needs, but despite my
repeated (albeit not entirely successful) efforts to find documentation
and read source and my pleadings with Matthew and earlier on this
thread, I was never able to figure out what this meant, or find anything
that could profitably integrate with cpusets.

In sum -- I see a potential for useful integration of cpusets and
scheduler domains, I'll have to leave it up to those with expertise in
the scheduler to evaluate and perhaps accomplish this.  I do not see any
useful integration of cpusets and CKRM.

I continue to be befuddled as to why, Matthew, you confound potential
cpuset-scheddomain integration with potential cpuset-CKRM integration.
Scheduler domains and CKRM are distinct beasts, in my book, and the
contemplations of cpuset integration with these two beasts are also
distinct efforts.

And cpusets and CKRM are distinct beasts.

But I repeat myself ...

-- 
                  I won't rest till it's the best ...
                  Programmer, Linux Scalability
                  Paul Jackson <pj@sgi.com> 1.650.933.1373, 1.925.600.0401

^ permalink raw reply	[flat|nested] 233+ messages in thread

* Re: [Lse-tech] [PATCH] cpusets - big numa cpu and memory placement
  2005-02-08 20:42                                       ` Paul Jackson
@ 2005-02-08 22:14                                         ` Matthew Dobson
  2005-02-08 23:58                                           ` Shailabh Nagar
  2005-02-09  0:24                                           ` Paul Jackson
  2005-02-09 17:59                                         ` [ckrm-tech] " Chandra Seetharaman
  1 sibling, 2 replies; 233+ messages in thread
From: Matthew Dobson @ 2005-02-08 22:14 UTC (permalink / raw)
  To: Paul Jackson
  Cc: dino, mbligh, pwil3058, frankeh, dipankar, akpm, ckrm-tech,
	efocht, lse-tech, hch, steiner, jbarnes, sylvain.jeaugey, djh,
	linux-kernel, Simon.Derr, ak, sivanich

Paul Jackson wrote:
> Matthew wrote:
> 
>>  The reason Paul and I decided that they weren't totally reconcilable is 
>>because of the memory binding side of the CPUSETs code.
> 
> 
> Speak for yourself, Matthew ;).
> 
> I agree with you that the scheduler experts (I'm not one, nor do I aspire to
> be one) may well find that it makes sense someday to better integrate
> scheduler domains and cpusets.  It seems a little inefficient on the
> surface for scheduler domain code to spend time trying to choose the
> best task to run on a CPU, only to find out that the chosen task is not
> allowed, because that task's cpus_allowed does not allow execution on the
> intended CPU.  Since in some systems, cpusets will provide a better
> indication of the natural clustering of various cpus_allowed values than
> a simple boottime hierarchical partitioning of the system, it makes
> sense to me that there might be a way to improve the integration of
> cpusets and scheduler domains, at least as an option on systems that are
> making heavy use of cpusets.  This might have the downside of making
> sched domains more dynamic than they are now, which might cost more
> performance than it gained.  Others will have to evaluate those
> tradeoffs.

Indeed.  There are tradeoffs involved in changing sched_domains from a single 
static, boot-time setup to a more dynamic, configurable setup.  Most notably 
the inevitable locking necessary to ensure a consistent view of the domain 
trees.  Those tradeoffs, design decisions, etc. are fodder for another thread.


> But when you write the phrase "they weren't totally reconcilable,"
> I presume you mean "cpusets and CKRM weren't totally reconcilable."
> 
> I would come close to turning this phrasing around, and state that
> they were (nearly) totally unreconcilable <grin>.
> 
> I found no useful and significant basis for integration of cpusets and
> CKRM either involving CPU or Memory Node management.

Yes, I misspoke.  I should have been more clear that CKRM and CPUSETs (seem) to 
be unreconcilable.  Sched_domains and CPUSETs (seem) to have some potential 
functionality overlap that leads me to (still) believe there is hope to 
integrate these two systems.


> As best as I can figure out, CKRM is a fair share scheduler with a
> gussied up more modular architecture, so that the components to track
> usage, control (throttle) tasks, and classify tasks are separate
> plugins.  I can find no significant and useful overlap on any of these
> fronts, either the existing plugins or their infrastructure, with what
> cpusets has and needs.
> 
> There are claims that CKRM has some generalized resource management
> architecture that should be able to handle cpusets needs, but despite my
> repeated (albeit not entirely successful) efforts to find documentation
> and read source and my pleadings with Matthew and earlier on this
> thread, I was never able to figure out what this meant, or find anything
> that could profitably integrate with cpusets.
> 
> In sum -- I see a potential for useful integration of cpusets and
> scheduler domains, I'll have to leave it up to those with expertise in
> the scheduler to evaluate and perhaps accomplish this.  I do not see any
> useful integration of cpusets and CKRM.

I'm not an expert on CKRM, so I'll leave the refuting (or not refuting) of your 
claims as to CKRM's usefulness to someone with more background and expertise on 
the subject.  Anyone want to pipe up and defend the alleged "gussied up" 
fair-share scheduler?


> I continue to be befuddled as to why, Matthew, you confound potential
> cpuset-scheddomain integration with potential cpuset-CKRM integration.
> Scheduler domains and CKRM are distinct beasts, in my book, and the
> contemplations of cpuset integration with these two beasts are also
> distinct efforts.
> 
> And cpusets and CKRM are distinct beasts.

My clever attempts to befuddle you have obviously succeeded beyond my wildest 
dreams, Paul.  You are now mired in a web of acronyms with no way out.  You may 
be eaten by a grue. :p


> But I repeat myself ...

It's the surest way to get someone to hear you, right!? ;)

-Matt

^ permalink raw reply	[flat|nested] 233+ messages in thread

* Re: [Lse-tech] [PATCH] cpusets - big numa cpu and memory placement
  2005-02-08 16:15                                   ` Martin J. Bligh
@ 2005-02-08 22:17                                     ` Matthew Dobson
  0 siblings, 0 replies; 233+ messages in thread
From: Matthew Dobson @ 2005-02-08 22:17 UTC (permalink / raw)
  To: Martin J. Bligh
  Cc: Paul Jackson, pwil3058, frankeh, dipankar, Andrew Morton,
	ckrm-tech, efocht, LSE Tech, hch, steiner, Jesse Barnes,
	sylvain.jeaugey, djh, LKML, Simon.Derr, Andi Kleen, sivanich

Martin J. Bligh wrote:
>>Sorry to reply a long quiet thread, but I've been trading emails with 
>>Paul Jackson on this subject recently, and I've been unable to convince 
>>either him or myself that merging CPUSETs and CKRM is as easy as I once 
>>believed.  I'm still convinced the CPU side is doable, but I haven't 
>>managed as much success with the memory binding side of CPUSETs.  
> 
> 
> Can you describe what the difficulty is with the mem binding side?

Well, basically we need to ensure that when CPUSETs are marked "mems_exclusive" 
that no one else in the system is allowed to allocate from those "exclusive" 
nodes.  This can't be guaranteed without hooks in the allocation code much like 
what Paul has already written in his CPUSETs patchset.
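
Roughly, the shape of the hook that is needed -- sketched here as a userspace
model with a plain bitmask standing in for the kernel's node mask, not Paul's
actual patch:

    #include <stdio.h>

    #define MAX_NODES 8

    /* bit N set => the requesting task may allocate from node N */
    static unsigned long mems_allowed = 0x3UL;   /* nodes 0 and 1 only */

    static int node_allowed(int node)
    {
        return (mems_allowed >> node) & 1UL;
    }

    /* Walk nodes in preference order, skipping ones the task may not use. */
    static int pick_node(void)
    {
        int node;

        for (node = 0; node < MAX_NODES; node++)
            if (node_allowed(node))
                return node;
        return -1;   /* no allowed node: the allocation would fail */
    }

    int main(void)
    {
        printf("allocating from node %d\n", pick_node());
        return 0;
    }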

> Thanks,
> 
> M.
> 
> PS. If you could also make your mailer line-wrap, that'd be splendid ;-)

I believe my mailer is line-wrapping correctly, but it's hard to be sure 
without feedback.  I switched to Thunderbird last week, and I think I've 
(un)checked all the appropriate boxes.  And yes, line wrapping is splendid. 
Splendiferous, even.

-Matt

^ permalink raw reply	[flat|nested] 233+ messages in thread

* Re: [Lse-tech] [PATCH] cpusets - big numa cpu and memory placement
  2005-02-08 16:13                                       ` Martin J. Bligh
@ 2005-02-08 23:26                                         ` Nick Piggin
  2005-02-09  4:23                                           ` Paul Jackson
  0 siblings, 1 reply; 233+ messages in thread
From: Nick Piggin @ 2005-02-08 23:26 UTC (permalink / raw)
  To: Martin J. Bligh
  Cc: dino, Matthew Dobson, Paul Jackson, pwil3058, frankeh, dipankar,
	Andrew Morton, ckrm-tech, efocht, LSE Tech, hch, steiner,
	Jesse Barnes, sylvain.jeaugey, djh, LKML, Simon.Derr, Andi Kleen,
	sivanich

Martin J. Bligh wrote:
>>>What about your proposed sched domain changes?
>>>Can't sched domains be used to handle the CPU groupings, while the
>>>existing code in cpusets that handles memory continues as is?
>>>Weren't sched domains supposed to give the scheduler better knowledge
>>>of the CPU groupings after all?
>>>
>>
>>sched domains can provide non overlapping top level partitions.
>>It would basically just stop the multiprocessor balancing from
>>moving tasks between these partitions (they would be manually
>>moved by setting explicit cpu affinities).
>>
>>I didn't really follow where that idea went, but I think at least
>>a few people thought that sort of functionality wasn't nearly
>>fancy enough! :)
> 
> 
> Not fancy seems like a positive thing to me ;-)
> 

Yes :)

I was thinking the sched domains soft-partitioning could be a
useful feature in its own right, considering the runtime impact
would be exactly zero, and the setup code should already be mostly
there.

If anyone was interested, I could try to cook up an implementation
on the scheduler side. The biggest issues may be the userspace
interface and a decent userspace management tool.


^ permalink raw reply	[flat|nested] 233+ messages in thread

* Re: [Lse-tech] [PATCH] cpusets - big numa cpu and memory placement
  2005-02-08 22:14                                         ` Matthew Dobson
@ 2005-02-08 23:58                                           ` Shailabh Nagar
  2005-02-09  0:27                                             ` Paul Jackson
  2005-02-09  0:24                                           ` Paul Jackson
  1 sibling, 1 reply; 233+ messages in thread
From: Shailabh Nagar @ 2005-02-08 23:58 UTC (permalink / raw)
  To: Matthew Dobson
  Cc: Paul Jackson, dino, mbligh, pwil3058, frankeh, dipankar, akpm,
	ckrm-tech, efocht, lse-tech, hch, steiner, jbarnes,
	sylvain.jeaugey, djh, linux-kernel, Simon.Derr, ak, sivanich


>> As best as I can figure out, CKRM is a fair share scheduler with a
>> gussied up more modular architecture, so that the components to track
>> usage, control (throttle) tasks, and classify tasks are separate
>> plugins.  

 > I'm not an expert on CKRM, so I'll leave the refuting (or not refuting)
 > of your claims as to CKRM's usefulness to someone with more background
 > and expertise on the subject.  Anyone want to pipe up and defend the
 > alleged "gussied up" fair-share scheduler?

Well, I'm not sure I want to minutely examine Paul's choice of words!
I would have thought that two OLS presentations and one KS presentation
would suffice to clarify what CKRM is and isn't, but that doesn't seem
to be the case :-)  So here we go again.

CKRM is both a resource management infrastructure AND
a set of controllers. The two are independent.

The infrastructure provides for
a) grouping of kernel objects (currently only tasks & sockets but can be 
extended to any others)
b) an external interface for manipulating attributes of the grouping 
such as shares, statistics and members
c) an internal interface for controllers to exploit this grouping 
information in whatever way it wants.

The controllers do whatever they want with the grouping info.
The IBM folks on the project have written ONE set of controllers for 
cpu, mem, io, net and numtasks which HAPPENS to be/aspire to be 
fair-share. Others are free to write ones which ignore share settings 
and be unfair, callous or whatever else they want.

We would love to have people write alternate controllers for the same 
resources (cpu,mem,io,net,numtasks) and others. The former will provide 
alternatives to our implementation, the latter will validate the 
architecture's utility.
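
To illustrate that split, here is a purely hypothetical userspace sketch --
none of these names are CKRM's real symbols.  The core owns the grouping and
its externally visible attributes; a controller plugs in and decides what
those attributes mean (fair-share, or anything else):

    #include <stdio.h>

    struct class_group {
        const char *name;
        long share;                 /* attribute set from user space */
    };

    struct controller {
        const char *name;
        void (*share_changed)(struct class_group *grp);
    };

    static void fair_share_changed(struct class_group *grp)
    {
        printf("controller sees %s share -> %ld\n", grp->name, grp->share);
    }

    static struct controller cpu_fairshare = { "cpu-fairshare", fair_share_changed };

    int main(void)
    {
        struct class_group grp = { "batch", 0 };

        grp.share = 40;                       /* core applies a user-space write */
        cpu_fairshare.share_changed(&grp);    /* and notifies the controller */
        return 0;
    }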


>> I can find no significant and useful overlap on any of these
>> fronts, either the existing plugins or their infrastructure, with what
>> cpusets has and needs.
>> There are claims that CKRM has some generalized resource management
>> architecture that should be able to handle cpusets needs, but despite my
>> repeated (albeit not entirely successful) efforts to find documentation
>> and read source and my pleadings with Matthew and earlier on this
>> thread, I was never able to figure out what this meant, or find anything
>> that could profitably integrate with cpusets.

Rereading the earlier posts on the thread, I'd agree. There are some 
similarities in our interfaces but not enough to warrant a merger.


-- Shailabh

^ permalink raw reply	[flat|nested] 233+ messages in thread

* Re: [Lse-tech] [PATCH] cpusets - big numa cpu and memory placement
  2005-02-08 22:14                                         ` Matthew Dobson
  2005-02-08 23:58                                           ` Shailabh Nagar
@ 2005-02-09  0:24                                           ` Paul Jackson
  1 sibling, 0 replies; 233+ messages in thread
From: Paul Jackson @ 2005-02-09  0:24 UTC (permalink / raw)
  To: Matthew Dobson
  Cc: dino, mbligh, pwil3058, frankeh, dipankar, akpm, ckrm-tech,
	efocht, lse-tech, hch, steiner, jbarnes, sylvain.jeaugey, djh,
	linux-kernel, Simon.Derr, ak, sivanich

Matthew wrote:
> I should have been more clear that CKRM and CPUSETs (seem) to 
> be unreconcilable.  Sched_domains and CPUSETs (seem) to have some potential 
> functionality overlap that leads me to (still) believe there is hope to 
> integrate these two systems.

Aha - now we're getting somewhere.

I was under the illusion these last four months that you were going to
serve as priest at the shotgun wedding that Andrew had requested be
arranged between cpusets and CKRM.  All this time, you were hoping to
get cpusets hooked up with sched domains.

My daughter 'cpusets' sure is popular ;).

If cpusets were somehow to be subsumed into CKRM, it would likely have
meant reincarnating cpusets in a new form, varying in some degree, large
or small, from its current form.  If that had been in our foreseeable
future, then we would not have wanted to put cpusets in its current form
in the main tree.  It's a lot easier to change APIs that aren't APIs
yet.

I remain certain that cpusets don't fit in CKRM.  Not even close.

The merger of cpusets and sched domains is an entirely different affair,
in my view.  It's an internal optimization, having next to zero impact
on any API's that the kernel presents to userland.  On most systems, it
would be of no particular benefit.  But on big honkin numa boxes making
heavy use of cpusets, it might make the schedulers work more efficient. 
Or might not.  I will leave that up to others to figure out, when and if
they choose to.  I'll be glad to help with such an effort, what little
I can, if it comes about.

If such an integration between cpusets and sched domains is in our
future, we should first get cpusets into the kernel, and then the
appropriate experts can refine the interaction of cpusets with sched
domains.  In this case, the sooner cpusets goes in, the better, so that
the integration effort with sched domains can commence, confident that
cpusets are here to stay.

-- 
                  I won't rest till it's the best ...
                  Programmer, Linux Scalability
                  Paul Jackson <pj@sgi.com> 1.650.933.1373, 1.925.600.0401

^ permalink raw reply	[flat|nested] 233+ messages in thread

* Re: [Lse-tech] [PATCH] cpusets - big numa cpu and memory placement
  2005-02-08 23:58                                           ` Shailabh Nagar
@ 2005-02-09  0:27                                             ` Paul Jackson
  0 siblings, 0 replies; 233+ messages in thread
From: Paul Jackson @ 2005-02-09  0:27 UTC (permalink / raw)
  To: nagar
  Cc: colpatch, dino, mbligh, pwil3058, frankeh, dipankar, akpm,
	ckrm-tech, efocht, lse-tech, hch, steiner, jbarnes,
	sylvain.jeaugey, djh, linux-kernel, Simon.Derr, ak, sivanich

Shailabh wrote:
> Well, I'm not sure I want to minutely examine Paul's choice of words !

You're a wise man ;).


> Rereading the earlier posts on the thread, I'd agree. There are some 
> similarities in our interfaces but not enough to warrant a merger.

As I said ... a wise man !

-- 
                  I won't rest till it's the best ...
                  Programmer, Linux Scalability
                  Paul Jackson <pj@sgi.com> 1.650.933.1373, 1.925.600.0401

^ permalink raw reply	[flat|nested] 233+ messages in thread

* Re: [Lse-tech] [PATCH] cpusets - big numa cpu and memory placement
  2005-02-08 19:32                                       ` Matthew Dobson
@ 2005-02-09  2:53                                         ` Nick Piggin
  0 siblings, 0 replies; 233+ messages in thread
From: Nick Piggin @ 2005-02-09  2:53 UTC (permalink / raw)
  To: Matthew Dobson
  Cc: dino, Martin J. Bligh, Paul Jackson, pwil3058, frankeh, dipankar,
	Andrew Morton, ckrm-tech, efocht, LSE Tech, hch, steiner,
	Jesse Barnes, sylvain.jeaugey, djh, LKML, Simon.Derr, Andi Kleen,
	sivanich

Matthew Dobson wrote:
> Nick Piggin wrote:

>> I didn't really follow where that idea went, but I think at least
>> a few people thought that sort of functionality wasn't nearly
>> fancy enough! :)
> 
> 
> Well, that's about how far the idea was supposed to go. ;)  I think 
> named hierarchical sched_domains would offer the same functionality (at 
> least for CPU partitioning) as CPUSETs.  I'm not sure who didn't think 
> it was fancy enough, but if you or anyone else can describe CPUSETs 
> configurations that couldn't be represented by sched_domains trees, I'd 
> be very curious to hear about them.
> 

OK. Someone mentioned wanting to do overlapping sets of CPUs. For
example, 3 groups, first can run on cpus 0 and 1, second 1 and 2,
third 2 and 0. However, this in itself doesn't preclude the use of
sched-domains.

In the (hopefully) common case where there are disjoint partitions
_somewhere_, sched domains can do the job in a much better
way than task cpu affinities (better isolation, multiprocessor
balancing shouldn't break down).

Those users with overlapping CPU sets can then use task affinities
on top of sched domains partitions to get the desired result.
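
For the overlapping case, something along these lines from userspace would
do it, using the standard glibc sched_setaffinity/CPU_SET interfaces; the
masks mirror the 0-1 / 1-2 / 2-0 example above:

    #define _GNU_SOURCE
    #include <sched.h>
    #include <stdio.h>

    /* Pin the calling task to a pair of CPUs, e.g. one of the three
     * overlapping groups above: {0,1}, {1,2} or {2,0}. */
    static int pin_to_pair(int a, int b)
    {
        cpu_set_t mask;

        CPU_ZERO(&mask);
        CPU_SET(a, &mask);
        CPU_SET(b, &mask);
        return sched_setaffinity(0, sizeof(mask), &mask);
    }

    int main(void)
    {
        if (pin_to_pair(0, 1))    /* group 1: CPUs 0 and 1 */
            perror("sched_setaffinity");
        return 0;
    }

The disjoint partitions underneath would still come from sched domains;
only the overlapping groups need per-task masks on top.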


^ permalink raw reply	[flat|nested] 233+ messages in thread

* Re: [Lse-tech] [PATCH] cpusets - big numa cpu and memory placement
  2005-02-08 23:26                                         ` Nick Piggin
@ 2005-02-09  4:23                                           ` Paul Jackson
  0 siblings, 0 replies; 233+ messages in thread
From: Paul Jackson @ 2005-02-09  4:23 UTC (permalink / raw)
  To: Nick Piggin
  Cc: mbligh, dino, colpatch, pwil3058, frankeh, dipankar, akpm,
	ckrm-tech, efocht, lse-tech, hch, steiner, jbarnes,
	sylvain.jeaugey, djh, linux-kernel, Simon.Derr, ak, sivanich

Nick wrote:
> The biggest issues may be the userspace
> interface and a decent userspace management tool.

One possibility, perhaps, would be to have a boolean flag "sched_domain"
on each cpuset, indicating whether it was a sched domain or not.  If a
cpuset had its sched_domain flag set, then that cpuset's cpus_allowed
mask would define a sched domain.

Later Nick wrote:
> In the (hopefully) common case where there are disjoint partitions
> _somewhere_, sched domains can do the job in a much better
> way than task cpu affinities (better isolation, multiprocessor
> balancing shouldn't break down).
> 
> Those users with overlapping CPU sets can then use task affinities
> on top of sched domains partitions to get the desired result.

Ok - seems it should work with the above cpuset flag marking sched
domains, and a rule that _those_ cpusets so marked can't overlap.  Other
cpusets that are not so marked, and any sched_setaffinity calls, can do
whatever they want.  Trying to turn on the sched_domain flag on a cpuset
that overlapped with existing such cpuset sched_domains, or trying to
mess with the CPUs (cpus_allowed) in an existing cpuset sched_domain so
as to force it to overlap, would return an error to user space on that
write(2).

If the sysadmin didn't mark any cpusets as sched_domains, then fall back
to something automatic and useful.

Inside the kernel, we'll need some way for the cpuset code to tell the
sched code about sched_domain changes.  This might mean something like
the following.  Have the sched code provide the cpuset code a couple of
routines, one to set up and the other to tear down sched_domains.

Both calls would take a cpumask_t argument, and return void.  The setup
call must pass a cpumask that does not overlap any existing sched
domains defined via cpusets.  The tear down call must pass a cpumask
value exactly matching a previous, still active, setup call.

So if someone made a single CPU change to an existing sched_domain
defining cpuset, the kernel cpuset code would have to call the kernel
sched code twice, first to tear down the old sched_domain, and then to
set up the new, slightly different, one.  The cpuset code would likely be
holding the single global cpuset_sem semaphore across this pair of
calls.
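
To make that calling convention concrete, a minimal userspace model -- the
two function names are invented and a plain bitmask stands in for cpumask_t:

    #include <stdio.h>

    static void sched_domain_setup(unsigned long cpus)
    {
        printf("sched: new domain over cpu mask 0x%lx\n", cpus);
    }

    static void sched_domain_teardown(unsigned long cpus)
    {
        printf("sched: domain over cpu mask 0x%lx removed\n", cpus);
    }

    int main(void)
    {
        unsigned long old_mask = 0x0fUL;   /* cpuset covered CPUs 0-3 */
        unsigned long new_mask = 0x1fUL;   /* one CPU added: now 0-4 */

        /* The cpuset code, holding cpuset_sem, applies a single-CPU change
         * as a tear down of the old domain followed by a setup of the new. */
        sched_domain_teardown(old_mask);
        sched_domain_setup(new_mask);
        return 0;
    }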

-- 
                          I won't rest till it's the best ...
                          Programmer, Linux Scalability
                          Paul Jackson <pj@sgi.com> 1.650.933.1373

^ permalink raw reply	[flat|nested] 233+ messages in thread

* Re: [ckrm-tech] Re: [Lse-tech] [PATCH] cpusets - big numa cpu and memory placement
  2005-02-08 20:42                                       ` Paul Jackson
  2005-02-08 22:14                                         ` Matthew Dobson
@ 2005-02-09 17:59                                         ` Chandra Seetharaman
  2005-02-11  2:46                                           ` Chandra Seetharaman
  1 sibling, 1 reply; 233+ messages in thread
From: Chandra Seetharaman @ 2005-02-09 17:59 UTC (permalink / raw)
  To: Paul Jackson
  Cc: Matthew Dobson, dino, mbligh, pwil3058, frankeh, dipankar, akpm,
	ckrm-tech, efocht, lse-tech, hch, steiner, jbarnes,
	sylvain.jeaugey, djh, linux-kernel, Simon.Derr, ak, sivanich

On Tue, Feb 08, 2005 at 12:42:34PM -0800, Paul Jackson wrote:
> Matthew wrote:
> 
> I found no useful and significant basis for integration of cpusets and
> CKRM either involving CPU or Memory Node management.
> 
> As best as I can figure out, CKRM is a fair share scheduler with a
> gussied up more modular architecture, so that the components to track
> usage, control (throttle) tasks, and classify tasks are separate
> plugins.  I can find no significant and useful overlap on any of these
> fronts, either the existing plugins or their infrastructure, with what
> cpusets has and needs.
> 
> There are claims that CKRM has some generalized resource management
> architecture that should be able to handle cpusets needs, but despite my
> repeated (albeit not entirely successful) efforts to find documentation
> and read source and my pleadings with Matthew and earlier on this
> thread, I was never able to figure out what this meant, or find anything
> that could profitably integrate with cpusets.

I thought Hubertus did talk about this the last time the thread
was active.  Anyway, here is how one could do cpuset/memset under the
ckrm framework (note that I am not pitching for a marriage :) as there are
some small problems, like supporting 128 cpus and changing the parameter
names that ckrm currently uses):

First off, cpuset and memset have to be implemented as two different
controllers.

cpuset controller:
- 'guarantee' parameter to be used for representing the cpuset (bitwise).
- 'limit' parameter to be used for exclusivity and other flags.
- Highest level class (/rcfs/taskclass) will have all cpus in its list.
- Every class will maintain two cpusets: one that can be inherited,
  inherit_cpuset (needed when exclusive is set in a child), and the other
  for use by the class itself, my_cpuset.
- When a new class is created (under /rcfs/taskclass), it inherits all the
  CPUs (from inherit_cpuset).
- Admin can change the cpuset of this class by echoing the new
  cpuset (guarantee) into the 'shares' file.
- Admin can set/change the exclusivity-like flags by echoing the
  value (limit) to the 'shares' file.
- When the exclusivity flag is set in a class, the cpuset bits in this class
  will be cleared in the inherit_cpuset of the parent and of all its other
  children.
- At the time of scheduling, my_cpuset in the class of the task will be
  consulted, as sketched below.
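
To illustrate that last point, a hypothetical userspace model (all names
invented) of a check that consults my_cpuset in the task's class rather than
a per-task mask:

    #include <stdio.h>

    struct task_class {
        unsigned long my_cpuset;    /* bit N set => CPU N usable by the class */
    };

    struct task {
        struct task_class *tclass;
    };

    static int class_allows_cpu(const struct task *t, int cpu)
    {
        return (t->tclass->my_cpuset >> cpu) & 1UL;
    }

    int main(void)
    {
        struct task_class batch = { 0x0cUL };   /* CPUs 2 and 3 */
        struct task t = { &batch };

        printf("CPU 1 allowed: %d, CPU 2 allowed: %d\n",
               class_allows_cpu(&t, 1), class_allows_cpu(&t, 2));
        return 0;
    }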

memset_controller would be similar to the above scheme; before pitching it,
I will talk with Matt about why he thought that there is a problem.

If I missed some feature of cpuset that shows a bigger problem, please
let me know.
> 
> In sum -- I see a potential for useful integration of cpusets and
> scheduler domains, I'll have to leave it up to those with expertise in
> the scheduler to evaluate and perhaps accomplish this.  I do not see any
> useful integration of cpusets and CKRM.
> 
> I continue to be befuddled as to why, Matthew, you confound potential
> cpuset-scheddomain integration with potential cpuset-CKRM integration.
> Scheduler domains and CKRM are distinct beasts, in my book, and the
> contemplations of cpuset integration with these two beasts are also
> distinct efforts.
> 
> And cpusets and CKRM are distinct beasts.
> 
> But I repeat myself ...
> 
> -- 
>                   I won't rest till it's the best ...
>                   Programmer, Linux Scalability
>                   Paul Jackson <pj@sgi.com> 1.650.933.1373, 1.925.600.0401
> 
> 

-- 

----------------------------------------------------------------------
    Chandra Seetharaman               | Be careful what you choose....
              - sekharan@us.ibm.com   |      .......you may get it.
----------------------------------------------------------------------

^ permalink raw reply	[flat|nested] 233+ messages in thread

* Re: [ckrm-tech] Re: [Lse-tech] [PATCH] cpusets - big numa cpu and memory placement
  2005-02-09 17:59                                         ` [ckrm-tech] " Chandra Seetharaman
@ 2005-02-11  2:46                                           ` Chandra Seetharaman
  2005-02-11  9:21                                             ` Paul Jackson
  2005-02-11 16:54                                             ` Jesse Barnes
  0 siblings, 2 replies; 233+ messages in thread
From: Chandra Seetharaman @ 2005-02-11  2:46 UTC (permalink / raw)
  To: Paul Jackson
  Cc: Matthew Dobson, dino, mbligh, pwil3058, frankeh, dipankar, akpm,
	ckrm-tech, efocht, lse-tech, hch, steiner, jbarnes,
	sylvain.jeaugey, djh, linux-kernel, Simon.Derr, ak, sivanich

On Wed, Feb 09, 2005 at 09:59:28AM -0800, Chandra Seetharaman wrote:
> On Tue, Feb 08, 2005 at 12:42:34PM -0800, Paul Jackson wrote:
--stuff deleted---
> memset_controller would be similar to this, before pitching it I will talk
> with Matt about why he thought that there is a problem.

Talked to Matt Dobson and explained to him the CKRM architecture and how
cpuset/memset can be implemented as a ckrm controller. He is now convinced
that there is no problem in making memset also a ckrm controller.

As explained in the earlier mail, memset can also be implemented in the
same way as cpuset.

> 
> If I missed some feature of cpuset that shows a bigger problem, please
> let me know.
> > 
> > In sum -- I see a potential for useful integration of cpusets and
> > scheduler domains, I'll have to leave it up to those with expertise in
> > the scheduler to evaluate and perhaps accomplish this.  I do not see any
> > useful integration of cpusets and CKRM.
> > 
> > I continue to be befuddled as to why, Matthew, you confound potential
> > cpuset-scheddomain integration with potential cpuset-CKRM integration.
> > Scheduler domains and CKRM are distinct beasts, in my book, and the
> > contemplations of cpuset integration with these two beasts are also
> > distinct efforts.
> > 
> > And cpusets and CKRM are distinct beasts.
> > 
> > But I repeat myself ...
> > 
> > -- 
> >                   I won't rest till it's the best ...
> >                   Programmer, Linux Scalability
> >                   Paul Jackson <pj@sgi.com> 1.650.933.1373, 1.925.600.0401
> > 
> > 
> 
> -- 
> 
> ----------------------------------------------------------------------
>     Chandra Seetharaman               | Be careful what you choose....
>               - sekharan@us.ibm.com   |      .......you may get it.
> ----------------------------------------------------------------------
> 
> 

-- 

----------------------------------------------------------------------
    Chandra Seetharaman               | Be careful what you choose....
              - sekharan@us.ibm.com   |      .......you may get it.
----------------------------------------------------------------------

^ permalink raw reply	[flat|nested] 233+ messages in thread

* Re: [ckrm-tech] Re: [Lse-tech] [PATCH] cpusets - big numa cpu and memory placement
  2005-02-11  2:46                                           ` Chandra Seetharaman
@ 2005-02-11  9:21                                             ` Paul Jackson
  2005-02-12  1:37                                               ` Chandra Seetharaman
  2005-02-11 16:54                                             ` Jesse Barnes
  1 sibling, 1 reply; 233+ messages in thread
From: Paul Jackson @ 2005-02-11  9:21 UTC (permalink / raw)
  To: Chandra Seetharaman
  Cc: colpatch, dino, mbligh, pwil3058, frankeh, dipankar, akpm,
	ckrm-tech, efocht, lse-tech, hch, steiner, jbarnes,
	sylvain.jeaugey, djh, linux-kernel, Simon.Derr, ak, sivanich


[ For those who have already reached a conclusion on this
  subject, there is little that is new below.  It's just
  cast in a different light, as an analysis of how well
  the CKRM cpuset/memset task class that Chandra describes
  meets the needs of cpusets.  The conclusion is: not well.

  A pickup truck and a motorcycle both have their uses.
  It's just difficult to combine them in a useful fashion.

  Feel free to skim or skip the rest of this message. -pj ]


Chandra writes:
> If I missed some feature of cpuset that shows a bigger problem, please
> let me know.

Perhaps it would be better if first you ask yourself what
features your cpuset/memset taskclasses provide beyond
what's available in the basic sched_setaffinity (for cpu)
and mbind/set_mempolicy (for memory) calls.  Offhand, I don't
see any.

But, I will grant, with my apologies, that I wrote the above
more in irritation than in a sincere effort to explain.

So, let me come at this through another door.

Since it seems apparent by now that both numa placement and
workload management cause some form of mutually exclusive brain
damage to their practitioners, making it difficult for either to
understand the other, let me:
 1) describe the important properties of cpusets,
 2) examine how well your proposal provides such, and
 3) examine its additional costs compared to cpusets.

1. The important properties of cpusets.
=======================================
 
Cpusets facilitate integrated processor and memory placement
of jobs on large systems, especially useful on numa systems,
where the co-ordinated placement of jobs on cpus and memory is
important, sometimes critical, to obtaining good performance.

It is becoming increasingly obvious, as Intel, IBM and AMD
push more and more cores into one package at one end, and as
NEC, IBM, Bull, SGI and others push more and more packages into
single image systems at the other end, that complex layered numa
topologies are here to stay, in increasing number and complexity.

Cpusets helps manage numa placement of jobs in a way that
numa folks seem to find makes sense.  The names of key
interface elements, and the opening remarks in commentary and
documentation are specific and relevant to the needs of those
doing numa placement.

It does so with a minimal, low cost patch in the main kernel.
Running diffstat on the cpuset* patches in 2.6.11-rc1-mm2 shows
the following summary stats:

  19 files changed, 2362 insertions(+), 253 deletions(-)

The runtime costs are nearly zero, consisting in the usual
case on any hot paths of a usage counter increment at fork, a
usage counter decrement at exit, a usually inconsequential
bitmask test in mm/page_alloc.c, and a generation number
check in the mm/mempolicy.c alloc_page_vma() wrapper to
__alloc_pages().
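
For the curious, a simplified userspace model of that generation check --
plain types standing in for the kernel structures, names approximate:

    #include <stdio.h>

    struct cpuset {
        unsigned long mems_allowed;
        int mems_generation;
    };

    struct task {
        struct cpuset *cs;
        unsigned long mems_allowed;    /* cached copy */
        int mems_generation;           /* generation the cache was taken at */
    };

    /* The allocation wrapper only refreshes the task's cached mask when
     * its cpuset has actually changed, so the hot path stays cheap. */
    static void refresh_mems(struct task *t)
    {
        if (t->mems_generation != t->cs->mems_generation) {
            t->mems_allowed = t->cs->mems_allowed;
            t->mems_generation = t->cs->mems_generation;
        }
    }

    int main(void)
    {
        struct cpuset cs = { 0x3UL, 1 };
        struct task t = { &cs, 0x3UL, 1 };

        cs.mems_allowed = 0xfUL;    /* administrator widens the cpuset ... */
        cs.mems_generation++;       /* ... bumping its generation */
        refresh_mems(&t);
        printf("task now allowed nodes mask 0x%lx\n", t.mems_allowed);
        return 0;
    }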

Cpusets handles any number of CPUs and Memory Nodes, with no
practical hard limit imposed by the API or data types.

Cpusets can be used in combination with a workload manager
such as CKRM.  You can use cpusets to create "soft partitions"
that are subsets of the entire system, and then in each such
partition, you can run a separate instance of a workload manager
to obtain the desired resource sharing.

Cpusets may provide a practical API to support administrative
refinements of scheduler domains, along more optimal natural
job boundaries, instead of just along automatic, artificial
architecture boundaries.  Matthew and Nick both seem to be
making mumblings in this direction, but the jury is still out.
Indeed, we're still investigating.  I have not heard of anyone
proposing to integrate CKRM and sched domains in this manner,
nor do I expect to.

There is no reason to artificially limit the depth of the cpuset
hierarchy, which represents subsets of subsets of cpus and nodes.
The rules (invariants) of cpusets have been carefully chosen
so as to never require any global or wide ranging analysis of
the cpuset hierarchy in order to enforce.  Each child must be
a subset of its parent, and exclusive cpusets cannot overlap
their siblings.  That's about it.  Both rules can be evaluated
locally, using just the nearest relatives of an affected cpuset.
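
A compact model of those two local checks, with plain bitmasks standing in
for cpumask_t/nodemask_t:

    #include <stdio.h>

    static int is_subset(unsigned long child, unsigned long parent)
    {
        return (child & ~parent) == 0;
    }

    static int overlaps(unsigned long a, unsigned long b)
    {
        return (a & b) != 0;
    }

    int main(void)
    {
        unsigned long parent = 0x0fUL;          /* CPUs 0-3 */
        unsigned long exclusive_sib = 0x03UL;   /* CPUs 0-1, exclusive */
        unsigned long proposed = 0x0cUL;        /* new child asks for CPUs 2-3 */

        printf("subset of parent: %d, clashes with exclusive sibling: %d\n",
               is_subset(proposed, parent), overlaps(proposed, exclusive_sib));
        return 0;
    }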

An essential feature of the cpuset proposal is its file system
model of the 'nested subsets of cpus and nodes'.  This provides
a name space, and permission model, that supports sensible
administration of numa friendly subsets of the compute resources
of large systems in complex administration environments.
A system can be dynamically 'partitioned' and 'sub-partitioned',
with sensible names and permissions for the partitions, while
maintaining the benefits of a single system image.  This is
a classic use of a kernel, to manage a system wide resource
with a name space, structure rules, resource attributes, and
a permission/access model.

In sum, cpusets provides substantial benefit past the individual
sched_setaffinity/mbind/set_mempolicy calls for managing the
numa placement of jobs on large systems, at modest cost in
code size, runtime, maintenance and intellectual mastery.


2. How much of the above does your proposal provide?
====================================================

Not much.  As best as I can tell, it provides an alternative
to the existing numa cpu and memory calls, at the cost of
considerable code, complexity and obtuseness above and beyond
cpusets.  That additional complexity may well be necessary,
for the more difficult job it is trying to accomplish.  But it
is not necessary for the simpler task of numa placement of jobs
on named, controlled, subsets of cpus and memory nodes.

Your proposal doesn't provide a distinguished "numa computation
unit" (cpu + memory), but rather tends to lose those two elements
in a longer list of task class elements.

I can't tell if it's just because you didn't take much time to
study cpusets, or if it's due to more essential limitations
of the CKRM implementation, but you got the subsetting and
exclusive rules wrong (or at least different).

The CKRM documentation and the names of key flags and such are
not intuitive to those doing numa work.  If one comes at CKRM
from the perspective of someone trying to solve a numa placement
problem, the interfaces, documentation and naming really don't
make sense.  Even if your architecture is more general and
powerful, I suspect your presentation is not widely accessible
outside those with a workload focus.  Or perhaps I'm just more
dimwitted than most.  It's difficult for me to know which.
But certainly both Matthew and I have struggled to make sense
of CKRM from a numa perspective.

You state you'd have a 128 CPU limitation.  I don't know why
that would be, but it would be a critical limitation for SGI --
no small problem.

As explained below, with your proposal, one could not readily do
both workload management and numa placement at the same time,
because the task class hierarchy needed for the two is not
the same.

As noted above, while there seems to be a decent chance that
cpusets will provide some benefit to scheduler domains, allowing
the option of organizing sched domains along actual job usage
lines instead of artificial architecture lines, I have seen
no suggestion that CKRM task classes have that potential to
improve sched domains.

Elsewhere I recall you've had to impose fairly modest bounds
on the depth of your class hierarchy, because your resource
balancing rules are expensive to evaluate across deep, large
trees.  The cpuset hierarchy has no such restraint.

Your task class hierarchy, if hijacked for numa placement,
might provide the kernel managed naming, structure and
access control of dynamic (soft) numa partitions that cpusets
does.  I haven't looked closely at the permission model of
CKRM to see if it matches the needs of cpusets, so I can't
speak to that detail.

In sum, your cpuset/memset CKRM proposal provides few, if any,
of the additional benefits to numa placement work that cpusets
provides over the existing affinity and numa system calls.


3. What are the additional costs of your proposal over cpusets?
===============================================================

Your proposal, while it seems to offer little advantage for
numa placement over what we already have without cpusets, comes
at a substantially greater cost than cpusets.

The CKRM patch is five times the size of the cpuset patch,
with diffstat on the ckrm-e17.2610.patch showing:

  65 files changed, 13020 insertions(+), 19 deletions(-)

The CKRM runtime, from what I can tell on the lmbench slide
from OLS 2004, costs several percent of available cycles.

You propose to include the cpu/mem placement hierarchy in the
task class hierarchy.  This presents difficulties.  Essentially,
they are not the same hierarchies.  A job's placement is
independent of its priority.  Both high and low priority jobs
may well require proper numa placement, and both high and low
priority tasks may well run within the same cpuset.

So if your task class hierarchy is hijacked for numa placement,
it will not serve you well for workload management.  On a system
that required numa placement using something like cpusets, the
five times larger size of the kernel patch required for CKRM
would be entirely unjustified, as CKRM would only be usable
for its cpuset-like capabilities.

Much of what you have now in CKRM would be useless for cpuset
work.  As you observed in your proposal, you would need new
cpuset related rules for the subset and exclusive properties.

Cpusets need no scheduler hook at all - only the
existing cpus_allowed check that Ingo already added, years ago.
You propose having the scheduler check the appropriate cpu mask
in the task class, which would definitely increase the cache
footprint size of the scheduler.

The papers for CKRM speak of providing policy driven
classification and differentiated service.  The focus is on
managing resource sharing, to allow different classes of tasks
to get controlled allocations of proportions of shared resources.

Cpusets is not about sharing proportions of a common resource,
but rather about dedicating entire resources.  Granted,
mathematically, there might be a mapping between these two.
But it is certainly an impediment to those having to understand
something, if it is implemented by abusing something quite a bit
larger and quite foreign in intention.

This flows through to the names of the specific files in the
directory representing a cpuset or class.  The names for CKRM
class directories are necessarily rather generic and abstract,
whereas those for cpusets directly represent the particular
need of placing tasks on cpus and memory nodes.  For someone
doing numa placement, the latter are much easier to understand.

And as noted above, since you can't do both at the same time
(both use the CKRM infrastructure for its traditional workload
management and use it for numa placement) it's not like the
administrator of such a system gains any from the more abstract
names, if they are just using it for cpusets (numa placement).

There is no synergy in the kernel hooks required in the scheduler
and memory allocator.  The hooks required by cpusets check
bitmasks in order to allow or prohibit scheduling a task on
a CPU, or allocating a page from a particular node to a task.
These are quite distinct from the hooks required by CKRM when
used as a fair share scheduler and workload manager, which
requires adding delays to tasks in order to obtain the desired
proportion of resource usage between classes.  Similarly, the
CKRM memory allocator hooks manage the number of pages in use
by each task class and/or the rate of page faults, while the
cpuset memory allocator hooks manage which memory nodes are
available to satisfy an allocation request.

The share usage hooks that monitor each resource, and its usage
by each class, are useless for cpusets, which has no dependency
on resource usage.  In cpusets, a task can use as much of its
allowed CPUs and Memory Nodes as it likes, without throttling.  There is
no feedback loop based on rates of resource usage per class.

Most of the hooks required by the CKRM classification engine to
check for possible changes in a task's class, such as in fork,
exec, setuid, listen, and other points where a kernel object
might change, are not needed for cpusets.  The cpuset patch only
requires such state change hooks in fork, exit and allocation,
and only requires to increment or decrement a usage count in
the fork and exit, and check a generation number in allocation.

Cpusets has no use for a kernel classification engine.  Outside
of the trivial, automatic propagation of cpusets in fork and
exit, the only changes in cpusets are mandated from user space.

Nor do cpusets have any need for the kernel to support externally
defined policy rules.  Cpusets has no use for the classification
engine's callback mechanism.  In cpusets, no events that might
affect state, such as fork, exit, reclassifications, changes in
uid, or resource rate usage samples, need to be reported to any
state agent, and there is no state agent, nor any communication
channel thereto.

Cpusets has no use for a facility that lets server tasks tell
some external classifier what phase they are operating in.
Cpusets has no need for some workload manager to be sampling
resource consumption and task state to determine resource
consumption.  Cpusets has no need to track, in user space or
kernel, the state of tasks after they exit. Cpusets has no use
for delays nor for tracking them in the task struct.

Cpusets has no need for the hooks at the entry to, and exit from,
memory allocation routines to distinguish delays due to memory
allocation from those due to application i/o.  Cpusets has no
need for sampling task state at fixed intervals, and our big
iron scientific customers would without a doubt not tolerate a
scan of the entire set of tasks every second for such resource
and task state data collection.  Such a scan does _not_ scale
well on big honkin numa boxes.  Whereas CKRM requires something
like relayfs to pass back to user space the constant stream of
such data, cpusets has no such needs and no such data.

Certainly, none of the network hooks that CKRM requires to
provide differentiated service across priority classes would be
of any use in a system (ab)using CKRM to provide cpuset style
numa placement.

It is true that both cpusets and CKRM make good use of the Linux
kernel's virtual file system (vfs).  Cpusets uses vfs to model
the hierarchy of 'soft partitions' in the system.  CKRM uses vfs
to model a resource priority hierarchy, essentially replacing a
single 'task priority' with hierarchical resource allocations,
managing what proportion, out of what is available, of fungible
resources such as ticks, cycles, bytes or data transfers a
given class of tasks is allowed to use in the aggregate.

Just because two facilities use vfs is certainly not sufficient
basis for deciding that they should be combined into one
facility.

The shares and stats control files in each task_class
directory are not needed by cpusets, but new control files,
for cpus_allowed and mems_allowed are needed.  That, or the
existing names have to be overloaded, at the cost of obfuscating
the interface.

The kernel hooks for cpusets are fewer, simpler and more specific
than those for CKRM.  Our high performance customers would want
the cpuset hooks compiled in, not the more generic ones for
CKRM (which they could not easily use for any other workload
management purpose anyway, if the task class hierarchy were
hijacked for the needs of cpusets, as noted above).

The development costs of cpusets so far, which are perhaps the
best predictor we have of future costs, have been substantially
lower than they have been for CKRM.

In sum, your proposal costs a lot more than cpusets, by a variety
of metrics.

=================================================

In summary, I find that your cpuset/memset CKRM proposal provides
little or no benefit past the simpler cpu and memory placement
calls already available, while costing substantially more in
a variety of ways than my cpuset proposal, when evaluated for
its usefulness for numa placement.

(Of course, if evaluated for suitability for workload management,
the table is turned, and your CKRM patch provides essential
capability that my cpuset patch could never dream of doing.)

Moreover, the additional workload management benefits that your
CKRM facility provides, and that some of my customers might
want to use in combination with numa placement, would probably
become unavailable to them if we integrated cpusets and CKRM,
because cpusets would have to hijack the task class hierarchy
for its own nefarious purposes.

Such an attempt to integrate cpusets and CKRM would be a major
setback for cpusets, substantially increasing its costs and
reducing its value, probably well past the point of it even being
worth pursuing further, in the mainstream kernel.  Adding all
that foreign logic of cpusets to the CKRM patch probably
wouldn't help CKRM much either.  The CKRM patch is already one
that requires a bright mind and some careful thought to master.
Adding cpuset numa placement logic, which is typically different
in detail, would add a complexity burden to the CKRM code that
would serve no one well.


> Note that I am not pitching for a marriage

We agree.

I just took more words to say it ').



-- 
                  I won't rest till it's the best ...
                  Programmer, Linux Scalability
                  Paul Jackson <pj@sgi.com> 1.650.933.1373, 1.925.600.0401

^ permalink raw reply	[flat|nested] 233+ messages in thread

* Re: [ckrm-tech] Re: [Lse-tech] [PATCH] cpusets - big numa cpu and memory placement
  2005-02-11  2:46                                           ` Chandra Seetharaman
  2005-02-11  9:21                                             ` Paul Jackson
@ 2005-02-11 16:54                                             ` Jesse Barnes
  2005-02-11 18:42                                               ` Chandra Seetharaman
  1 sibling, 1 reply; 233+ messages in thread
From: Jesse Barnes @ 2005-02-11 16:54 UTC (permalink / raw)
  To: Chandra Seetharaman
  Cc: Paul Jackson, Matthew Dobson, dino, mbligh, pwil3058, frankeh,
	dipankar, akpm, ckrm-tech, efocht, lse-tech, hch, steiner,
	sylvain.jeaugey, djh, linux-kernel, Simon.Derr, ak, sivanich

On Thursday, February 10, 2005 6:46 pm, Chandra Seetharaman wrote:
> On Wed, Feb 09, 2005 at 09:59:28AM -0800, Chandra Seetharaman wrote:
> > On Tue, Feb 08, 2005 at 12:42:34PM -0800, Paul Jackson wrote:
>
> --stuff deleted---
>
> > memset_controller would be similar to this, before pitching it I will
> > talk with Matt about why he thought that there is a problem.
>
> Talked to Matt Dobson and explained to him the CKRM architecture and how
> cpuset/memset can be implemented as a ckrm controller. He is now convinced
> that there is no problem in making memset also a ckrm controller.
>
> As explained in the earlier mail, memset also can be implemented in the
> same way as cpuset.

Arg!  Look, cpusets is *done* (i.e. it works well) and relatively simple and 
easy to use.  It's also been in -mm for quite some time.  It also solves the 
problem of being able to deal with large jobs on large systems rather 
elegantly.  Why oppose its inclusion upstream?

CKRM seems nice, but why is it not in -mm?  I've heard it talked about a lot, 
but it usually comes up as a response to some other, simpler project, in the 
vein of "ckrm can do this, so your project is not needed" and needless to say 
that's a bit frustrating.  I'm not saying that ckrm isn't useful--indeed it 
seems like an idea with a lot of utility (I liked Rik's ideas for using it to 
manage desktop boxes and multiuser systems as a sort of per-process rlimits 
on steroids), but using it for system partitioning or systemwide accounting 
seems a bit foolish to me...

Jesse

^ permalink raw reply	[flat|nested] 233+ messages in thread

* Re: [ckrm-tech] Re: [Lse-tech] [PATCH] cpusets - big numa cpu and memory placement
  2005-02-11 16:54                                             ` Jesse Barnes
@ 2005-02-11 18:42                                               ` Chandra Seetharaman
  2005-02-11 18:50                                                 ` Jesse Barnes
  0 siblings, 1 reply; 233+ messages in thread
From: Chandra Seetharaman @ 2005-02-11 18:42 UTC (permalink / raw)
  To: Jesse Barnes
  Cc: Paul Jackson, Matthew Dobson, dino, mbligh, pwil3058, frankeh,
	dipankar, akpm, ckrm-tech, efocht, lse-tech, hch, steiner,
	sylvain.jeaugey, djh, linux-kernel, Simon.Derr, ak, sivanich

On Fri, Feb 11, 2005 at 08:54:52AM -0800, Jesse Barnes wrote:
> On Thursday, February 10, 2005 6:46 pm, Chandra Seetharaman wrote:
> > On Wed, Feb 09, 2005 at 09:59:28AM -0800, Chandra Seetharaman wrote:
> > > On Tue, Feb 08, 2005 at 12:42:34PM -0800, Paul Jackson wrote:
> >
> > --stuff deleted---
> >
> > > memset_controller would be similar to this, before pitching it I will
> > > talk with Matt about why he thought that there is a problem.
> >
> > Talked to Matt Dobson and explained to him the CKRM architecture and how
> > cpuset/memset can be implemented as a ckrm controller. He is now convinced
> > that there is no problem in making memset also a ckrm controller.
> >
> > As explained in the earlier mail, memset also can be implemented in the
> > same way as cpuset.
> 
> Arg!  Look, cpusets is *done* (i.e. it works well) and relatively simple and 
> easy to use.  It's also been in -mm for quite some time.  It also solves the 
> problem of being able to deal with large jobs on large systems rather 
> elegantly.  Why oppose its inclusion upstream?

Jesse,

Do note that I did not oppose the cpuset inclusion (by saying, "I am not
pitching for a marriage"), and here are the reasons:

1. Even though cpuset can be implemented under ckrm, currently the cpu controller
  and mem controller (in ckrm) cannot cleanly handle the isolating part of cpusets
  and still provide the resource management capabilities ckrm is supposed to
  provide. For that reason, one cannot expect both the cpuset and ckrm functionality
  in the same kernel.
2. I doubt that users who need cpusets will need the resource management capabilities
  ckrm provides.

My email was intended mainly to erase the notion that ckrm cannot handle cpusets.
Also, I wanted to understand whether there are any real issues, which is why I talked
with Matt about why he thought ckrm cannot accommodate memset before sending the
second piece of mail.

> 
> CKRM seems nice, but why is it not in -mm?  I've heard it talked about a lot, 
> but it usually comes up as a response to some other, simpler project, in the 

We did post to lkml a while back and got comments on it. We are working on it and
will post the fixed code again in a few weeks with a couple of controllers.

> vein of "ckrm can do this, so your project is not needed" and needless to say 
> that's a bit frustrating.  I'm not saying that ckrm isn't useful--indeed it 
> seems like an idea with a lot of utility (I liked Rik's ideas for using it to 
> manage desktop boxes and multiuser systems as a sort of per-process rlimits 
> on steroids), but using it for system partitioning or systemwide accounting 
> seems a bit foolish to me...
> 
> Jesse

-- 

----------------------------------------------------------------------
    Chandra Seetharaman               | Be careful what you choose....
              - sekharan@us.ibm.com   |      .......you may get it.
----------------------------------------------------------------------

^ permalink raw reply	[flat|nested] 233+ messages in thread

* Re: [ckrm-tech] Re: [Lse-tech] [PATCH] cpusets - big numa cpu and memory placement
  2005-02-11 18:42                                               ` Chandra Seetharaman
@ 2005-02-11 18:50                                                 ` Jesse Barnes
  0 siblings, 0 replies; 233+ messages in thread
From: Jesse Barnes @ 2005-02-11 18:50 UTC (permalink / raw)
  To: Chandra Seetharaman
  Cc: Paul Jackson, Matthew Dobson, dino, mbligh, pwil3058, frankeh,
	dipankar, akpm, ckrm-tech, efocht, lse-tech, steiner,
	sylvain.jeaugey, djh, linux-kernel, Simon.Derr, sivanich

On Friday, February 11, 2005 10:42 am, Chandra Seetharaman wrote:
> My email was intended mainly to erase the notion that ckrm cannot handle
> cpusets. Also, I wanted to understand whether there are any real issues, which
> is why I talked with Matt about why he thought ckrm cannot accommodate
> memset before sending the second piece of mail.

Great!  So cpusets is good to go for the mainline then (i.e. no major 
objections to the interface).  Note that implementation details that don't 
affect the interface are another subject entirely, e.g. the sched domains 
approach for scheduling as opposed to cpus_allowed.

> > CKRM seems nice, but why is it not in -mm?  I've heard it talked about a
> > lot, but it usually comes up as a response to some other, simpler
> > project, in the
>
> We did post to lkml a while back and got comments on it. We are working on
> it and will post the fixed code again in few weeks with couple of
> controllers.

Excellent, I hope that it comes together into a form suitable for the
mainline; I think there are some really nice aspects to it.

Thanks,
Jesse

^ permalink raw reply	[flat|nested] 233+ messages in thread

* Re: [ckrm-tech] Re: [Lse-tech] [PATCH] cpusets - big numa cpu and memory placement
  2005-02-11  9:21                                             ` Paul Jackson
@ 2005-02-12  1:37                                               ` Chandra Seetharaman
  2005-02-12  6:16                                                 ` Paul Jackson
  0 siblings, 1 reply; 233+ messages in thread
From: Chandra Seetharaman @ 2005-02-12  1:37 UTC (permalink / raw)
  To: Paul Jackson
  Cc: colpatch, dino, mbligh, pwil3058, frankeh, dipankar, akpm,
	ckrm-tech, efocht, lse-tech, hch, steiner, jbarnes,
	sylvain.jeaugey, djh, linux-kernel, Simon.Derr, ak, sivanich

On Fri, Feb 11, 2005 at 01:21:12AM -0800, Paul Jackson wrote:
> [ For those who have already reached a conclusion on this
>   subject, there is little that is new below.  It's just
>   cast in a different light, as an analysis of how well
>   the CKRM cpuset/memset task class that Chandra describes
>   meets the needs of cpusets.  The conclusion is: not well.
> 
>   A pickup truck and a motorcycle both have their uses.
>   It's just difficult to combine them in a useful fashion.
> 
>   Feel free to skim or skip the rest of this message. -pj ]
> 
[ As replied in an earlier mail, I am not advocating for cpuset to be
  a ckrm controller. In this mail I am just providing clarifications
  for some of Paul's comments. -chandra ]

> 
> Chandra writes:
> > If I missed some feature of cpuset that shows a bigger problem, please
> > let me know.
> 
> Perhaps it would be better if first you ask yourself what
> features your cpuset/memset taskclasses provide beyond

First off, I wasn't pitching for 'our' cpuset/memset taskclass. I was 
suggesting that 'your' cpuset can be a ckrm controller.


> what's available in the basic sched_setaffinity (for cpu)
> and mbind/set_mempolicy (for memory) calls.  Offhand, I don't
> see any.

and it doesn't have to be the same as what the above functions provide. cpuset
can function exactly the same way under ckrm as it does otherwise.

> 
> But, I will grant, with my apologies, that I wrote the above
> more in irritation than in a sincere effort to explain.
> 
> So, let me come at this through another door.
> 
> Since it seems apparent by now that both numa placement and
> workload management cause some form of mutually exclusive brain
> damage to its practitioners, making it difficult for either to
> understand the other, let me:
>  1) describe the important properties of cpusets,
>  2) examine how well your proposal provides such, and
>  3) examine its additional costs compared to cpusets.
> 
> 1. The important properties of cpusets.
> =======================================
>  
> Cpusets facilitate integrated processor and memory placement
> of jobs on large systems, especially useful on numa systems,
> where the co-ordinated placement of jobs on cpus and memory is
> important, sometimes critical, to obtaining good performance.
> 
> It is becoming increasingly obvious, as Intel, IBM and AMD
> push more and more cores into one package at one end, and as
> NEC, IBM, Bull, SGI and others push more and more packages into
> single image systems at the other end, that complex layered numa
> topologies are here to stay, in increasing number and complexity.
> 
> Cpusets helps manage numa placement of jobs in a way that
> numa folks seem to find makes sense.  The names of key
> interface elements, and the opening remarks in commentary and
> documentation are specific and relevant to the needs of those
> doing numa placement.
> 
> It does so with a minimal, low cost patch in the main kernel.
> Running diffstat on the cpuset* patches in 2.6.11-rc1-mm2 shows
> the following summary stats:
> 
>   19 files changed, 2362 insertions(+), 253 deletions(-)
> 
> The runtime costs are nearly zero, consisting in the usual
> case on any hot paths of a usage counter increment at fork, a
> usage counter decrement at exit, a usually inconsequential
> bitmask test in mm/page_alloc.c, and a generation number
> check in the mm/mempolicy.c alloc_page_vma() wrapper to
> __alloc_pages().
> 
> Cpusets handles any number of CPUs and Memory Nodes, with no
> practical hard limit imposed by the API or data types.
> 
> Cpusets can be used in combination with a workload manager
> such as CKRM.  You can use cpusets to create "soft partitions"
> that are subsets of the entire system, and then in each such
> partition, you can run a separate instance of a workload manager
> to obtain the desired resource sharing.

CKRM's controllers currently may not play well with cpusets.
> 
> Cpusets may provide a practical API to support administrative
> refinements of scheduler domains, along more optimal natural
> job boundaries, instead of just along automatic, artificial
> architecture boundaries.  Matthew and Nick both seem to be
> making mumblings in this direction, but the jury is still out.
> Indeed, we're still investigating.  I have not heard of anyone
> proposing to integrate CKRM and sched domains in this manner,
> nor do I expect to.

I haven't looked at sched_domains closely. Maybe I should, and see how we
can form a synergy.

> 
> There is no reason to artificially limit the depth of the cpuset
> hierarchy, which represents subsets of subsets of cpus and nodes.
> The rules (invariants) of cpusets have been carefully chosen
> so as to never require any global or wide ranging analysis of
> the cpuset hierarchy in order to enforce.  Each child must be
> a subset of its parent, and exclusive cpusets cannot overlap
> their siblings.  That's about it.  Both rules can be evaluated
> locally, using just the nearest relatives of an affected cpuset.
> 
> An essential feature of the cpuset proposal is its file system
> model of the 'nested subsets of cpus and nodes'.  This provides
> a name space, and permission model, that supports sensible
> administration of numa friendly subsets of the compute resources
> of large systems in complex administration environments.
> A system can be dynamically 'partitioned' and 'sub-partitioned',
> with sensible names and permissions for the partitions, while
> maintaining the benefits of a single system image.  This is
> a classic use of a kernel, to manage a system wide resource
> with a name space, structure rules, resource attributes, and
> a permission/access model.
> 
> In sum, cpusets provides substantial benefit past the individual
> sched_setaffinity/mbind/set_mempolicy calls for managing the
> numa placement of jobs on large systems, at modest cost in
> code size, runtime, maintenance and intellectual mastery.
> 
> 
> 2. How much of the above does your proposal provide?
> ====================================================
> 
> Not much.  As best as I can tell, it provides an alternative
> to the existing numa cpu and memory calls, at the cost of
> considerable code, complexity and obtuseness above and beyond
> cpusets.  That additional complexity may well be necessary,
> for the more difficult job it is trying to accomplish.  But it
> is not necessary for the simpler task of numa placement of jobs
> on named, controlled, subsets of cpus and memory nodes.

I was answering a different question: whether ckrm can accommodate
cpuset or not? (I'll talk about the complexity part later.)

> 
> Your proposal doesn't provide a distinguished "numa computation
> unit" (cpu + memory), but rather tends to lose those two elements
> in a longer list of task class elements.

It doesn't readily provide it, but the architecture can provide it.

> 
> I can't tell if it's just because you didn't take much time to
> study cpusets, or if it's due to more essential limitations
> of the CKRM implementation, but you got the subsetting and
> exclusive rules wrong (or at least different).

My understanding was that, if a class/cpuset has an exclusive flag
set, then those cpus can be used only by this cpuset and its parent,
and no other cpusets in the system.

I did get one thing wrong: I did not realize that you do not allow
setting the exclusive flag in a cpuset if any of its siblings has
any of this cpuset's cpus. (Maybe I still didn't get it right.)
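
(For concreteness, here is that rule as I now understand it, sketched
in illustrative C -- the struct fields and helper name are my own
invention for this sketch, not the actual cpuset code:)

	/* A cpuset may be marked cpu_exclusive only if its cpus do
	 * not overlap the cpus of any of its siblings. */
	static int may_set_cpu_exclusive(struct cpuset *cs)
	{
		struct cpuset *sib;

		list_for_each_entry(sib, &cs->parent->children, sibling) {
			if (sib == cs)
				continue;
			if (cpus_intersects(sib->cpus_allowed,
						cs->cpus_allowed))
				return 0;	/* overlaps a sibling */
		}
		return 1;
	}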

But, that doesn't change what I wrote in my earlier mail,
because all these details are controller specific, and I do not see
any limitation from ckrm's point of view in this context.

> 
> The CKRM documentation and the names of key flags and such are
> not intuitive to those doing numa work.  If one comes at CKRM
> from the perspective of someone trying to solve a numa placement
> problem, the interfaces, documentation and naming really don't
> make sense.  Even if your architecture is more general and
> powerful, I suspect your presentation is not widely accessible
> outside those with a workload focus.  Or perhaps I'm just more
> dimwitted than most.  It's difficult for me to know which.
> But certainly both Matthew and I have struggled to make sense
> of CKRM from a numa perspective.

I agree. The filenames are not intuitive for cpuset purposes.

> 
> You state you'd have a 128 CPU limitation.  I don't know why
> that would be, but it would be a critical limitation for SGI --
> no small problem.

I understand it is critical for SGI. I said it is a small problem 
because it can be worked out easily.

> 
> As explained below, with your proposal, one could not readily do
> both workload management and numa placement at the same time,
> because the task class hierarchy needed for the two is not
> the same.
> 
> As noted above, while there seems to be a decent chance that
> cpusets will provide some benefit to scheduler domains, allowing
> the option of organizing sched domains along actual job usage
> lines instead of artificial architecture lines, I have seen
> no suggestion that CKRM task classes have that potential to
> improve sched domains.
> 
> Elsewhere I recall you've had to impose fairly modest bounds
> on the depth of your class hierarchy, because your resource
> balancing rules are expensive to evaluate across deep, large
> trees.  The cpuset hierarchy has no such restraint.

We put the limitation in the architecture because of the controllers.
We can open it up to allow a deeper hierarchy and let the controllers
decide how deep a hierarchy they can support.

> 
> Your task class hierarchy, if hijacked for numa placement,

I wasn't suggesting that the cpuset controller hijack ckrm's task
hierarchy, I was suggesting that it play within it.

Controllers don't hijack the hierarchy. The hierarchy is only for classes;
controllers have control over only their portion of a class.

> might provide the kernel managed naming, structure and
> access control of dynamic (soft) numa partitions that cpusets
> does.  I haven't looked closely at the permission model of
> CKRM to see if it matches the needs of cpusets, so I can't
> speak to that detail.

Are you talking about allowing users to manage their own classes/cpusets?
If so, we do have that.

> 
> In sum, your cpuset/memset CKRM proposal provides few, if any,
> of the additional benefits to numa placement work that cpusets
> provides over the existing affinity and numa system calls.
> 
> 
> 3. What are the additional costs of your proposal over cpusets?
> ===============================================================
> 
> Your proposal, while it seems to offer little advantage for
> numa placement to what we already have without cpusets, comes
> at a substantial cost greater than cpusets.
> 
> The CKRM patch is five times the size of the cpuset patch,
> with diffstat on the ckrm-e17.2610.patch showing:
> 
>   65 files changed, 13020 insertions(+), 19 deletions(-)

ckrm-e17 has the whole stack (core, rcfs, taskclass, socketclass, delay
accounting, rbce, crbce, numtasks controller and listenaq controller).

But, for your purposes or our discussion, one would need only 3 of those
modules (core, rcfs and taskclass). I just compared it with the broken-up
patches we posted on lkml recently. The whole stack has 12227 insertions,
of which only 4554 insertions correspond to the 3 modules listed.

> 
> The CKRM runtime, from what I can tell on the lmbench slide
> from OLS 2004, costs several percent of available cycles.

The graph you see in the presentation is with the CPU controller, not
the core ckrm. We don't have to include the CPU controller to get cpuset
working as a controller.

> 
> You propose to include the cpu/mem placement hierarchy in the
> task class hierarchy.  This presents difficulties.  Essentially,
> they are not the same hierarchies.  A job's placement is
> independent of its priority.  Both high and low priority jobs
> may well require proper numa placement, and both high and low
> priority tasks may well run within the same cpuset.
> 
> So if your task class hierarchy is hijacked for numa placement,
> it will not serve you well for workload management.  On a system
> that required numa placement using something like cpusets, the
> five times larger size of the kernel patch required for CKRM

As explained above, it is not 5 times larger.

> would be entirely unjustified, as CKRM would only be usable
> for its cpuset-like capabilities.
> 
> Much of what you have now in CKRM would be useless for cpuset
> work.  As you observed in your proposal, you would need new
> cpuset related rules for the subset and exclusive properties.

ckrm doesn't need new rules; the subset and exclusive property handling
will be the cpuset controller's responsibility.

> 
> Cpusets needs no new scheduler hook - it only needs the
> existing cpus_allowed check that Ingo already added, years ago.
> You propose having the scheduler check the appropriate cpu mask
> in the task class, which would definitely increase the cache
> footprint size of the scheduler.

Agreed, one more level of indirection (instead of task->cpuset->cpus_allowed
it will be task->taskclass->res[CPUSET]->cpus_allowed).

> 
> The papers for CKRM speak of providing policy driven
> classification and differentiated service.  The focus is on
> managing resource sharing, to allow different classes of tasks
> to get controlled allocations of proportions of shared resources.
> 
> Cpusets is not about sharing proportions of a common resource,
> but rather about dedicating entire resources.  Granted,
> mathematically, there might be a mapping between these two.
> But it is certainly an impediment to those having to understand
> something, if it is implemented by abusing something quite
> larger and quite foreign in intention.
> 
> This flows through to the names of the specific files in the
> directory representing a cpuset or class.  The names for CKRM
> class directories are necessarily rather generic and abstract,
> whereas those for cpusets directly represent the particular
> need of placing tasks on cpus and memory nodes.  For someone
> doing numa placement, the latter are much easier to understand.
> 
> And as noted above, since you can't do both at the same time
> (both use the CKRM infrastructure for its traditional workload
> management and use it for numa placement) it's not like the
> administrator of such a system gains any from the more abstract
> names, if they are just using it for cpusets (numa placement).
> 
> There is no synergy in the kernel hooks required in the scheduler
> and memory allocator.  The hooks required by cpusets check
> bitmasks in order to allow or prohibit scheduling a task on
> a CPU, or allocating a page from a particular node to a task.
> These are quite distinct from the hooks required by CKRM when
> used as a fair share scheduler and workload manager, which
> requires adding delays to tasks in order to obtain the desired
> proportion of resource usage between classes.  Similarly, the
> CKRM memory allocator hooks manage the number of pages in use
> by each task class and/or the rate of page faults, while the
> cpuset memory allocator hooks manage which memory nodes are
> available to satisfy an allocation request.

I think this is where we go tangential. When you say CKRM you refer
to the whole stack.

When we say CKRM, we mean only the framework (core, rcfs and taskclass or
socketclass).  It is the framework that enables the user to define classes
and classify tasks or sockets.

All the other modules are optional and exchangeable.

CKRM has different configurable modules, each with its own defined purpose.
One doesn't have to include a module if one doesn't need it.

> 
> The share usage hooks that monitor each resource, and its usage
> by each class, are useless for cpusets, which has no dependency
> on resource usage.  In cpusets, a task can use as much of its
> allowed CPUs and Memory Nodes as it wants, without throttling.  There is
> no feedback loop based on rates of resource usage per class.
> 
> Most of the hooks required by the CKRM classification engine to
> check for possible changes in a task's class, such as in fork,
> exec, setuid, listen, and other points where a kernel object
> might change are not needed for cpusets.  The cpuset patch only
> requires such state change hooks in fork, exit and allocation,
> and only needs to increment or decrement a usage count in
> fork and exit, and to check a generation number in allocation.
> 
> Cpusets has no use for a kernel classification engine.  Outside
> of the trivial, automatic propagation of cpusets in fork and
> exit, the only changes in cpusets are mandated from user space.
> 
> Nor do cpusets have any need for the kernel to support externally
> defined policy rules.  Cpusets has no use for the classification
> engine's callback mechanism.  In cpusets, no events that might
> affect state, such as fork, exit, reclassifications, changes in
> uid, or resource rate usage samples, need to be reported to any
> state agent, and there is no state agent, nor any communication
> channel thereto.
> 
> Cpusets has no use for a facility that lets server tasks tell
> some external classifier what phase they are operating in.
> Cpusets has no need for some workload manager to be sampling
> resource consumption and task state to determine resource
> consumption.  Cpusets has no need to track, in user space or
> kernel, the state of tasks after they exit. Cpusets has no use
> for delays nor for tracking them in the task struct.
> 
> Cpusets has no need for the hooks at the entry to, and exit from,
> memory allocation routines to distinguish delays due to memory
> allocation from those due to application i/o.  Cpusets has no
> need for sampling task state at fixed intervals, and our big
> iron scientific customers would without a doubt not tolerate a
> scan of the entire set of tasks every second for such resource
> and task state data collection.  Such a scan does _not_ scale
> well on big honkin numa boxes.  Whereas CKRM requires something
> like relayfs to pass back to user space the constant stream of
> such data, cpusets has no such needs and no such data.
> 
> Certainly, none of the network hooks that CKRM requires to
> provide differentiated service across priority classes would be
> of any use in a system (ab)using CKRM to provide cpuset style
> numa placement.

With the explanations above, I think you would now agree that all
the above comments no longer apply. Basically, you don't have to
bring those modules in if you don't need them.

> 
> It is true that both cpusets and CKRM make good use of the Linux
> kernel's virtual file system (vfs).  Cpusets uses vfs to model
> the hierarchy of 'soft partitions' in the system.  CKRM uses vfs
> to model a resource priority hierarchy, essentially replacing a
> single 'task priority' with hierarchical resource allocations,
> managing what proportion, out of what is available, of fungible
> resources such as ticks, cycles, bytes or data transfers a
> given class of tasks is allowed to use in the aggregate.
> 
> Just because two facilities use vfs is certainly not sufficient
> basis for deciding that they should be combined into one
> facility.
> 
> The shares and stats control files in each task_class
> directory are not needed by cpusets, but new control files,
> for cpus_allowed and mems_allowed are needed.  That, or the
> existing names have to be overloaded, at the cost of obfuscating
> the interface.

The shares file can accommodate these. But, for bigger configurations we
have to use some file based interface.

> 
> The kernel hooks for cpusets are fewer, simpler and more specific
> than those for CKRM.  Our high performance customers would want
> the cpuset hooks compiled in, not the more generic ones for
> CKRM (which they could not easily use for any other workload
> management purpose anyway, if the task class hierarchy were
> hijacked for the needs of cpusets, as noted above).
> 
> The development costs of cpusets so far, which are perhaps the
> best predictor we have of future costs, have been substantially
> lower than they have been for CKRM.

I think you have to compare the development cost of a resource
controller providing cpuset functionality, not that of ckrm itself.
> 
> In sum, your proposal costs a lot more than cpusets, by a variety
> of metrics.
> 
> =================================================
> 
> In summary, I find that your cpuset/memset CKRM proposal provides
> little or no benefit past the simpler cpu and memory placement
> calls already available, while costing substantially more in
> a variety of ways than my cpuset proposal, when evaluated for
> its usefulness for numa placement.
> 
> (Of course, if evaluated for suitability for workload management,
> the table is turned, and your CKRM patch provides essential
> capability that my cpuset patch could never dream of doing.)
> 
> Moreover, the additional workload management benefits that your
> CKRM facility provides, and that some of my customers might
> want to use in combination with numa placement, would probably
> become unavailable to them if we integrated cpusets and CKRM,
> because cpusets would have to hijack the task class hierarchy
> for its own nefarious purposes.
> 
> Such an attempt to integrate cpusets and CKRM would be a major
> setback for cpusets, substantially increasing its costs and
> reducing its value, probably well past the point of it even being
> worth pursuing further, in the mainstream kernel.  Adding all
> that foreign logic of cpusets to the CKRM patch probably
> wouldn't help CKRM much either.  The CKRM patch is already one
> that requires a bright mind and some careful thought to master.

If one reads the design and then looks at the broken down patches,
it may not be hard.

> Adding cpuset numa placement logic, which is typically different
> in detail, would add a complexity burden to the CKRM code that
> would serve no one well.
> 
> 
> > Note that I am not pitching for a marriage
> 
> We agree.
> 
> I just took more words to say it ').

The reasons we each give are very different, though. I meant that it
won't be a happy, productive marriage.

But I infer that you are suggesting that the species themselves are
different, which I do not agree with.

chandra
PS to everyone else: Wow, you have a lot of patience :)
> 
> 
> 
> -- 
>                   I won't rest till it's the best ...
>                   Programmer, Linux Scalability
>                   Paul Jackson <pj@sgi.com> 1.650.933.1373, 1.925.600.0401
> 

^ permalink raw reply	[flat|nested] 233+ messages in thread

* Re: [ckrm-tech] Re: [Lse-tech] [PATCH] cpusets - big numa cpu and memory placement
  2005-02-12  1:37                                               ` Chandra Seetharaman
@ 2005-02-12  6:16                                                 ` Paul Jackson
  0 siblings, 0 replies; 233+ messages in thread
From: Paul Jackson @ 2005-02-12  6:16 UTC (permalink / raw)
  To: Chandra Seetharaman
  Cc: colpatch, dino, mbligh, pwil3058, frankeh, dipankar, akpm,
	ckrm-tech, efocht, lse-tech, hch, steiner, jbarnes,
	sylvain.jeaugey, djh, linux-kernel, Simon.Derr, ak, sivanich

I agree with 97% of what you write, Chandra.


> one more level of indirection (instead of task->cpuset->cpus_allowed
> it will be task->taskclass->res[CPUSET]->cpus_allowed).

No -- two more levels of indirection (task->cpus_allowed becomes
task->taskclass->res[CPUSET]->cpus_allowed).
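
(Spelled out with illustrative declarations -- the taskclass/res[]
field names are my guess at the shape, not actual CKRM code -- the
scheduler's inner check goes from one direct load to a chain of
pointer chases:)

	/* today, and with cpusets: cpus_allowed sits in the task struct */
	static inline int task_allowed_on(struct task_struct *p, int cpu)
	{
		return cpu_isset(cpu, p->cpus_allowed);
	}

	/* under a CKRM cpuset controller: two extra dereferences,
	 * and their cache lines, on the same hot path */
	static inline int ckrm_task_allowed_on(struct task_struct *p, int cpu)
	{
		return cpu_isset(cpu, p->taskclass->res[CPUSET]->cpus_allowed);
	}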


> But, for your purposes or our discussions one would need only 3 modules
> of the above (core, rcfs and taskclass). 

Ok.  That was not obvious to me until now.  If there is a section in
your documentation that explains this, and addresses the needs and
motivations of someone trying to reuse portions of CKRM in such a
manner, I missed it.  Whatever ...

In any case, on the issue that matters to me right now, we agree:

> It won't be a happy, productive marriage.

Good.  Thanks.  Good luck to you.

> PS to everyone else: Wow, you have lot of patience :)

For sure.

-- 
                  I won't rest till it's the best ...
                  Programmer, Linux Scalability
                  Paul Jackson <pj@sgi.com> 1.650.933.1373, 1.925.600.0401

^ permalink raw reply	[flat|nested] 233+ messages in thread

end of thread, other threads:[~2005-02-12  6:16 UTC | newest]

Thread overview: 233+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2004-08-05 10:08 [PATCH] new bitmap list format (for cpusets) Paul Jackson
2004-08-05 10:10 ` [PATCH] cpusets - big numa cpu and memory placement Paul Jackson
2004-08-05 20:55   ` [Lse-tech] " Martin J. Bligh
2004-08-06  2:05     ` Paul Jackson
2004-08-06  3:24       ` Martin J. Bligh
2004-08-06  8:31         ` Paul Jackson
2004-08-06 15:30         ` Erich Focht
2004-08-06 15:35           ` Martin J. Bligh
2004-08-06 15:48             ` Hubertus Franke
2004-08-07  6:30               ` Paul Jackson
2004-08-07  6:45               ` Paul Jackson
2004-08-06 15:49             ` Hubertus Franke
2004-08-06 15:52             ` Hubertus Franke
2004-08-06 15:55             ` Erich Focht
2004-08-07  6:10           ` Paul Jackson
2004-08-07 15:22             ` Erich Focht
2004-08-07 18:59               ` Paul Jackson
2004-08-08  3:17               ` Paul Jackson
2004-08-08 14:50               ` Martin J. Bligh
2004-08-11  0:43                 ` Paul Jackson
2004-08-11  9:40                 ` Erich Focht
2004-08-11 14:49                   ` Martin J. Bligh
2004-08-11 17:50                     ` Paul Jackson
2004-08-11 21:12                       ` Shailabh Nagar
2004-08-12  7:15                         ` Paul Jackson
2004-08-12 12:58                           ` Jack Steiner
2004-08-12 14:50                           ` Martin J. Bligh
2004-08-11 15:12                   ` Shailabh Nagar
2004-08-08 20:22               ` Shailabh Nagar
2004-08-09 15:57                 ` Hubertus Franke
2004-08-10 11:31                   ` [ckrm-tech] " Paul Jackson
2004-08-10 22:38                     ` Shailabh Nagar
2004-08-11 10:42                       ` Erich Focht
2004-08-11 14:56                         ` Shailabh Nagar
2004-08-14  8:51                       ` Paul Jackson
2004-08-08 19:58             ` Shailabh Nagar
2004-10-01 23:41               ` Andrew Morton
2004-10-02  6:06                 ` Paul Jackson
2004-10-02 14:55                   ` Dipankar Sarma
2004-10-02 16:14                     ` Hubertus Franke
2004-10-02 18:04                       ` Paul Jackson
2004-10-02 23:21                       ` Peter Williams
2004-10-02 23:44                         ` Hubertus Franke
2004-10-03  0:00                           ` Peter Williams
2004-10-03  3:44                           ` Paul Jackson
2004-10-05  3:13                           ` [ckrm-tech] " Matthew Helsley
2004-10-05  8:30                             ` Hubertus Franke
2004-10-05 14:20                               ` Paul Jackson
2004-10-03  2:59                         ` Paul Jackson
2004-10-03  3:19                         ` Paul Jackson
2004-10-03  3:53                           ` Peter Williams
2004-10-03  4:47                             ` Paul Jackson
2004-10-03  5:12                               ` Peter Williams
2004-10-03  5:39                                 ` Paul Jackson
2004-10-03  4:02                           ` Paul Jackson
2004-10-03  3:39                         ` Paul Jackson
2004-10-03 14:36                         ` Martin J. Bligh
2004-10-03 15:39                           ` Paul Jackson
2004-10-03 23:53                             ` Martin J. Bligh
2004-10-04  0:02                               ` Martin J. Bligh
2004-10-04  0:53                                 ` Paul Jackson
2004-10-04  3:56                                   ` Martin J. Bligh
2004-10-04  4:24                                     ` Paul Jackson
2004-10-04 15:03                                       ` Martin J. Bligh
2004-10-04 15:53                                         ` [ckrm-tech] " Paul Jackson
2004-10-04 18:17                                           ` Martin J. Bligh
2004-10-04 20:25                                             ` Paul Jackson
2004-10-04 22:15                                               ` Martin J. Bligh
2004-10-05  9:17                                                 ` Paul Jackson
2004-10-05 10:01                                                   ` Paul Jackson
2004-10-05 22:24                                                   ` Matthew Dobson
2004-10-05  9:26                                         ` Simon Derr
2004-10-05  9:58                                           ` Paul Jackson
2004-10-05 19:34                                           ` Martin J. Bligh
2004-10-06  0:28                                             ` Paul Jackson
2004-10-06  1:16                                               ` Martin J. Bligh
2004-10-06  2:08                                                 ` Paul Jackson
2004-10-06 22:59                                                   ` Matthew Dobson
2004-10-06 23:23                                                     ` Peter Williams
2004-10-07  0:16                                                       ` Rick Lindsley
2004-10-07 18:27                                                         ` Paul Jackson
2004-10-07  8:51                                                     ` Paul Jackson
2004-10-07 10:53                                                       ` Rick Lindsley
2004-10-07 14:41                                                         ` Martin J. Bligh
     [not found]                                                         ` <20041007072842.2bafc320.pj@sgi.com>
2004-10-07 19:05                                                           ` Rick Lindsley
2004-10-10  2:15                                                             ` [ckrm-tech] " Paul Jackson
2004-10-11 22:06                                                               ` Matthew Dobson
2004-10-11 22:58                                                                 ` Paul Jackson
2004-10-12 21:22                                                                   ` Matthew Dobson
2004-10-12  8:50                                                                 ` Simon Derr
2004-10-12 21:25                                                                   ` Matthew Dobson
2004-10-10  2:28                                                             ` Paul Jackson
2004-10-09  0:06                                                           ` Matthew Dobson
     [not found]                                                           ` <4165A31E.4070905@watson.ibm.com>
2004-10-08 13:14                                                             ` Paul Jackson
2004-10-08 15:42                                                               ` Hubertus Franke
2004-10-08 18:23                                                                 ` Paul Jackson
2004-10-09  1:00                                                                   ` Matthew Dobson
2004-10-09 20:08                                                                     ` [Lse-tech] " Paul Jackson
2004-10-11 22:16                                                                       ` Matthew Dobson
2004-10-11 22:42                                                                         ` Paul Jackson
2004-10-10  0:05                                                                     ` Paul Jackson
2004-10-11 22:18                                                                       ` Matthew Dobson
2004-10-11 22:39                                                                         ` Paul Jackson
2004-10-09  0:51                                                               ` Matthew Dobson
2004-10-10  0:50                                                                 ` [Lse-tech] " Paul Jackson
2004-10-10  0:59                                                                 ` Paul Jackson
2004-10-09  0:22                                                             ` Matthew Dobson
2004-10-12 22:24                                                               ` [Lse-tech] " Hanna Linder
2004-10-13 20:56                                                                 ` Matthew Dobson
2004-10-07 12:47                                                       ` [Lse-tech] " Simon Derr
2004-10-07 14:49                                                         ` Martin J. Bligh
2004-10-07 17:54                                                           ` Paul Jackson
2004-10-07 18:13                                                             ` Martin J. Bligh
2004-10-08  9:23                                                               ` Erich Focht
2004-10-08  9:50                                                                 ` Andrew Morton
2004-10-08 10:40                                                                   ` Erich Focht
2004-10-08 14:26                                                                     ` Martin J. Bligh
2004-10-08  9:53                                                                 ` Nick Piggin
2004-10-08 11:40                                                                   ` Erich Focht
2004-10-08 14:24                                                                 ` Martin J. Bligh
2004-10-08 22:37                                                                   ` Erich Focht
2004-10-14 10:35                                                               ` Eric W. Biederman
2004-10-14 11:22                                                                 ` Erich Focht
2004-10-14 11:23                                                                 ` Paul Jackson
2004-10-14 19:39                                                                 ` Paul Jackson
2004-10-14 22:38                                                                   ` Hubertus Franke
2004-10-15  1:26                                                                     ` Paul Jackson
2004-10-07 18:25                                                             ` Andrew Morton
2004-10-07 19:52                                                               ` Paul Jackson
2004-10-07 21:04                                                                 ` [ckrm-tech] " Matthew Helsley
2004-10-10  3:22                                                               ` Paul Jackson
2004-10-07 19:16                                                             ` Rick Lindsley
2004-10-10  2:35                                                               ` Paul Jackson
2004-10-10  5:12                                                           ` [ckrm-tech] " Paul Jackson
2004-10-08 23:48                                                       ` Matthew Dobson
2004-10-09  0:18                                                         ` Nick Piggin
2004-10-11 23:00                                                           ` Matthew Dobson
2004-10-11 23:09                                                             ` Nick Piggin
2004-10-05 22:33                                           ` Matthew Dobson
2004-10-06  3:01                                             ` Paul Jackson
2004-10-06 23:12                                               ` Matthew Dobson
2004-10-07  8:59                                                 ` [ckrm-tech] " Paul Jackson
2004-10-04  0:45                               ` Paul Jackson
2004-10-04 11:44                                 ` Rick Lindsley
2004-10-04 22:46                                   ` [ckrm-tech] " Paul Jackson
2004-10-05 22:19                               ` Matthew Dobson
2004-10-06  2:39                                 ` Paul Jackson
2004-10-06 23:21                                   ` Matthew Dobson
2004-10-07  9:41                                     ` [ckrm-tech] " Paul Jackson
2004-10-06  2:47                                 ` Paul Jackson
2004-10-06  9:43                                   ` Simon Derr
2004-10-06 13:27                                     ` Paul Jackson
2004-10-06 21:55                                     ` Peter Williams
2004-10-06 22:49                                       ` Paul Jackson
2004-10-06  8:02                                 ` Simon Derr
2005-02-07 23:59                                 ` Matthew Dobson
2005-02-08  0:20                                   ` Andrew Morton
2005-02-08  0:34                                     ` Paul Jackson
2005-02-08  9:54                                   ` Dinakar Guniguntala
2005-02-08  9:49                                     ` Nick Piggin
2005-02-08 16:13                                       ` Martin J. Bligh
2005-02-08 23:26                                         ` Nick Piggin
2005-02-09  4:23                                           ` Paul Jackson
2005-02-08 19:32                                       ` Matthew Dobson
2005-02-09  2:53                                         ` Nick Piggin
2005-02-08 19:00                                     ` Matthew Dobson
2005-02-08 20:42                                       ` Paul Jackson
2005-02-08 22:14                                         ` Matthew Dobson
2005-02-08 23:58                                           ` Shailabh Nagar
2005-02-09  0:27                                             ` Paul Jackson
2005-02-09  0:24                                           ` Paul Jackson
2005-02-09 17:59                                         ` [ckrm-tech] " Chandra Seetharaman
2005-02-11  2:46                                           ` Chandra Seetharaman
2005-02-11  9:21                                             ` Paul Jackson
2005-02-12  1:37                                               ` Chandra Seetharaman
2005-02-12  6:16                                                 ` Paul Jackson
2005-02-11 16:54                                             ` Jesse Barnes
2005-02-11 18:42                                               ` Chandra Seetharaman
2005-02-11 18:50                                                 ` Jesse Barnes
2005-02-08 16:15                                   ` Martin J. Bligh
2005-02-08 22:17                                     ` Matthew Dobson
2004-10-03 16:02                           ` Paul Jackson
2004-10-03 23:47                             ` Martin J. Bligh
2004-10-04  3:33                               ` Paul Jackson
2004-10-03 20:10                           ` Tim Hockin
2004-10-04  1:56                             ` Paul Jackson
2004-10-03  3:35                     ` Paul Jackson
2004-10-03 20:21                   ` Erich Focht
2004-10-03 20:48                     ` Andrew Morton
2004-10-04 14:05                       ` Erich Focht
2004-10-04 14:57                         ` Martin J. Bligh
2004-10-04 15:30                           ` Paul Jackson
2004-10-04 15:41                             ` Martin J. Bligh
2004-10-04 16:02                               ` Paul Jackson
2004-10-04 18:19                                 ` Martin J. Bligh
2004-10-04 18:29                                   ` Paul Jackson
2004-10-04 15:38                           ` Paul Jackson
2004-10-04 16:46                           ` Paul Jackson
2004-10-04  3:41                     ` Paul Jackson
2004-10-04 13:58                     ` Hubertus Franke
2004-10-04 14:13                       ` Simon Derr
2004-10-04 14:15                       ` Erich Focht
2004-10-04 15:23                         ` Paul Jackson
2004-10-04 14:37                       ` Paul Jackson
2004-10-02 15:46                 ` [ckrm-tech] " Marc E. Fiuczynski
2004-10-02 16:17                   ` Hubertus Franke
2004-10-02 17:53                     ` Paul Jackson
2004-10-02 18:16                       ` Hubertus Franke
2004-10-02 19:14                         ` Paul Jackson
2004-10-02 23:29                         ` Peter Williams
2004-10-02 23:51                           ` Hubertus Franke
2004-10-02 20:40                     ` Andrew Morton
2004-10-02 23:08                       ` Hubertus Franke
2004-10-02 22:26                         ` Alan Cox
2004-10-03  2:49                         ` Paul Jackson
2004-10-03 12:19                           ` Hubertus Franke
2004-10-03  3:25                         ` Paul Jackson
2004-10-03  2:26                       ` Paul Jackson
2004-10-03 14:11                         ` Paul Jackson
2004-10-02 17:47                   ` Paul Jackson
2004-08-05 20:47 ` [Lse-tech] [PATCH] new bitmap list format (for cpusets) Martin J. Bligh
2004-08-05 21:45   ` Paul Jackson
     [not found]     ` <Pine.A41.4.53.0408060930100.20680@isabelle.frec.bull.fr>
2004-08-06 10:14       ` Paul Jackson
2004-08-09  8:01   ` Paul Jackson
2004-08-09 14:49     ` Martin J. Bligh
2004-08-10 23:43       ` Paul Jackson
2004-08-11 13:11 ` Dinakar Guniguntala
2004-08-11 16:17   ` Paul Jackson
2004-08-11 18:05     ` Dinakar Guniguntala
2004-08-11 20:40       ` Paul Jackson
2004-08-12  9:48         ` Dinakar Guniguntala
2004-08-12 10:11           ` Paul Jackson
2004-08-12 12:34             ` Dinakar Guniguntala

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).