linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [patch 1/6] mempolicy: convert MPOL constants to enum
@ 2008-03-01  0:44 David Rientjes
  2008-03-01  0:45 ` [patch 2/6] mempolicy: support optional mode flags David Rientjes
  2008-03-03  7:17 ` [patch 1/6] mempolicy: convert MPOL constants to enum Andrew Morton
  0 siblings, 2 replies; 22+ messages in thread
From: David Rientjes @ 2008-03-01  0:44 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Paul Jackson, Christoph Lameter, Lee Schermerhorn, Andi Kleen,
	linux-kernel

The mempolicy mode constants, MPOL_DEFAULT, MPOL_PREFERRED, MPOL_BIND,
and MPOL_INTERLEAVE, are better declared as part of an enum since they
are sequentially numbered and cannot be combined.

The policy member of struct mempolicy is also converted from type short
to type unsigned short.  A negative policy does not have any legitimate
meaning, so it is possible to change its type in preparation for adding
optional mode flags later.

The equivalent member of struct shmem_sb_info is also changed from int
to unsigned short.

For compatibility, the policy formal to get_mempolicy() remains as a
pointer to an int:

	int get_mempolicy(int *policy, unsigned long *nmask,
			  unsigned long maxnode, unsigned long addr,
			  unsigned long flags);

although the only possible values is the range of type unsigned short.

Cc: Paul Jackson <pj@sgi.com>
Cc: Christoph Lameter <clameter@sgi.com>
Cc: Lee Schermerhorn <Lee.Schermerhorn@hp.com>
Cc: Andi Kleen <ak@suse.de>
Signed-off-by: David Rientjes <rientjes@google.com>
---
 include/linux/mempolicy.h |   19 ++++++++++---------
 include/linux/shmem_fs.h  |    2 +-
 mm/mempolicy.c            |   27 ++++++++++++++++-----------
 mm/shmem.c                |    9 +++++----
 4 files changed, 32 insertions(+), 25 deletions(-)

diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h
--- a/include/linux/mempolicy.h
+++ b/include/linux/mempolicy.h
@@ -9,12 +9,13 @@
  */
 
 /* Policies */
-#define MPOL_DEFAULT	0
-#define MPOL_PREFERRED	1
-#define MPOL_BIND	2
-#define MPOL_INTERLEAVE	3
-
-#define MPOL_MAX MPOL_INTERLEAVE
+enum {
+	MPOL_DEFAULT,
+	MPOL_PREFERRED,
+	MPOL_BIND,
+	MPOL_INTERLEAVE,
+	MPOL_MAX,	/* always last member of enum */
+};
 
 /* Flags for get_mem_policy */
 #define MPOL_F_NODE	(1<<0)	/* return next IL mode instead of node mask */
@@ -62,7 +63,7 @@ struct mm_struct;
  */
 struct mempolicy {
 	atomic_t refcnt;
-	short policy; 	/* See MPOL_* above */
+	unsigned short policy; 	/* See MPOL_* above */
 	union {
 		struct zonelist  *zonelist;	/* bind */
 		short 		 preferred_node; /* preferred */
@@ -133,7 +134,7 @@ struct shared_policy {
 	spinlock_t lock;
 };
 
-void mpol_shared_policy_init(struct shared_policy *info, int policy,
+void mpol_shared_policy_init(struct shared_policy *info, unsigned short policy,
 				nodemask_t *nodes);
 int mpol_set_shared_policy(struct shared_policy *info,
 				struct vm_area_struct *vma,
@@ -200,7 +201,7 @@ static inline int mpol_set_shared_policy(struct shared_policy *info,
 }
 
 static inline void mpol_shared_policy_init(struct shared_policy *info,
-					int policy, nodemask_t *nodes)
+				unsigned short policy, nodemask_t *nodes)
 {
 }
 
diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h
--- a/include/linux/shmem_fs.h
+++ b/include/linux/shmem_fs.h
@@ -34,7 +34,7 @@ struct shmem_sb_info {
 	uid_t uid;		    /* Mount uid for root directory */
 	gid_t gid;		    /* Mount gid for root directory */
 	mode_t mode;		    /* Mount mode for root directory */
-	int policy;		    /* Default NUMA memory alloc policy */
+	unsigned short policy;	    /* Default NUMA memory alloc policy */
 	nodemask_t policy_nodes;    /* nodemask for preferred and bind */
 };
 
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -114,7 +114,7 @@ static void mpol_rebind_policy(struct mempolicy *pol,
                                const nodemask_t *newmask);
 
 /* Do sanity checking on a policy */
-static int mpol_check_policy(int mode, nodemask_t *nodes)
+static int mpol_check_policy(unsigned short mode, nodemask_t *nodes)
 {
 	int was_empty, is_empty;
 
@@ -159,6 +159,8 @@ static int mpol_check_policy(int mode, nodemask_t *nodes)
 		if (!was_empty && is_empty)
 			return -EINVAL;
 		break;
+	default:
+		BUG();
 	}
 	return 0;
 }
@@ -201,7 +203,7 @@ static struct zonelist *bind_zonelist(nodemask_t *nodes)
 }
 
 /* Create a new policy */
-static struct mempolicy *mpol_new(int mode, nodemask_t *nodes)
+static struct mempolicy *mpol_new(unsigned short mode, nodemask_t *nodes)
 {
 	struct mempolicy *policy;
 
@@ -235,6 +237,8 @@ static struct mempolicy *mpol_new(int mode, nodemask_t *nodes)
 			return error_code;
 		}
 		break;
+	default:
+		BUG();
 	}
 	policy->policy = mode;
 	policy->cpuset_mems_allowed = cpuset_mems_allowed(current);
@@ -479,7 +483,7 @@ static void mpol_set_task_struct_flag(void)
 }
 
 /* Set the process memory policy */
-static long do_set_mempolicy(int mode, nodemask_t *nodes)
+static long do_set_mempolicy(unsigned short mode, nodemask_t *nodes)
 {
 	struct mempolicy *new;
 
@@ -781,7 +785,7 @@ static struct page *new_vma_page(struct page *page, unsigned long private, int *
 #endif
 
 static long do_mbind(unsigned long start, unsigned long len,
-		     unsigned long mode, nodemask_t *nmask,
+		     unsigned short mode, nodemask_t *nmask,
 		     unsigned long flags)
 {
 	struct vm_area_struct *vma;
@@ -791,9 +795,8 @@ static long do_mbind(unsigned long start, unsigned long len,
 	int err;
 	LIST_HEAD(pagelist);
 
-	if ((flags & ~(unsigned long)(MPOL_MF_STRICT |
+	if (flags & ~(unsigned long)(MPOL_MF_STRICT |
 				      MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
-	    || mode > MPOL_MAX)
 		return -EINVAL;
 	if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
 		return -EPERM;
@@ -826,7 +829,7 @@ static long do_mbind(unsigned long start, unsigned long len,
 	if (!new)
 		flags |= MPOL_MF_DISCONTIG_OK;
 
-	pr_debug("mbind %lx-%lx mode:%ld nodes:%lx\n",start,start+len,
+	pr_debug("mbind %lx-%lx mode:%d nodes:%lx\n",start,start+len,
 		 mode, nmask ? nodes_addr(*nmask)[0] : -1);
 
 	down_write(&mm->mmap_sem);
@@ -927,6 +930,8 @@ asmlinkage long sys_mbind(unsigned long start, unsigned long len,
 	nodemask_t nodes;
 	int err;
 
+	if (mode >= MPOL_MAX)
+		return -EINVAL;
 	err = get_nodes(&nodes, nmask, maxnode);
 	if (err)
 		return err;
@@ -940,7 +945,7 @@ asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask,
 	int err;
 	nodemask_t nodes;
 
-	if (mode < 0 || mode > MPOL_MAX)
+	if (mode < 0 || mode >= MPOL_MAX)
 		return -EINVAL;
 	err = get_nodes(&nodes, nmask, maxnode);
 	if (err)
@@ -1206,7 +1211,7 @@ static unsigned interleave_nodes(struct mempolicy *policy)
  */
 unsigned slab_node(struct mempolicy *policy)
 {
-	int pol = policy ? policy->policy : MPOL_DEFAULT;
+	unsigned short pol = policy ? policy->policy : MPOL_DEFAULT;
 
 	switch (pol) {
 	case MPOL_INTERLEAVE:
@@ -1634,7 +1639,7 @@ restart:
 	return 0;
 }
 
-void mpol_shared_policy_init(struct shared_policy *info, int policy,
+void mpol_shared_policy_init(struct shared_policy *info, unsigned short policy,
 				nodemask_t *policy_nodes)
 {
 	info->root = RB_ROOT;
@@ -1853,7 +1858,7 @@ static inline int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
 	char *p = buffer;
 	int l;
 	nodemask_t nodes;
-	int mode = pol ? pol->policy : MPOL_DEFAULT;
+	unsigned short mode = pol ? pol->policy : MPOL_DEFAULT;
 
 	switch (mode) {
 	case MPOL_DEFAULT:
diff --git a/mm/shmem.c b/mm/shmem.c
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1082,7 +1082,8 @@ redirty:
 
 #ifdef CONFIG_NUMA
 #ifdef CONFIG_TMPFS
-static int shmem_parse_mpol(char *value, int *policy, nodemask_t *policy_nodes)
+static int shmem_parse_mpol(char *value, unsigned short *policy,
+			    nodemask_t *policy_nodes)
 {
 	char *nodelist = strchr(value, ':');
 	int err = 1;
@@ -1131,7 +1132,7 @@ out:
 	return err;
 }
 
-static void shmem_show_mpol(struct seq_file *seq, int policy,
+static void shmem_show_mpol(struct seq_file *seq, unsigned short policy,
 			    const nodemask_t policy_nodes)
 {
 	char *policy_string;
@@ -1200,13 +1201,13 @@ static struct page *shmem_alloc_page(gfp_t gfp,
 }
 #else /* !CONFIG_NUMA */
 #ifdef CONFIG_TMPFS
-static inline int shmem_parse_mpol(char *value, int *policy,
+static inline int shmem_parse_mpol(char *value, unsigned short *policy,
 						nodemask_t *policy_nodes)
 {
 	return 1;
 }
 
-static inline void shmem_show_mpol(struct seq_file *seq, int policy,
+static inline void shmem_show_mpol(struct seq_file *seq, unsigned short policy,
 			    const nodemask_t policy_nodes)
 {
 }

^ permalink raw reply	[flat|nested] 22+ messages in thread

* [patch 2/6] mempolicy: support optional mode flags
  2008-03-01  0:44 [patch 1/6] mempolicy: convert MPOL constants to enum David Rientjes
@ 2008-03-01  0:45 ` David Rientjes
  2008-03-01  0:45   ` [patch 3/6] mempolicy: add MPOL_F_STATIC_NODES flag David Rientjes
  2008-03-03  7:17 ` [patch 1/6] mempolicy: convert MPOL constants to enum Andrew Morton
  1 sibling, 1 reply; 22+ messages in thread
From: David Rientjes @ 2008-03-01  0:45 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Paul Jackson, Christoph Lameter, Lee Schermerhorn, Andi Kleen,
	linux-kernel

With the evolution of mempolicies, it is necessary to support mempolicy
mode flags that specify how the policy shall behave in certain
circumstances.  The most immediate need for mode flag support is to
suppress remapping the nodemask of a policy at the time of rebind.

Both the mempolicy mode and flags are passed by the user in the 'int
policy' formal of either the set_mempolicy() or mbind() syscall.  A new
constant, MPOL_MODE_FLAGS, represents the union of legal optional flags
that may be passed as part of this int.  Mempolicies that include illegal
flags as part of their policy are rejected as invalid.

An additional member to struct mempolicy is added to support the mode
flags:

	struct mempolicy {
		...
		unsigned short policy;
		unsigned short flags;
	}

The splitting of the 'int' actual passed by the user is done in
sys_set_mempolicy() and sys_mbind() for their respective syscalls.  This
is done by intersecting the actual with MPOL_MODE_FLAGS, rejecting the
syscall of there are additional flags, and storing it in the new 'flags'
member of struct mempolicy.  The intersection of the actual with
~MPOL_MODE_FLAGS is stored in the 'policy' member of the struct and all
current users of pol->policy remain unchanged.

The union of the policy mode and optional mode flags is passed back to
the user in get_mempolicy().

This combination of mode and flags within the same actual does not break 
userspace code that relies on get_mempolicy(&policy, ...) and either

	switch (policy) {
	case MPOL_BIND:
		...
	case MPOL_INTERLEAVE:
		...
	};

statements or

	if (policy == MPOL_INTERLEAVE) {
		...
	}

statements.  Such applications would need to use optional mode flags when
calling set_mempolicy() or mbind() for these previously implemented
statements to stop working.  If an application does start using optional
mode flags, it will need to mask the optional flags off the policy in
switch and conditional statements that only test mode.

An additional member is also added to struct shmem_sb_info to store the
optional mode flags.

Cc: Paul Jackson <pj@sgi.com>
Cc: Christoph Lameter <clameter@sgi.com>
Cc: Lee Schermerhorn <Lee.Schermerhorn@hp.com>
Cc: Andi Kleen <ak@suse.de>
Signed-off-by: David Rientjes <rientjes@google.com>
---
 fs/hugetlbfs/inode.c      |    2 +-
 include/linux/mempolicy.h |   20 +++++++++++++++--
 include/linux/shmem_fs.h  |    1 +
 mm/mempolicy.c            |   51 +++++++++++++++++++++++++++-----------------
 mm/shmem.c                |   24 ++++++++++++++-------
 5 files changed, 66 insertions(+), 32 deletions(-)

diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -504,7 +504,7 @@ static struct inode *hugetlbfs_get_inode(struct super_block *sb, uid_t uid,
 		inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
 		INIT_LIST_HEAD(&inode->i_mapping->private_list);
 		info = HUGETLBFS_I(inode);
-		mpol_shared_policy_init(&info->policy, MPOL_DEFAULT, NULL);
+		mpol_shared_policy_init(&info->policy, MPOL_DEFAULT, 0, NULL);
 		switch (mode & S_IFMT) {
 		default:
 			init_special_inode(inode, mode, dev);
diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h
--- a/include/linux/mempolicy.h
+++ b/include/linux/mempolicy.h
@@ -8,6 +8,12 @@
  * Copyright 2003,2004 Andi Kleen SuSE Labs
  */
 
+/*
+ * Both the MPOL_* mempolicy mode and the MPOL_F_* optional mode flags are
+ * passed by the user to either set_mempolicy() or mbind() in an 'int' actual.
+ * The MPOL_MODE_FLAGS macro determines the legal set of optional mode flags.
+ */
+
 /* Policies */
 enum {
 	MPOL_DEFAULT,
@@ -17,7 +23,14 @@ enum {
 	MPOL_MAX,	/* always last member of enum */
 };
 
-/* Flags for get_mem_policy */
+/* Flags for set_mempolicy */
+/*
+ * MPOL_MODE_FLAGS is the union of all possible optional mode flags passed to
+ * either set_mempolicy() or mbind().
+ */
+#define MPOL_MODE_FLAGS		(0)
+
+/* Flags for get_mempolicy */
 #define MPOL_F_NODE	(1<<0)	/* return next IL mode instead of node mask */
 #define MPOL_F_ADDR	(1<<1)	/* look up vma using address */
 #define MPOL_F_MEMS_ALLOWED (1<<2) /* return allowed memories */
@@ -64,6 +77,7 @@ struct mm_struct;
 struct mempolicy {
 	atomic_t refcnt;
 	unsigned short policy; 	/* See MPOL_* above */
+	unsigned short flags;	/* See set_mempolicy() MPOL_F_* above */
 	union {
 		struct zonelist  *zonelist;	/* bind */
 		short 		 preferred_node; /* preferred */
@@ -135,7 +149,7 @@ struct shared_policy {
 };
 
 void mpol_shared_policy_init(struct shared_policy *info, unsigned short policy,
-				nodemask_t *nodes);
+				unsigned short flags, nodemask_t *nodes);
 int mpol_set_shared_policy(struct shared_policy *info,
 				struct vm_area_struct *vma,
 				struct mempolicy *new);
@@ -201,7 +215,7 @@ static inline int mpol_set_shared_policy(struct shared_policy *info,
 }
 
 static inline void mpol_shared_policy_init(struct shared_policy *info,
-				unsigned short policy, nodemask_t *nodes)
+		unsigned short policy, unsigned short flags, nodemask_t *nodes)
 {
 }
 
diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h
--- a/include/linux/shmem_fs.h
+++ b/include/linux/shmem_fs.h
@@ -35,6 +35,7 @@ struct shmem_sb_info {
 	gid_t gid;		    /* Mount gid for root directory */
 	mode_t mode;		    /* Mount mode for root directory */
 	unsigned short policy;	    /* Default NUMA memory alloc policy */
+	unsigned short flags;	    /* Optional mempolicy flags */
 	nodemask_t policy_nodes;    /* nodemask for preferred and bind */
 };
 
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -203,12 +203,13 @@ static struct zonelist *bind_zonelist(nodemask_t *nodes)
 }
 
 /* Create a new policy */
-static struct mempolicy *mpol_new(unsigned short mode, nodemask_t *nodes)
+static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
+				  nodemask_t *nodes)
 {
 	struct mempolicy *policy;
 
-	pr_debug("setting mode %d nodes[0] %lx\n",
-		 mode, nodes ? nodes_addr(*nodes)[0] : -1);
+	pr_debug("setting mode %d flags %d nodes[0] %lx\n",
+		 mode, flags, nodes ? nodes_addr(*nodes)[0] : -1);
 
 	if (mode == MPOL_DEFAULT)
 		return NULL;
@@ -241,6 +242,7 @@ static struct mempolicy *mpol_new(unsigned short mode, nodemask_t *nodes)
 		BUG();
 	}
 	policy->policy = mode;
+	policy->flags = flags;
 	policy->cpuset_mems_allowed = cpuset_mems_allowed(current);
 	return policy;
 }
@@ -483,13 +485,14 @@ static void mpol_set_task_struct_flag(void)
 }
 
 /* Set the process memory policy */
-static long do_set_mempolicy(unsigned short mode, nodemask_t *nodes)
+static long do_set_mempolicy(unsigned short mode, unsigned short flags,
+			     nodemask_t *nodes)
 {
 	struct mempolicy *new;
 
 	if (mpol_check_policy(mode, nodes))
 		return -EINVAL;
-	new = mpol_new(mode, nodes);
+	new = mpol_new(mode, flags, nodes);
 	if (IS_ERR(new))
 		return PTR_ERR(new);
 	mpol_free(current->mempolicy);
@@ -595,7 +598,7 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask,
 			goto out;
 		}
 	} else
-		*policy = pol->policy;
+		*policy = pol->policy | pol->flags;
 
 	if (vma) {
 		up_read(&current->mm->mmap_sem);
@@ -785,8 +788,8 @@ static struct page *new_vma_page(struct page *page, unsigned long private, int *
 #endif
 
 static long do_mbind(unsigned long start, unsigned long len,
-		     unsigned short mode, nodemask_t *nmask,
-		     unsigned long flags)
+		     unsigned short mode, unsigned short mode_flags,
+		     nodemask_t *nmask, unsigned long flags)
 {
 	struct vm_area_struct *vma;
 	struct mm_struct *mm = current->mm;
@@ -818,7 +821,7 @@ static long do_mbind(unsigned long start, unsigned long len,
 	if (mpol_check_policy(mode, nmask))
 		return -EINVAL;
 
-	new = mpol_new(mode, nmask);
+	new = mpol_new(mode, mode_flags, nmask);
 	if (IS_ERR(new))
 		return PTR_ERR(new);
 
@@ -829,8 +832,9 @@ static long do_mbind(unsigned long start, unsigned long len,
 	if (!new)
 		flags |= MPOL_MF_DISCONTIG_OK;
 
-	pr_debug("mbind %lx-%lx mode:%d nodes:%lx\n",start,start+len,
-		 mode, nmask ? nodes_addr(*nmask)[0] : -1);
+	pr_debug("mbind %lx-%lx mode:%d flags:%d nodes:%lx\n",
+		 start, start + len, mode, mode_flags,
+		 nmask ? nodes_addr(*nmask)[0] : -1);
 
 	down_write(&mm->mmap_sem);
 	vma = check_range(mm, start, end, nmask,
@@ -929,13 +933,16 @@ asmlinkage long sys_mbind(unsigned long start, unsigned long len,
 {
 	nodemask_t nodes;
 	int err;
+	unsigned short mode_flags;
 
+	mode_flags = mode & MPOL_MODE_FLAGS;
+	mode &= ~MPOL_MODE_FLAGS;
 	if (mode >= MPOL_MAX)
 		return -EINVAL;
 	err = get_nodes(&nodes, nmask, maxnode);
 	if (err)
 		return err;
-	return do_mbind(start, len, mode, &nodes, flags);
+	return do_mbind(start, len, mode, mode_flags, &nodes, flags);
 }
 
 /* Set the process memory policy */
@@ -944,13 +951,16 @@ asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask,
 {
 	int err;
 	nodemask_t nodes;
+	unsigned short flags;
 
-	if (mode < 0 || mode >= MPOL_MAX)
+	flags = mode & MPOL_MODE_FLAGS;
+	mode &= ~MPOL_MODE_FLAGS;
+	if ((unsigned int)mode >= MPOL_MAX)
 		return -EINVAL;
 	err = get_nodes(&nodes, nmask, maxnode);
 	if (err)
 		return err;
-	return do_set_mempolicy(mode, &nodes);
+	return do_set_mempolicy(mode, flags, &nodes);
 }
 
 asmlinkage long sys_migrate_pages(pid_t pid, unsigned long maxnode,
@@ -1640,7 +1650,7 @@ restart:
 }
 
 void mpol_shared_policy_init(struct shared_policy *info, unsigned short policy,
-				nodemask_t *policy_nodes)
+			unsigned short flags, nodemask_t *policy_nodes)
 {
 	info->root = RB_ROOT;
 	spin_lock_init(&info->lock);
@@ -1649,7 +1659,7 @@ void mpol_shared_policy_init(struct shared_policy *info, unsigned short policy,
 		struct mempolicy *newpol;
 
 		/* Falls back to MPOL_DEFAULT on any error */
-		newpol = mpol_new(policy, policy_nodes);
+		newpol = mpol_new(policy, flags, policy_nodes);
 		if (!IS_ERR(newpol)) {
 			/* Create pseudo-vma that contains just the policy */
 			struct vm_area_struct pvma;
@@ -1670,9 +1680,10 @@ int mpol_set_shared_policy(struct shared_policy *info,
 	struct sp_node *new = NULL;
 	unsigned long sz = vma_pages(vma);
 
-	pr_debug("set_shared_policy %lx sz %lu %d %lx\n",
+	pr_debug("set_shared_policy %lx sz %lu %d %d %lx\n",
 		 vma->vm_pgoff,
-		 sz, npol? npol->policy : -1,
+		 sz, npol ? npol->policy : -1,
+		 npol ? npol->flags : -1,
 		 npol ? nodes_addr(npol->v.nodes)[0] : -1);
 
 	if (npol) {
@@ -1745,14 +1756,14 @@ void __init numa_policy_init(void)
 	if (unlikely(nodes_empty(interleave_nodes)))
 		node_set(prefer, interleave_nodes);
 
-	if (do_set_mempolicy(MPOL_INTERLEAVE, &interleave_nodes))
+	if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes))
 		printk("numa_policy_init: interleaving failed\n");
 }
 
 /* Reset policy of current process to default */
 void numa_default_policy(void)
 {
-	do_set_mempolicy(MPOL_DEFAULT, NULL);
+	do_set_mempolicy(MPOL_DEFAULT, 0, NULL);
 }
 
 /* Migrate a policy to a different set of nodes */
diff --git a/mm/shmem.c b/mm/shmem.c
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1083,9 +1083,10 @@ redirty:
 #ifdef CONFIG_NUMA
 #ifdef CONFIG_TMPFS
 static int shmem_parse_mpol(char *value, unsigned short *policy,
-			    nodemask_t *policy_nodes)
+			unsigned short *mode_flags, nodemask_t *policy_nodes)
 {
 	char *nodelist = strchr(value, ':');
+	char *flags = strchr(value, '=');
 	int err = 1;
 
 	if (nodelist) {
@@ -1096,6 +1097,8 @@ static int shmem_parse_mpol(char *value, unsigned short *policy,
 		if (!nodes_subset(*policy_nodes, node_states[N_HIGH_MEMORY]))
 			goto out;
 	}
+	if (flags)
+		*flags++ = '\0';
 	if (!strcmp(value, "default")) {
 		*policy = MPOL_DEFAULT;
 		/* Don't allow a nodelist */
@@ -1125,6 +1128,8 @@ static int shmem_parse_mpol(char *value, unsigned short *policy,
 			*policy_nodes = node_states[N_HIGH_MEMORY];
 		err = 0;
 	}
+	if (flags) {
+	}
 out:
 	/* Restore string for error message */
 	if (nodelist)
@@ -1133,7 +1138,7 @@ out:
 }
 
 static void shmem_show_mpol(struct seq_file *seq, unsigned short policy,
-			    const nodemask_t policy_nodes)
+			unsigned short flags, const nodemask_t policy_nodes)
 {
 	char *policy_string;
 
@@ -1202,13 +1207,13 @@ static struct page *shmem_alloc_page(gfp_t gfp,
 #else /* !CONFIG_NUMA */
 #ifdef CONFIG_TMPFS
 static inline int shmem_parse_mpol(char *value, unsigned short *policy,
-						nodemask_t *policy_nodes)
+				unsigned short *flags, nodemask_t *policy_nodes)
 {
 	return 1;
 }
 
 static inline void shmem_show_mpol(struct seq_file *seq, unsigned short policy,
-			    const nodemask_t policy_nodes)
+			unsigned short flags, const nodemask_t policy_nodes)
 {
 }
 #endif /* CONFIG_TMPFS */
@@ -1578,7 +1583,7 @@ shmem_get_inode(struct super_block *sb, int mode, dev_t dev)
 			inode->i_op = &shmem_inode_operations;
 			inode->i_fop = &shmem_file_operations;
 			mpol_shared_policy_init(&info->policy, sbinfo->policy,
-							&sbinfo->policy_nodes);
+					sbinfo->flags, &sbinfo->policy_nodes);
 			break;
 		case S_IFDIR:
 			inc_nlink(inode);
@@ -1592,7 +1597,7 @@ shmem_get_inode(struct super_block *sb, int mode, dev_t dev)
 			 * Must not load anything in the rbtree,
 			 * mpol_free_shared_policy will not be called.
 			 */
-			mpol_shared_policy_init(&info->policy, MPOL_DEFAULT,
+			mpol_shared_policy_init(&info->policy, MPOL_DEFAULT, 0,
 						NULL);
 			break;
 		}
@@ -2209,7 +2214,7 @@ static int shmem_parse_options(char *options, struct shmem_sb_info *sbinfo,
 				goto bad_val;
 		} else if (!strcmp(this_char,"mpol")) {
 			if (shmem_parse_mpol(value, &sbinfo->policy,
-					     &sbinfo->policy_nodes))
+				&sbinfo->flags, &sbinfo->policy_nodes))
 				goto bad_val;
 		} else {
 			printk(KERN_ERR "tmpfs: Bad mount option %s\n",
@@ -2261,6 +2266,7 @@ static int shmem_remount_fs(struct super_block *sb, int *flags, char *data)
 	sbinfo->max_inodes  = config.max_inodes;
 	sbinfo->free_inodes = config.max_inodes - inodes;
 	sbinfo->policy      = config.policy;
+	sbinfo->flags       = config.flags;
 	sbinfo->policy_nodes = config.policy_nodes;
 out:
 	spin_unlock(&sbinfo->stat_lock);
@@ -2282,7 +2288,8 @@ static int shmem_show_options(struct seq_file *seq, struct vfsmount *vfs)
 		seq_printf(seq, ",uid=%u", sbinfo->uid);
 	if (sbinfo->gid != 0)
 		seq_printf(seq, ",gid=%u", sbinfo->gid);
-	shmem_show_mpol(seq, sbinfo->policy, sbinfo->policy_nodes);
+	shmem_show_mpol(seq, sbinfo->policy, sbinfo->flags,
+			sbinfo->policy_nodes);
 	return 0;
 }
 #endif /* CONFIG_TMPFS */
@@ -2313,6 +2320,7 @@ static int shmem_fill_super(struct super_block *sb,
 	sbinfo->uid = current->fsuid;
 	sbinfo->gid = current->fsgid;
 	sbinfo->policy = MPOL_DEFAULT;
+	sbinfo->flags = 0;
 	sbinfo->policy_nodes = node_states[N_HIGH_MEMORY];
 	sb->s_fs_info = sbinfo;
 

^ permalink raw reply	[flat|nested] 22+ messages in thread

* [patch 3/6] mempolicy: add MPOL_F_STATIC_NODES flag
  2008-03-01  0:45 ` [patch 2/6] mempolicy: support optional mode flags David Rientjes
@ 2008-03-01  0:45   ` David Rientjes
  2008-03-01  0:45     ` [patch 4/6] mempolicy: add bitmap_onto() and bitmap_fold() operations David Rientjes
  0 siblings, 1 reply; 22+ messages in thread
From: David Rientjes @ 2008-03-01  0:45 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Paul Jackson, Christoph Lameter, Lee Schermerhorn, Andi Kleen,
	linux-kernel

Add an optional mempolicy mode flag, MPOL_F_STATIC_NODES, that suppresses
the node remap when the policy is rebound.

Adds another member to struct mempolicy, nodemask_t user_nodemask, as
part of a union with cpuset_mems_allowed:

	struct mempolicy {
		...
		union {
			nodemask_t cpuset_mems_allowed;
			nodemask_t user_nodemask;
		} w;
	}

that stores the the nodemask that the user passed when he or she created
the mempolicy via set_mempolicy() or mbind().  When using
MPOL_F_STATIC_NODES, which is passed with any mempolicy mode, the user's
passed nodemask intersected with the VMA or task's allowed nodes is always
used when determining the preferred node, building the MPOL_BIND zonelist,
or creating the interleave nodemask.  This happens whenever the policy is 
rebound, including when a task's cpuset assignment changes or the cpuset's
mems are changed.

This creates an interesting side-effect in that it allows the mempolicy
"intent" to lie dormant and uneffected until it has access to the node(s)
that it desires.  For example, if you currently ask for an interleaved
policy over a set of nodes that you do not have access to, the mempolicy
is not created and the task continues to use the previous policy.  With
this change, however, it is possible to create the same mempolicy; it is
only effected when access to nodes in the nodemask is acquired.

It is also possible to mount tmpfs with the static nodemask behavior when
specifying a node or nodemask.  To do this, simply add "=static"
immediately following the mempolicy mode at mount time:

	mount -o remount mpol=interleave=static:1-3

Also removes mpol_check_policy() and folds its logic into mpol_new() since
it is now obsoleted.  The unused vma_mpol_equal() is also removed.

Cc: Paul Jackson <pj@sgi.com>
Cc: Christoph Lameter <clameter@sgi.com>
Cc: Lee Schermerhorn <Lee.Schermerhorn@hp.com>
Cc: Andi Kleen <ak@suse.de>
Signed-off-by: David Rientjes <rientjes@google.com>
---
 include/linux/mempolicy.h |   11 ++-
 mm/mempolicy.c            |  208 ++++++++++++++++++++++++---------------------
 mm/shmem.c                |    2 +
 3 files changed, 119 insertions(+), 102 deletions(-)

diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h
--- a/include/linux/mempolicy.h
+++ b/include/linux/mempolicy.h
@@ -24,11 +24,13 @@ enum {
 };
 
 /* Flags for set_mempolicy */
+#define MPOL_F_STATIC_NODES	(1 << 15)
+
 /*
  * MPOL_MODE_FLAGS is the union of all possible optional mode flags passed to
  * either set_mempolicy() or mbind().
  */
-#define MPOL_MODE_FLAGS		(0)
+#define MPOL_MODE_FLAGS		(MPOL_F_STATIC_NODES)
 
 /* Flags for get_mempolicy */
 #define MPOL_F_NODE	(1<<0)	/* return next IL mode instead of node mask */
@@ -84,7 +86,10 @@ struct mempolicy {
 		nodemask_t	 nodes;		/* interleave */
 		/* undefined for default */
 	} v;
-	nodemask_t cpuset_mems_allowed;	/* mempolicy relative to these nodes */
+	union {
+		nodemask_t cpuset_mems_allowed;	/* relative to these nodes */
+		nodemask_t user_nodemask;	/* nodemask passed by user */
+	} w;
 };
 
 /*
@@ -123,7 +128,6 @@ static inline int mpol_equal(struct mempolicy *a, struct mempolicy *b)
 		return 1;
 	return __mpol_equal(a, b);
 }
-#define vma_mpol_equal(a,b) mpol_equal(vma_policy(a), vma_policy(b))
 
 /* Could later add inheritance of the process policy here. */
 
@@ -188,7 +192,6 @@ static inline int mpol_equal(struct mempolicy *a, struct mempolicy *b)
 {
 	return 1;
 }
-#define vma_mpol_equal(a,b) 1
 
 #define mpol_set_vma_default(vma) do {} while(0)
 
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -113,58 +113,6 @@ struct mempolicy default_policy = {
 static void mpol_rebind_policy(struct mempolicy *pol,
                                const nodemask_t *newmask);
 
-/* Do sanity checking on a policy */
-static int mpol_check_policy(unsigned short mode, nodemask_t *nodes)
-{
-	int was_empty, is_empty;
-
-	if (!nodes)
-		return 0;
-
-	/*
-	 * "Contextualize" the in-coming nodemast for cpusets:
-	 * Remember whether in-coming nodemask was empty,  If not,
-	 * restrict the nodes to the allowed nodes in the cpuset.
-	 * This is guaranteed to be a subset of nodes with memory.
-	 */
-	cpuset_update_task_memory_state();
-	is_empty = was_empty = nodes_empty(*nodes);
-	if (!was_empty) {
-		nodes_and(*nodes, *nodes, cpuset_current_mems_allowed);
-		is_empty = nodes_empty(*nodes);	/* after "contextualization" */
-	}
-
-	switch (mode) {
-	case MPOL_DEFAULT:
-		/*
-		 * require caller to specify an empty nodemask
-		 * before "contextualization"
-		 */
-		if (!was_empty)
-			return -EINVAL;
-		break;
-	case MPOL_BIND:
-	case MPOL_INTERLEAVE:
-		/*
-		 * require at least 1 valid node after "contextualization"
-		 */
-		if (is_empty)
-			return -EINVAL;
-		break;
-	case MPOL_PREFERRED:
-		/*
-		 * Did caller specify invalid nodes?
-		 * Don't silently accept this as "local allocation".
-		 */
-		if (!was_empty && is_empty)
-			return -EINVAL;
-		break;
-	default:
-		BUG();
-	}
-	return 0;
-}
-
 /* Generate a custom zonelist for the BIND policy. */
 static struct zonelist *bind_zonelist(nodemask_t *nodes)
 {
@@ -202,40 +150,48 @@ static struct zonelist *bind_zonelist(nodemask_t *nodes)
 	return zl;
 }
 
+static inline int mpol_store_user_nodemask(const struct mempolicy *pol)
+{
+	return pol->flags & MPOL_F_STATIC_NODES;
+}
+
 /* Create a new policy */
 static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
 				  nodemask_t *nodes)
 {
 	struct mempolicy *policy;
+	nodemask_t cpuset_context_nmask;
+	void *error = ERR_PTR(-EINVAL);
 
 	pr_debug("setting mode %d flags %d nodes[0] %lx\n",
 		 mode, flags, nodes ? nodes_addr(*nodes)[0] : -1);
 
 	if (mode == MPOL_DEFAULT)
-		return NULL;
+		return (nodes && nodes_weight(*nodes)) ? error : NULL;
 	policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
 	if (!policy)
 		return ERR_PTR(-ENOMEM);
 	atomic_set(&policy->refcnt, 1);
+	cpuset_update_task_memory_state();
+	nodes_and(cpuset_context_nmask, *nodes, cpuset_current_mems_allowed);
 	switch (mode) {
 	case MPOL_INTERLEAVE:
-		policy->v.nodes = *nodes;
-		if (nodes_weight(policy->v.nodes) == 0) {
-			kmem_cache_free(policy_cache, policy);
-			return ERR_PTR(-EINVAL);
-		}
+		if (nodes_empty(*nodes) || nodes_empty(cpuset_context_nmask))
+			goto free;
+		policy->v.nodes = cpuset_context_nmask;
 		break;
 	case MPOL_PREFERRED:
-		policy->v.preferred_node = first_node(*nodes);
+		policy->v.preferred_node = first_node(cpuset_context_nmask);
 		if (policy->v.preferred_node >= MAX_NUMNODES)
-			policy->v.preferred_node = -1;
+			goto free;
 		break;
 	case MPOL_BIND:
-		policy->v.zonelist = bind_zonelist(nodes);
+		if (nodes_empty(*nodes))
+			goto free;
+		policy->v.zonelist = bind_zonelist(&cpuset_context_nmask);
 		if (IS_ERR(policy->v.zonelist)) {
-			void *error_code = policy->v.zonelist;
-			kmem_cache_free(policy_cache, policy);
-			return error_code;
+			error = policy->v.zonelist;
+			goto free;
 		}
 		break;
 	default:
@@ -243,8 +199,15 @@ static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
 	}
 	policy->policy = mode;
 	policy->flags = flags;
-	policy->cpuset_mems_allowed = cpuset_mems_allowed(current);
+	if (mpol_store_user_nodemask(policy))
+		policy->w.user_nodemask = *nodes;
+	else
+		policy->w.cpuset_mems_allowed = cpuset_mems_allowed(current);
 	return policy;
+
+free:
+	kmem_cache_free(policy_cache, policy);
+	return error;
 }
 
 static void gather_stats(struct page *, void *, int pte_dirty);
@@ -490,15 +453,14 @@ static long do_set_mempolicy(unsigned short mode, unsigned short flags,
 {
 	struct mempolicy *new;
 
-	if (mpol_check_policy(mode, nodes))
-		return -EINVAL;
 	new = mpol_new(mode, flags, nodes);
 	if (IS_ERR(new))
 		return PTR_ERR(new);
 	mpol_free(current->mempolicy);
 	current->mempolicy = new;
 	mpol_set_task_struct_flag();
-	if (new && new->policy == MPOL_INTERLEAVE)
+	if (new && new->policy == MPOL_INTERLEAVE &&
+	    nodes_weight(new->v.nodes))
 		current->il_next = first_node(new->v.nodes);
 	return 0;
 }
@@ -818,9 +780,6 @@ static long do_mbind(unsigned long start, unsigned long len,
 	if (end == start)
 		return 0;
 
-	if (mpol_check_policy(mode, nmask))
-		return -EINVAL;
-
 	new = mpol_new(mode, mode_flags, nmask);
 	if (IS_ERR(new))
 		return PTR_ERR(new);
@@ -1211,7 +1170,8 @@ static unsigned interleave_nodes(struct mempolicy *policy)
 	next = next_node(nid, policy->v.nodes);
 	if (next >= MAX_NUMNODES)
 		next = first_node(policy->v.nodes);
-	me->il_next = next;
+	if (next < MAX_NUMNODES)
+		me->il_next = next;
 	return nid;
 }
 
@@ -1249,10 +1209,13 @@ static unsigned offset_il_node(struct mempolicy *pol,
 		struct vm_area_struct *vma, unsigned long off)
 {
 	unsigned nnodes = nodes_weight(pol->v.nodes);
-	unsigned target = (unsigned)off % nnodes;
+	unsigned target;
 	int c;
 	int nid = -1;
 
+	if (!nnodes)
+		return numa_node_id();
+	target = (unsigned int)off % nnodes;
 	c = 0;
 	do {
 		nid = next_node(nid, pol->v.nodes);
@@ -1457,6 +1420,16 @@ struct mempolicy *__mpol_copy(struct mempolicy *old)
 	return new;
 }
 
+static int mpol_match_intent(const struct mempolicy *a,
+			     const struct mempolicy *b)
+{
+	if (a->flags != b->flags)
+		return 0;
+	if (!mpol_store_user_nodemask(a))
+		return 1;
+	return nodes_equal(a->w.user_nodemask, b->w.user_nodemask);
+}
+
 /* Slow path of a mempolicy comparison */
 int __mpol_equal(struct mempolicy *a, struct mempolicy *b)
 {
@@ -1464,6 +1437,8 @@ int __mpol_equal(struct mempolicy *a, struct mempolicy *b)
 		return 0;
 	if (a->policy != b->policy)
 		return 0;
+	if (a->policy != MPOL_DEFAULT && !mpol_match_intent(a, b))
+		return 0;
 	switch (a->policy) {
 	case MPOL_DEFAULT:
 		return 1;
@@ -1770,54 +1745,79 @@ void numa_default_policy(void)
 static void mpol_rebind_policy(struct mempolicy *pol,
 			       const nodemask_t *newmask)
 {
-	nodemask_t *mpolmask;
 	nodemask_t tmp;
+	int static_nodes;
 
 	if (!pol)
 		return;
-	mpolmask = &pol->cpuset_mems_allowed;
-	if (nodes_equal(*mpolmask, *newmask))
+	static_nodes = pol->flags & MPOL_F_STATIC_NODES;
+	if (!mpol_store_user_nodemask(pol) &&
+	    nodes_equal(pol->w.cpuset_mems_allowed, *newmask))
 		return;
 
 	switch (pol->policy) {
 	case MPOL_DEFAULT:
 		break;
 	case MPOL_INTERLEAVE:
-		nodes_remap(tmp, pol->v.nodes, *mpolmask, *newmask);
+		if (static_nodes)
+			nodes_and(tmp, pol->w.user_nodemask, *newmask);
+		else {
+			nodes_remap(tmp, pol->v.nodes,
+				    pol->w.cpuset_mems_allowed, *newmask);
+			pol->w.cpuset_mems_allowed = *newmask;
+		}
 		pol->v.nodes = tmp;
-		*mpolmask = *newmask;
-		current->il_next = node_remap(current->il_next,
-						*mpolmask, *newmask);
+		if (!node_isset(current->il_next, tmp)) {
+			current->il_next = next_node(current->il_next, tmp);
+			if (current->il_next >= MAX_NUMNODES)
+				current->il_next = first_node(tmp);
+			if (current->il_next >= MAX_NUMNODES)
+				current->il_next = numa_node_id();
+		}
 		break;
 	case MPOL_PREFERRED:
-		pol->v.preferred_node = node_remap(pol->v.preferred_node,
-						*mpolmask, *newmask);
-		*mpolmask = *newmask;
+		if (static_nodes) {
+			int node = first_node(pol->w.user_nodemask);
+
+			if (node_isset(node, *newmask))
+				pol->v.preferred_node = node;
+			else
+				pol->v.preferred_node = -1;
+		} else {
+			pol->v.preferred_node = node_remap(pol->v.preferred_node,
+					pol->w.cpuset_mems_allowed, *newmask);
+			pol->w.cpuset_mems_allowed = *newmask;
+		}
 		break;
 	case MPOL_BIND: {
-		nodemask_t nodes;
-		struct zone **z;
 		struct zonelist *zonelist;
 
-		nodes_clear(nodes);
-		for (z = pol->v.zonelist->zones; *z; z++)
-			node_set(zone_to_nid(*z), nodes);
-		nodes_remap(tmp, nodes, *mpolmask, *newmask);
-		nodes = tmp;
-
-		zonelist = bind_zonelist(&nodes);
+		if (static_nodes)
+			nodes_and(tmp, pol->w.user_nodemask, *newmask);
+		else {
+			nodemask_t nodes;
+			struct zone **z;
+
+			nodes_clear(nodes);
+			for (z = pol->v.zonelist->zones; *z; z++)
+				node_set(zone_to_nid(*z), nodes);
+			nodes_remap(tmp, nodes, pol->w.cpuset_mems_allowed,
+				    *newmask);
+			pol->w.cpuset_mems_allowed = *newmask;
+		}
 
 		/* If no mem, then zonelist is NULL and we keep old zonelist.
 		 * If that old zonelist has no remaining mems_allowed nodes,
 		 * then zonelist_policy() will "FALL THROUGH" to MPOL_DEFAULT.
 		 */
-
-		if (!IS_ERR(zonelist)) {
-			/* Good - got mem - substitute new zonelist */
-			kfree(pol->v.zonelist);
-			pol->v.zonelist = zonelist;
+		if (nodes_weight(tmp)) {
+			zonelist = bind_zonelist(&tmp);
+			if (!IS_ERR(zonelist)) {
+				/* Good - got mem - substitute new zonelist */
+				kfree(pol->v.zonelist);
+				pol->v.zonelist = zonelist;
+			}
 		}
-		*mpolmask = *newmask;
 		break;
 	}
 	default:
@@ -1870,6 +1870,7 @@ static inline int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
 	int l;
 	nodemask_t nodes;
 	unsigned short mode = pol ? pol->policy : MPOL_DEFAULT;
+	unsigned short flags = pol ? pol->flags : 0;
 
 	switch (mode) {
 	case MPOL_DEFAULT:
@@ -1901,6 +1902,17 @@ static inline int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
 	strcpy(p, policy_types[mode]);
 	p += l;
 
+	if (flags) {
+		int need_bar = 0;
+
+		if (buffer + maxlen < p + 2)
+			return -ENOSPC;
+		*p++ = '=';
+
+		if (flags & MPOL_F_STATIC_NODES)
+			p += sprintf(p, "%sstatic", need_bar++ ? "|" : "");
+	}
+
 	if (!nodes_empty(nodes)) {
 		if (buffer + maxlen < p + 2)
 			return -ENOSPC;
diff --git a/mm/shmem.c b/mm/shmem.c
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1129,6 +1129,8 @@ static int shmem_parse_mpol(char *value, unsigned short *policy,
 		err = 0;
 	}
 	if (flags) {
+		if (!strcmp(flags, "static"))
+			*mode_flags |= MPOL_F_STATIC_NODES;
 	}
 out:
 	/* Restore string for error message */

^ permalink raw reply	[flat|nested] 22+ messages in thread

* [patch 4/6] mempolicy: add bitmap_onto() and bitmap_fold() operations
  2008-03-01  0:45   ` [patch 3/6] mempolicy: add MPOL_F_STATIC_NODES flag David Rientjes
@ 2008-03-01  0:45     ` David Rientjes
  2008-03-01  0:45       ` [patch 5/6] mempolicy: add MPOL_F_RELATIVE_NODES flag David Rientjes
  0 siblings, 1 reply; 22+ messages in thread
From: David Rientjes @ 2008-03-01  0:45 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Paul Jackson, Christoph Lameter, Lee Schermerhorn, Andi Kleen,
	kosaki.motohiro, mel, ray-lk, linux-kernel

From: Paul Jackson <pj@sgi.com>

The following adds two more bitmap operators, bitmap_onto() and
bitmap_fold(), with the usual cpumask and nodemask wrappers.

The bitmap_onto() operator computes one bitmap relative to
another.  If the n-th bit in the origin mask is set, then the
m-th bit of the destination mask will be set, where m is
the position of the n-th set bit in the relative mask.

The bitmap_fold() operator folds a bitmap into a second that
has bit m set iff the input bitmap has some bit n set, where
m == n mod sz, for the specified sz value.

There are two substantive changes between this patch and its
predecessor bitmap_relative:
 1) Renamed bitmap_relative() to be bitmap_onto().
 2) Added bitmap_fold().

The essential motivation for bitmap_onto() is to provide
a mechanism for converting a cpuset-relative CPU or Node
mask to an absolute mask.  Cpuset relative masks are written
as if the current task were in a cpuset whose CPUs or
Nodes were just the consecutive ones numbered 0..N-1, for
some N.  The bitmap_onto() operator is provided in anticipation
of adding support for the first such cpuset relative mask,
by the mbind() and set_mempolicy() system calls, using a
planned flag of MPOL_F_RELATIVE_NODES.  These bitmap operators
(and their nodemask wrappers, in particular) will be used in
code that converts the user specified cpuset relative memory
policy to a specific system node numbered policy, given the
current mems_allowed of the tasks cpuset.

Such cpuset relative mempolicies will address two deficiencies
of the existing interface between cpusets and mempolicies:
 1) A task cannot at present reliably establish a cpuset
    relative mempolicy because there is an essential race
    condition, in that the tasks cpuset may be changed in
    between the time the task can query its cpuset placement,
    and the time the task can issue the applicable mbind or
    set_memplicy system call.
 2) A task cannot at present establish what cpuset relative
    mempolicy it would like to have, if it is in a smaller
    cpuset than it might have mempolicy preferences for,
    because the existing interface only allows specifying
    mempolicies for nodes currently allowed by the cpuset.

Cpuset relative mempolicies are useful for tasks that don't
distinguish particularly between one CPU or Node and another,
but only between how many of each are allowed, and the proper
placement of threads and memory pages on the various CPUs and
Nodes available.

The motivation for the added bitmap_fold() can be seen in
the following example.

Let's say an application has specified some mempolicies
that presume 16 memory nodes, including say a mempolicy that
specified MPOL_F_RELATIVE_NODES (cpuset relative) nodes 12-15.
Then lets say that application is crammed into a cpuset that only
has 8 memory nodes, 0-7.  If one just uses bitmap_onto(), this
mempolicy, mapped to that cpuset, would ignore the requested
relative nodes above 7, leaving it empty of nodes.  That's not
good; better to fold the higher nodes down, so that some nodes
are included in the resulting mapped mempolicy.  In this case,
the mempolicy nodes 12-15 are taken modulo 8 (the weight of the
mems_allowed of the confining cpuset), resulting in a mempolicy
specifying nodes 4-7.

Signed-off-by: Paul Jackson <pj@sgi.com>
Cc: Lee.Schermerhorn@hp.com
Cc: clameter@sgi.com
Cc: ak@suse.de
Cc: mel@csn.ul.ie
Cc: kosaki.motohiro@jp.fujitsu.com
Cc: ray-lk@madrabbit.org
Signed-off-by: David Rientjes <rientjes@google.com>
---
 include/linux/bitmap.h   |    6 ++
 include/linux/cpumask.h  |   22 ++++++-
 include/linux/nodemask.h |   22 ++++++-
 lib/bitmap.c             |  158 ++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 206 insertions(+), 2 deletions(-)

diff --git a/include/linux/bitmap.h b/include/linux/bitmap.h
--- a/include/linux/bitmap.h
+++ b/include/linux/bitmap.h
@@ -46,6 +46,8 @@
  * bitmap_shift_left(dst, src, n, nbits)	*dst = *src << n
  * bitmap_remap(dst, src, old, new, nbits)	*dst = map(old, new)(src)
  * bitmap_bitremap(oldbit, old, new, nbits)	newbit = map(old, new)(oldbit)
+ * bitmap_onto(dst, orig, relmap, nbits)	*dst = orig relative to relmap
+ * bitmap_fold(dst, orig, sz, nbits)		dst bits = orig bits mod sz
  * bitmap_scnprintf(buf, len, src, nbits)	Print bitmap src to buf
  * bitmap_parse(buf, buflen, dst, nbits)	Parse bitmap dst from kernel buf
  * bitmap_parse_user(ubuf, ulen, dst, nbits)	Parse bitmap dst from user buf
@@ -120,6 +122,10 @@ extern void bitmap_remap(unsigned long *dst, const unsigned long *src,
 		const unsigned long *old, const unsigned long *new, int bits);
 extern int bitmap_bitremap(int oldbit,
 		const unsigned long *old, const unsigned long *new, int bits);
+extern void bitmap_onto(unsigned long *dst, const unsigned long *orig,
+		const unsigned long *relmap, int bits);
+extern void bitmap_fold(unsigned long *dst, const unsigned long *orig,
+		int sz, int bits);
 extern int bitmap_find_free_region(unsigned long *bitmap, int bits, int order);
 extern void bitmap_release_region(unsigned long *bitmap, int pos, int order);
 extern int bitmap_allocate_region(unsigned long *bitmap, int pos, int order);
diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h
--- a/include/linux/cpumask.h
+++ b/include/linux/cpumask.h
@@ -14,6 +14,8 @@
  * bitmap_scnlistprintf() and bitmap_parselist(), also in bitmap.c.
  * For details of cpu_remap(), see bitmap_bitremap in lib/bitmap.c
  * For details of cpus_remap(), see bitmap_remap in lib/bitmap.c.
+ * For details of cpus_onto(), see bitmap_onto in lib/bitmap.c.
+ * For details of cpus_fold(), see bitmap_fold in lib/bitmap.c.
  *
  * The available cpumask operations are:
  *
@@ -53,7 +55,9 @@
  * int cpulist_scnprintf(buf, len, mask) Format cpumask as list for printing
  * int cpulist_parse(buf, map)		Parse ascii string as cpulist
  * int cpu_remap(oldbit, old, new)	newbit = map(old, new)(oldbit)
- * int cpus_remap(dst, src, old, new)	*dst = map(old, new)(src)
+ * void cpus_remap(dst, src, old, new)	*dst = map(old, new)(src)
+ * void cpus_onto(dst, orig, relmap)	*dst = orig relative to relmap
+ * void cpus_fold(dst, orig, sz)	dst bits = orig bits mod sz
  *
  * for_each_cpu_mask(cpu, mask)		for-loop cpu over mask
  *
@@ -311,6 +315,22 @@ static inline void __cpus_remap(cpumask_t *dstp, const cpumask_t *srcp,
 	bitmap_remap(dstp->bits, srcp->bits, oldp->bits, newp->bits, nbits);
 }
 
+#define cpus_onto(dst, orig, relmap) \
+		__cpus_onto(&(dst), &(orig), &(relmap), NR_CPUS)
+static inline void __cpus_onto(cpumask_t *dstp, const cpumask_t *origp,
+		const cpumask_t *relmapp, int nbits)
+{
+	bitmap_onto(dstp->bits, origp->bits, relmapp->bits, nbits);
+}
+
+#define cpus_fold(dst, orig, sz) \
+		__cpus_fold(&(dst), &(orig), sz, NR_CPUS)
+static inline void __cpus_fold(cpumask_t *dstp, const cpumask_t *origp,
+		int sz, int nbits)
+{
+	bitmap_fold(dstp->bits, origp->bits, sz, nbits);
+}
+
 #if NR_CPUS > 1
 #define for_each_cpu_mask(cpu, mask)		\
 	for ((cpu) = first_cpu(mask);		\
diff --git a/include/linux/nodemask.h b/include/linux/nodemask.h
--- a/include/linux/nodemask.h
+++ b/include/linux/nodemask.h
@@ -14,6 +14,8 @@
  * bitmap_scnlistprintf() and bitmap_parselist(), also in bitmap.c.
  * For details of node_remap(), see bitmap_bitremap in lib/bitmap.c.
  * For details of nodes_remap(), see bitmap_remap in lib/bitmap.c.
+ * For details of nodes_onto(), see bitmap_onto in lib/bitmap.c.
+ * For details of nodes_fold(), see bitmap_fold in lib/bitmap.c.
  *
  * The available nodemask operations are:
  *
@@ -55,7 +57,9 @@
  * int nodelist_scnprintf(buf, len, mask) Format nodemask as list for printing
  * int nodelist_parse(buf, map)		Parse ascii string as nodelist
  * int node_remap(oldbit, old, new)	newbit = map(old, new)(oldbit)
- * int nodes_remap(dst, src, old, new)	*dst = map(old, new)(dst)
+ * void nodes_remap(dst, src, old, new)	*dst = map(old, new)(src)
+ * void nodes_onto(dst, orig, relmap)	*dst = orig relative to relmap
+ * void nodes_fold(dst, orig, sz)	dst bits = orig bits mod sz
  *
  * for_each_node_mask(node, mask)	for-loop node over mask
  *
@@ -326,6 +330,22 @@ static inline void __nodes_remap(nodemask_t *dstp, const nodemask_t *srcp,
 	bitmap_remap(dstp->bits, srcp->bits, oldp->bits, newp->bits, nbits);
 }
 
+#define nodes_onto(dst, orig, relmap) \
+		__nodes_onto(&(dst), &(orig), &(relmap), MAX_NUMNODES)
+static inline void __nodes_onto(nodemask_t *dstp, const nodemask_t *origp,
+		const nodemask_t *relmapp, int nbits)
+{
+	bitmap_onto(dstp->bits, origp->bits, relmapp->bits, nbits);
+}
+
+#define nodes_fold(dst, orig, sz) \
+		__nodes_fold(&(dst), &(orig), sz, MAX_NUMNODES)
+static inline void __nodes_fold(nodemask_t *dstp, const nodemask_t *origp,
+		int sz, int nbits)
+{
+	bitmap_fold(dstp->bits, origp->bits, sz, nbits);
+}
+
 #if MAX_NUMNODES > 1
 #define for_each_node_mask(node, mask)			\
 	for ((node) = first_node(mask);			\
diff --git a/lib/bitmap.c b/lib/bitmap.c
--- a/lib/bitmap.c
+++ b/lib/bitmap.c
@@ -698,6 +698,164 @@ int bitmap_bitremap(int oldbit, const unsigned long *old,
 }
 EXPORT_SYMBOL(bitmap_bitremap);
 
+/**
+ * bitmap_onto - translate one bitmap relative to another
+ *	@dst: resulting translated bitmap
+ * 	@orig: original untranslated bitmap
+ * 	@relmap: bitmap relative to which translated
+ *	@bits: number of bits in each of these bitmaps
+ *
+ * Set the n-th bit of @dst iff there exists some m such that the
+ * n-th bit of @relmap is set, the m-th bit of @orig is set, and
+ * the n-th bit of @relmap is also the m-th _set_ bit of @relmap.
+ * (If you understood the previous sentence the first time your
+ * read it, you're overqualified for your current job.)
+ *
+ * In other words, @orig is mapped onto (surjectively) @dst,
+ * using the the map { <n, m> | the n-th bit of @relmap is the
+ * m-th set bit of @relmap }.
+ *
+ * Any set bits in @orig above bit number W, where W is the
+ * weight of (number of set bits in) @relmap are mapped nowhere.
+ * In particular, if for all bits m set in @orig, m >= W, then
+ * @dst will end up empty.  In situations where the possibility
+ * of such an empty result is not desired, one way to avoid it is
+ * to use the bitmap_fold() operator, below, to first fold the
+ * @orig bitmap over itself so that all its set bits x are in the
+ * range 0 <= x < W.  The bitmap_fold() operator does this by
+ * setting the bit (m % W) in @dst, for each bit (m) set in @orig.
+ *
+ * Example [1] for bitmap_onto():
+ *  Let's say @relmap has bits 30-39 set, and @orig has bits
+ *  1, 3, 5, 7, 9 and 11 set.  Then on return from this routine,
+ *  @dst will have bits 31, 33, 35, 37 and 39 set.
+ *
+ *  When bit 0 is set in @orig, it means turn on the bit in
+ *  @dst corresponding to whatever is the first bit (if any)
+ *  that is turned on in @relmap.  Since bit 0 was off in the
+ *  above example, we leave off that bit (bit 30) in @dst.
+ *
+ *  When bit 1 is set in @orig (as in the above example), it
+ *  means turn on the bit in @dst corresponding to whatever
+ *  is the second bit that is turned on in @relmap.  The second
+ *  bit in @relmap that was turned on in the above example was
+ *  bit 31, so we turned on bit 31 in @dst.
+ *
+ *  Similarly, we turned on bits 33, 35, 37 and 39 in @dst,
+ *  because they were the 4th, 6th, 8th and 10th set bits
+ *  set in @relmap, and the 4th, 6th, 8th and 10th bits of
+ *  @orig (i.e. bits 3, 5, 7 and 9) were also set.
+ *
+ *  When bit 11 is set in @orig, it means turn on the bit in
+ *  @dst corresponding to whatever is the twelth bit that is
+ *  turned on in @relmap.  In the above example, there were
+ *  only ten bits turned on in @relmap (30..39), so that bit
+ *  11 was set in @orig had no affect on @dst.
+ *
+ * Example [2] for bitmap_fold() + bitmap_onto():
+ *  Let's say @relmap has these ten bits set:
+ *		40 41 42 43 45 48 53 61 74 95
+ *  (for the curious, that's 40 plus the first ten terms of the
+ *  Fibonacci sequence.)
+ *
+ *  Further lets say we use the following code, invoking
+ *  bitmap_fold() then bitmap_onto, as suggested above to
+ *  avoid the possitility of an empty @dst result:
+ *
+ *	unsigned long *tmp;	// a temporary bitmap's bits
+ *
+ *	bitmap_fold(tmp, orig, bitmap_weight(relmap, bits), bits);
+ *	bitmap_onto(dst, tmp, relmap, bits);
+ *
+ *  Then this table shows what various values of @dst would be, for
+ *  various @orig's.  I list the zero-based positions of each set bit.
+ *  The tmp column shows the intermediate result, as computed by
+ *  using bitmap_fold() to fold the @orig bitmap modulo ten
+ *  (the weight of @relmap).
+ *
+ *      @orig           tmp            @dst
+ *      0                0             40
+ *      1                1             41
+ *      9                9             95
+ *      10               0             40 (*)
+ *      1 3 5 7          1 3 5 7       41 43 48 61
+ *      0 1 2 3 4        0 1 2 3 4     40 41 42 43 45
+ *      0 9 18 27        0 9 8 7       40 61 74 95
+ *      0 10 20 30       0             40
+ *      0 11 22 33       0 1 2 3       40 41 42 43
+ *      0 12 24 36       0 2 4 6       40 42 45 53
+ *      78 102 211       1 2 8         41 42 74 (*)
+ *
+ * (*) For these marked lines, if we hadn't first done bitmap_fold()
+ *     into tmp, then the @dst result would have been empty.
+ *
+ * If either of @orig or @relmap is empty (no set bits), then @dst
+ * will be returned empty.
+ *
+ * If (as explained above) the only set bits in @orig are in positions
+ * m where m >= W, (where W is the weight of @relmap) then @dst will
+ * once again be returned empty.
+ *
+ * All bits in @dst not set by the above rule are cleared.
+ */
+void bitmap_onto(unsigned long *dst, const unsigned long *orig,
+			const unsigned long *relmap, int bits)
+{
+	int n, m;       	/* same meaning as in above comment */
+
+	if (dst == orig)	/* following doesn't handle inplace mappings */
+		return;
+	bitmap_zero(dst, bits);
+
+	/*
+	 * The following code is a more efficient, but less
+	 * obvious, equivalent to the loop:
+	 *	for (m = 0; m < bitmap_weight(relmap, bits); m++) {
+	 *		n = bitmap_ord_to_pos(orig, m, bits);
+	 *		if (test_bit(m, orig))
+	 *			set_bit(n, dst);
+	 *	}
+	 */
+
+	m = 0;
+	for (n = find_first_bit(relmap, bits);
+	     n < bits;
+	     n = find_next_bit(relmap, bits, n + 1)) {
+		/* m == bitmap_pos_to_ord(relmap, n, bits) */
+		if (test_bit(m, orig))
+			set_bit(n, dst);
+		m++;
+	}
+}
+EXPORT_SYMBOL(bitmap_onto);
+
+/**
+ * bitmap_fold - fold larger bitmap into smaller, modulo specified size
+ *	@dst: resulting smaller bitmap
+ *	@orig: original larger bitmap
+ *	@sz: specified size
+ *	@bits: number of bits in each of these bitmaps
+ *
+ * For each bit oldbit in @orig, set bit oldbit mod @sz in @dst.
+ * Clear all other bits in @dst.  See further the comment and
+ * Example [2] for bitmap_onto() for why and how to use this.
+ */
+void bitmap_fold(unsigned long *dst, const unsigned long *orig,
+			int sz, int bits)
+{
+	int oldbit;
+
+	if (dst == orig)	/* following doesn't handle inplace mappings */
+		return;
+	bitmap_zero(dst, bits);
+
+	for (oldbit = find_first_bit(orig, bits);
+	     oldbit < bits;
+	     oldbit = find_next_bit(orig, bits, oldbit + 1))
+		set_bit(oldbit % sz, dst);
+}
+EXPORT_SYMBOL(bitmap_fold);
+
 /*
  * Common code for bitmap_*_region() routines.
  *	bitmap: array of unsigned longs corresponding to the bitmap

^ permalink raw reply	[flat|nested] 22+ messages in thread

* [patch 5/6] mempolicy: add MPOL_F_RELATIVE_NODES flag
  2008-03-01  0:45     ` [patch 4/6] mempolicy: add bitmap_onto() and bitmap_fold() operations David Rientjes
@ 2008-03-01  0:45       ` David Rientjes
  2008-03-01  0:45         ` [patch 6/6] mempolicy: update NUMA memory policy documentation David Rientjes
  0 siblings, 1 reply; 22+ messages in thread
From: David Rientjes @ 2008-03-01  0:45 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Paul Jackson, Christoph Lameter, Lee Schermerhorn, Andi Kleen,
	linux-kernel

Adds another optional mode flag, MPOL_F_RELATIVE_NODES, that specifies
nodemasks passed via set_mempolicy() or mbind() should be considered
relative to the current task's mems_allowed.

When the mempolicy is created, the passed nodemask is folded and mapped
onto the current task's mems_allowed.  For example, consider a task
using set_mempolicy() to pass MPOL_INTERLEAVE | MPOL_F_RELATIVE_NODES
with a nodemask of 1-3.  If current's mems_allowed is 4-7, the effected
nodemask is 5-7 (the second, third, and fourth node of mems_allowed).

If the same task is attached to a cpuset, the mempolicy nodemask is
rebound each time the mems are changed.  Some possible rebinds and
results are:

	mems			result
	1-3			1-3
	1-7			2-4
	1,5-6			1,5-6
	1,5-7			5-7

Likewise, the zonelist built for MPOL_BIND acts on the set of zones
assigned to the resultant nodemask from the relative remap.

In the MPOL_PREFERRED case, the preferred node is remapped from the
currently effected nodemask to the relative nodemask.

This mempolicy mode flag was conceived of by Paul Jackson <pj@sgi.com>.

Cc: Paul Jackson <pj@sgi.com>
Cc: Christoph Lameter <clameter@sgi.com>
Cc: Lee Schermerhorn <Lee.Schermerhorn@hp.com>
Cc: Andi Kleen <ak@suse.de>
Signed-off-by: David Rientjes <rientjes@google.com>
---
 include/linux/mempolicy.h |    3 ++-
 mm/mempolicy.c            |   36 ++++++++++++++++++++++++++++++++++--
 mm/shmem.c                |    6 ++++++
 3 files changed, 42 insertions(+), 3 deletions(-)

diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h
--- a/include/linux/mempolicy.h
+++ b/include/linux/mempolicy.h
@@ -25,12 +25,13 @@ enum {
 
 /* Flags for set_mempolicy */
 #define MPOL_F_STATIC_NODES	(1 << 15)
+#define MPOL_F_RELATIVE_NODES	(1 << 14)
 
 /*
  * MPOL_MODE_FLAGS is the union of all possible optional mode flags passed to
  * either set_mempolicy() or mbind().
  */
-#define MPOL_MODE_FLAGS		(MPOL_F_STATIC_NODES)
+#define MPOL_MODE_FLAGS		(MPOL_F_STATIC_NODES | MPOL_F_RELATIVE_NODES)
 
 /* Flags for get_mempolicy */
 #define MPOL_F_NODE	(1<<0)	/* return next IL mode instead of node mask */
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -152,7 +152,15 @@ static struct zonelist *bind_zonelist(nodemask_t *nodes)
 
 static inline int mpol_store_user_nodemask(const struct mempolicy *pol)
 {
-	return pol->flags & MPOL_F_STATIC_NODES;
+	return pol->flags & (MPOL_F_STATIC_NODES | MPOL_F_RELATIVE_NODES);
+}
+
+static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig,
+				   const nodemask_t *rel)
+{
+	nodemask_t tmp;
+	nodes_fold(tmp, *orig, nodes_weight(*rel));
+	nodes_onto(*ret, tmp, *rel);
 }
 
 /* Create a new policy */
@@ -173,7 +181,12 @@ static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
 		return ERR_PTR(-ENOMEM);
 	atomic_set(&policy->refcnt, 1);
 	cpuset_update_task_memory_state();
-	nodes_and(cpuset_context_nmask, *nodes, cpuset_current_mems_allowed);
+	if (flags & MPOL_F_RELATIVE_NODES)
+		mpol_relative_nodemask(&cpuset_context_nmask, nodes,
+				       &cpuset_current_mems_allowed);
+	else
+		nodes_and(cpuset_context_nmask, *nodes,
+			  cpuset_current_mems_allowed);
 	switch (mode) {
 	case MPOL_INTERLEAVE:
 		if (nodes_empty(*nodes) || nodes_empty(cpuset_context_nmask))
@@ -898,6 +911,9 @@ asmlinkage long sys_mbind(unsigned long start, unsigned long len,
 	mode &= ~MPOL_MODE_FLAGS;
 	if (mode >= MPOL_MAX)
 		return -EINVAL;
+	if ((mode_flags & MPOL_F_STATIC_NODES) &&
+	    (mode_flags & MPOL_F_RELATIVE_NODES))
+		return -EINVAL;
 	err = get_nodes(&nodes, nmask, maxnode);
 	if (err)
 		return err;
@@ -916,6 +932,8 @@ asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask,
 	mode &= ~MPOL_MODE_FLAGS;
 	if ((unsigned int)mode >= MPOL_MAX)
 		return -EINVAL;
+	if ((flags & MPOL_F_STATIC_NODES) && (flags & MPOL_F_RELATIVE_NODES))
+		return -EINVAL;
 	err = get_nodes(&nodes, nmask, maxnode);
 	if (err)
 		return err;
@@ -1747,10 +1765,12 @@ static void mpol_rebind_policy(struct mempolicy *pol,
 {
 	nodemask_t tmp;
 	int static_nodes;
+	int relative_nodes;
 
 	if (!pol)
 		return;
 	static_nodes = pol->flags & MPOL_F_STATIC_NODES;
+	relative_nodes = pol->flags & MPOL_F_RELATIVE_NODES;
 	if (!mpol_store_user_nodemask(pol) &&
 	    nodes_equal(pol->w.cpuset_mems_allowed, *newmask))
 		return;
@@ -1761,6 +1781,9 @@ static void mpol_rebind_policy(struct mempolicy *pol,
 	case MPOL_INTERLEAVE:
 		if (static_nodes)
 			nodes_and(tmp, pol->w.user_nodemask, *newmask);
+		else if (relative_nodes)
+			mpol_relative_nodemask(&tmp, &pol->w.user_nodemask,
+					       newmask);
 		else {
 			nodes_remap(tmp, pol->v.nodes,
 				    pol->w.cpuset_mems_allowed, *newmask);
@@ -1783,6 +1806,10 @@ static void mpol_rebind_policy(struct mempolicy *pol,
 				pol->v.preferred_node = node;
 			else
 				pol->v.preferred_node = -1;
+		} else if (relative_nodes) {
+			mpol_relative_nodemask(&tmp, &pol->w.user_nodemask,
+					       newmask);
+			pol->v.preferred_node = first_node(tmp);
 		} else {
 			pol->v.preferred_node = node_remap(pol->v.preferred_node,
 					pol->w.cpuset_mems_allowed, *newmask);
@@ -1794,6 +1821,9 @@ static void mpol_rebind_policy(struct mempolicy *pol,
 
 		if (static_nodes)
 			nodes_and(tmp, pol->w.user_nodemask, *newmask);
+		else if (relative_nodes)
+			mpol_relative_nodemask(&tmp, &pol->w.user_nodemask,
+					       newmask);
 		else {
 			nodemask_t nodes;
 			struct zone **z;
@@ -1911,6 +1941,8 @@ static inline int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
 
 		if (flags & MPOL_F_STATIC_NODES)
 			p += sprintf(p, "%sstatic", need_bar++ ? "|" : "");
+		if (flags & MPOL_F_RELATIVE_NODES)
+			p += sprintf(p, "%srelative", need_bar++ ? "|" : "");
 	}
 
 	if (!nodes_empty(nodes)) {
diff --git a/mm/shmem.c b/mm/shmem.c
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1131,6 +1131,12 @@ static int shmem_parse_mpol(char *value, unsigned short *policy,
 	if (flags) {
 		if (!strcmp(flags, "static"))
 			*mode_flags |= MPOL_F_STATIC_NODES;
+		if (!strcmp(flags, "relative"))
+			*mode_flags |= MPOL_F_RELATIVE_NODES;
+
+		if ((*mode_flags & MPOL_F_STATIC_NODES) &&
+		    (*mode_flags & MPOL_F_RELATIVE_NODES))
+			err = 1;
 	}
 out:
 	/* Restore string for error message */

^ permalink raw reply	[flat|nested] 22+ messages in thread

* [patch 6/6] mempolicy: update NUMA memory policy documentation
  2008-03-01  0:45       ` [patch 5/6] mempolicy: add MPOL_F_RELATIVE_NODES flag David Rientjes
@ 2008-03-01  0:45         ` David Rientjes
  2008-03-01  0:55           ` Randy Dunlap
  0 siblings, 1 reply; 22+ messages in thread
From: David Rientjes @ 2008-03-01  0:45 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Paul Jackson, Christoph Lameter, Lee Schermerhorn, Andi Kleen,
	Randy Dunlap, linux-kernel

Updates Documentation/vm/numa_memory_policy.txt and
Documentation/filesystems/tmpfs.txt to describe optional mempolicy mode
flags.

Cc: Christoph Lameter <clameter@sgi.com>
Cc: Lee Schermerhorn <Lee.Schermerhorn@hp.com>
Cc: Andi Kleen <ak@suse.de>
Cc: Randy Dunlap <randy.dunlap@oracle.com>
Signed-off-by: David Rientjes <rientjes@google.com>
Signed-off-by: Paul Jackson <pj@sgi.com>
---
 Documentation/filesystems/tmpfs.txt     |   12 +++
 Documentation/vm/numa_memory_policy.txt |  131 +++++++++++++++++++++++-------
 2 files changed, 112 insertions(+), 31 deletions(-)

diff --git a/Documentation/filesystems/tmpfs.txt b/Documentation/filesystems/tmpfs.txt
--- a/Documentation/filesystems/tmpfs.txt
+++ b/Documentation/filesystems/tmpfs.txt
@@ -92,6 +92,18 @@ NodeList format is a comma-separated list of decimal numbers and ranges,
 a range being two hyphen-separated decimal numbers, the smallest and
 largest node numbers in the range.  For example, mpol=bind:0-3,5,7,9-15
 
+NUMA memory allocation policies have optional flags that can be used in
+conjunction with their modes.  These optional flags can be specified
+when tmpfs is mounted by appending them to the mode before the NodeList.
+See Documentation/vm/numa_memory_policy.txt for a list of all available
+memory allocation policy mode flags.
+
+	=static		is equivalent to	MPOL_F_STATIC_NODES
+	=relative	is equivalent to	MPOL_F_RELATIVE_NODES
+
+For example, mpol=bind=static:NodeList, is the equivalent of an
+allocation policy of MPOL_BIND | MPOL_F_STATIC_NODES.
+
 Note that trying to mount a tmpfs with an mpol option will fail if the
 running kernel does not support NUMA; and will fail if its nodelist
 specifies a node which is not online.  If your system relies on that
diff --git a/Documentation/vm/numa_memory_policy.txt b/Documentation/vm/numa_memory_policy.txt
--- a/Documentation/vm/numa_memory_policy.txt
+++ b/Documentation/vm/numa_memory_policy.txt
@@ -135,9 +135,11 @@ most general to most specific:
 
 Components of Memory Policies
 
-    A Linux memory policy is a tuple consisting of a "mode" and an optional set
-    of nodes.  The mode determine the behavior of the policy, while the
-    optional set of nodes can be viewed as the arguments to the behavior.
+    A Linux memory policy consists of a "mode", optional mode flags, and an
+    optional set of nodes.  The mode determines the behavior of the policy,
+    the optional mode flags determine the behavior of the mode, and the
+    optional set of nodes can be viewed as the arguments to the policy
+    behavior.
 
    Internally, memory policies are implemented by a reference counted
    structure, struct mempolicy.  Details of this structure will be discussed
@@ -179,7 +181,8 @@ Components of Memory Policies
 	    on a non-shared region of the address space.  However, see
 	    MPOL_PREFERRED below.
 
-	    The Default mode does not use the optional set of nodes.
+	    It is an error for the set of nodes specified for this policy to
+	    be non-empty.
 
 	MPOL_BIND:  This mode specifies that memory must come from the
 	set of nodes specified by the policy.
@@ -231,6 +234,80 @@ Components of Memory Policies
 	    the temporary interleaved system default policy works in this
 	    mode.
 
+   Linux memory policy supports the following optional mode flag:
+
+	MPOL_F_STATIC_NODES:  This flag specifies that the nodemask passed by
+	the user should not be remapped if the task or VMA's set of allowed
+	nodes changes after the memory policy has been defined.
+
+	    Without this flag, anytime a mempolicy is rebound because of a
+	    change in the set of allowed nodes, the node (Preferred) or
+	    nodemask (Bind, Interleave) is remapped to the new set of
+	    allowed nodes.  This may result in nodes being used that were
+	    previously undesired.
+
+	    With this flag, if the user specified nodes overlap with the
+	    nodes allowed by the task's cpuset, then the memory policy is
+	    applied to their intersection.  If the two sets of nodes do not
+	    overlap, the Default policy is used.
+
+	    For example, consider a task that is attached to a cpuset with
+	    mems 1-3 that sets an Interleave policy over the same set.  If
+	    the cpuset's mems change to 3-5, the Interleave will now occur
+	    over nodes 3, 4, and 5.  With this flag, however, since only node
+	    3 is allowed from the user's nodemask, the "interleave" only
+	    occurs over that node.  If no nodes from the user's nodemask are
+	    now allowed, the Default behavior is used.
+
+	    MPOL_F_STATIC_NODES cannot be used with MPOL_F_RELATIVE_NODES.
+
+	MPOL_F_RELATIVE_NODES:  This flag specifies that the nodemask passed
+	by the user will be mapped relative to the set of the task or VMA's
+	set of allowed nodes.  The kernel stores the user-passed nodemask,
+	and if the allowed nodes changes, then that original nodemask will
+	be remapped relative to the new set of allowed nodes.
+
+	    Without this flag (and without MPOL_F_STATIC_NODES), anytime a
+	    mempolicy is rebound because of a change in the set of allowed
+	    nodes, the node (Preferred) or nodemask (Bind, Interleave) is
+	    remapped to the new set of allowed nodes.  That remap may not
+	    preserve the relative nature of the user's passed nodemask to its
+	    set of allowed nodes upon successive rebinds: a nodemask of
+	    1,3,5 may be remapped to 7-9 and then to 1-3 if the set of
+	    allowed nodes is restored to its original state.
+
+	    With this flag, the remap is done so that the node numbers from
+	    the user's passed nodemask are relative to the set of allowed
+	    nodes.  In other words, if nodes 0, 2, and 4 are set in the user's
+	    nodemask, the policy will be effected over the first (and in the
+	    Bind or Interleave case, the third and fifth) nodes in the set of
+	    allowed nodes.  The nodemask passed by the user represents nodes
+	    relative to task or VMA's set of allowed nodes.
+
+	    If the user's nodemask includes nodes that are outside the range
+	    of the new set of allowed nodes (for example, node 5 is set in
+	    the user's nodemask when the set of allowed nodes is only 0-3),
+	    then the remap wraps around to the beginning of the nodemask and,
+	    if not already set, sets the node in the mempolicy nodemask.
+
+	    For example, consider a task that is attached to a cpuset with
+	    mems 2-5 that sets an Interleave policy over the same set with
+	    MPOL_F_RELATIVE_NODES.  If the cpuset's mems change to 3-7, the
+	    interleave now occurs over nodes 3,5-6.  If the cpuset's mems
+	    then change to 0,2-3,5, then the interleave occurs over nodes
+	    0,3,5.
+
+	    Thanks to the consistent remapping, applications preparing
+	    nodemasks to specify memory policies using this flag should
+	    disregard their current, actual cpuset imposed memory placement
+	    and prepare the nodemask as if they were always located on
+	    memory nodes 0 to N-1, where N is the number of memory nodes the
+	    policy is intended to manage.  Let the kernel then remap to the
+	    set of memory nodes allowed by the task's cpuset, as that may
+	    change over time.
+
+	    MPOL_F_RELATIVE_NODES cannot be used with MPOL_F_STATIC_NODES.
+
 MEMORY POLICY APIs
 
 Linux supports 3 system calls for controlling memory policy.  These APIS
@@ -251,7 +328,9 @@ Set [Task] Memory Policy:
 	Set's the calling task's "task/process memory policy" to mode
 	specified by the 'mode' argument and the set of nodes defined
 	by 'nmask'.  'nmask' points to a bit mask of node ids containing
-	at least 'maxnode' ids.
+	at least 'maxnode' ids.  Optional mode flags may be passed by
+	combining the 'mode' argument with the flag (for example:
+	MPOL_INTERLEAVE | MPOL_F_STATIC_NODES).
 
 	See the set_mempolicy(2) man page for more details
 
@@ -303,29 +382,19 @@ MEMORY POLICIES AND CPUSETS
 Memory policies work within cpusets as described above.  For memory policies
 that require a node or set of nodes, the nodes are restricted to the set of
 nodes whose memories are allowed by the cpuset constraints.  If the nodemask
-specified for the policy contains nodes that are not allowed by the cpuset, or
-the intersection of the set of nodes specified for the policy and the set of
-nodes with memory is the empty set, the policy is considered invalid
-and cannot be installed.
-
-The interaction of memory policies and cpusets can be problematic for a
-couple of reasons:
-
-1) the memory policy APIs take physical node id's as arguments.  As mentioned
-   above, it is illegal to specify nodes that are not allowed in the cpuset.
-   The application must query the allowed nodes using the get_mempolicy()
-   API with the MPOL_F_MEMS_ALLOWED flag to determine the allowed nodes and
-   restrict itself to those nodes.  However, the resources available to a
-   cpuset can be changed by the system administrator, or a workload manager
-   application, at any time.  So, a task may still get errors attempting to
-   specify policy nodes, and must query the allowed memories again.
-
-2) when tasks in two cpusets share access to a memory region, such as shared
-   memory segments created by shmget() of mmap() with the MAP_ANONYMOUS and
-   MAP_SHARED flags, and any of the tasks install shared policy on the region,
-   only nodes whose memories are allowed in both cpusets may be used in the
-   policies.  Obtaining this information requires "stepping outside" the
-   memory policy APIs to use the cpuset information and requires that one
-   know in what cpusets other task might be attaching to the shared region.
-   Furthermore, if the cpusets' allowed memory sets are disjoint, "local"
-   allocation is the only valid policy.
+specified for the policy contains nodes that are not allowed by the cpuset and
+MPOL_F_RELATIVE_NODES is not used, the intersection of the set of nodes
+specified for the policy and the set of nodes with memory is used.  If the
+result is the empty set, the policy is considered invalid and cannot be
+installed.  If MPOL_F_RELATIVE_NODES is used, the policy's nodes are mapped
+onto and folded into the task's set of allowed nodes as previously described.
+
+The interaction of memory policies and cpusets can be problematic when tasks
+in two cpusets share access to a memory region, such as shared memory segments
+created by shmget() of mmap() with the MAP_ANONYMOUS and MAP_SHARED flags, and
+any of the tasks install shared policy on the region, only nodes whose
+memories are allowed in both cpusets may be used in the policies.  Obtaining
+this information requires "stepping outside" the memory policy APIs to use the
+cpuset information and requires that one know in what cpusets other task might
+be attaching to the shared region.  Furthermore, if the cpusets' allowed
+memory sets are disjoint, "local" allocation is the only valid policy.

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [patch 6/6] mempolicy: update NUMA memory policy documentation
  2008-03-01  0:45         ` [patch 6/6] mempolicy: update NUMA memory policy documentation David Rientjes
@ 2008-03-01  0:55           ` Randy Dunlap
  2008-03-01  1:14             ` David Rientjes
  0 siblings, 1 reply; 22+ messages in thread
From: Randy Dunlap @ 2008-03-01  0:55 UTC (permalink / raw)
  To: David Rientjes
  Cc: Andrew Morton, Paul Jackson, Christoph Lameter, Lee Schermerhorn,
	Andi Kleen, linux-kernel

David Rientjes wrote:
> Updates Documentation/vm/numa_memory_policy.txt and
> Documentation/filesystems/tmpfs.txt to describe optional mempolicy mode
> flags.
> 
> Cc: Christoph Lameter <clameter@sgi.com>
> Cc: Lee Schermerhorn <Lee.Schermerhorn@hp.com>
> Cc: Andi Kleen <ak@suse.de>
> Cc: Randy Dunlap <randy.dunlap@oracle.com>
> Signed-off-by: David Rientjes <rientjes@google.com>
> Signed-off-by: Paul Jackson <pj@sgi.com>
> ---
>  Documentation/filesystems/tmpfs.txt     |   12 +++
>  Documentation/vm/numa_memory_policy.txt |  131 +++++++++++++++++++++++-------
>  2 files changed, 112 insertions(+), 31 deletions(-)
> 
> diff --git a/Documentation/vm/numa_memory_policy.txt b/Documentation/vm/numa_memory_policy.txt
> --- a/Documentation/vm/numa_memory_policy.txt
> +++ b/Documentation/vm/numa_memory_policy.txt

> @@ -231,6 +234,80 @@ Components of Memory Policies
>  	    the temporary interleaved system default policy works in this
>  	    mode.
>  
> +   Linux memory policy supports the following optional mode flag:

                                                               flags:

> +
> +	MPOL_F_STATIC_NODES:  This flag specifies that the nodemask passed by
> +	the user should not be remapped if the task or VMA's set of allowed
> +	nodes changes after the memory policy has been defined.
> +
> +	    Without this flag, anytime a mempolicy is rebound because of a
> +	    change in the set of allowed nodes, the node (Preferred) or
> +	    nodemask (Bind, Interleave) is remapped to the new set of
> +	    allowed nodes.  This may result in nodes being used that were
> +	    previously undesired.
> +
> +	    With this flag, if the user specified nodes overlap with the

                                   user-specified

> +	    nodes allowed by the task's cpuset, then the memory policy is
> +	    applied to their intersection.  If the two sets of nodes do not
> +	    overlap, the Default policy is used.
> +
> +	    For example, consider a task that is attached to a cpuset with
> +	    mems 1-3 that sets an Interleave policy over the same set.  If
> +	    the cpuset's mems change to 3-5, the Interleave will now occur
> +	    over nodes 3, 4, and 5.  With this flag, however, since only node
> +	    3 is allowed from the user's nodemask, the "interleave" only
> +	    occurs over that node.  If no nodes from the user's nodemask are
> +	    now allowed, the Default behavior is used.
> +
> +	    MPOL_F_STATIC_NODES cannot be used with MPOL_F_RELATIVE_NODES.
> +


-- 
~Randy

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [patch 6/6] mempolicy: update NUMA memory policy documentation
  2008-03-01  0:55           ` Randy Dunlap
@ 2008-03-01  1:14             ` David Rientjes
  0 siblings, 0 replies; 22+ messages in thread
From: David Rientjes @ 2008-03-01  1:14 UTC (permalink / raw)
  To: Andrew Morton; +Cc: Randy Dunlap, Paul Jackson, linux-kernel

On Fri, 29 Feb 2008, Randy Dunlap wrote:

> David Rientjes wrote:
> > Updates Documentation/vm/numa_memory_policy.txt and
> > Documentation/filesystems/tmpfs.txt to describe optional mempolicy mode
> > flags.
> > 
> > Cc: Christoph Lameter <clameter@sgi.com>
> > Cc: Lee Schermerhorn <Lee.Schermerhorn@hp.com>
> > Cc: Andi Kleen <ak@suse.de>
> > Cc: Randy Dunlap <randy.dunlap@oracle.com>
> > Signed-off-by: David Rientjes <rientjes@google.com>
> > Signed-off-by: Paul Jackson <pj@sgi.com>
> > ---
> >  Documentation/filesystems/tmpfs.txt     |   12 +++
> >  Documentation/vm/numa_memory_policy.txt |  131
> > +++++++++++++++++++++++-------
> >  2 files changed, 112 insertions(+), 31 deletions(-)
> > 
> > diff --git a/Documentation/vm/numa_memory_policy.txt
> > b/Documentation/vm/numa_memory_policy.txt
> > --- a/Documentation/vm/numa_memory_policy.txt
> > +++ b/Documentation/vm/numa_memory_policy.txt
> 
> > @@ -231,6 +234,80 @@ Components of Memory Policies
> >  	    the temporary interleaved system default policy works in this
> >  	    mode.
> >  +   Linux memory policy supports the following optional mode flag:
> 
>                                                               flags:
> 
> > +
> > +	MPOL_F_STATIC_NODES:  This flag specifies that the nodemask passed by
> > +	the user should not be remapped if the task or VMA's set of allowed
> > +	nodes changes after the memory policy has been defined.
> > +
> > +	    Without this flag, anytime a mempolicy is rebound because of a
> > +	    change in the set of allowed nodes, the node (Preferred) or
> > +	    nodemask (Bind, Interleave) is remapped to the new set of
> > +	    allowed nodes.  This may result in nodes being used that were
> > +	    previously undesired.
> > +
> > +	    With this flag, if the user specified nodes overlap with the
> 
>                                   user-specified
> 
> > +	    nodes allowed by the task's cpuset, then the memory policy is
> > +	    applied to their intersection.  If the two sets of nodes do not
> > +	    overlap, the Default policy is used.
> > +
> > +	    For example, consider a task that is attached to a cpuset with
> > +	    mems 1-3 that sets an Interleave policy over the same set.  If
> > +	    the cpuset's mems change to 3-5, the Interleave will now occur
> > +	    over nodes 3, 4, and 5.  With this flag, however, since only node
> > +	    3 is allowed from the user's nodemask, the "interleave" only
> > +	    occurs over that node.  If no nodes from the user's nodemask are
> > +	    now allowed, the Default behavior is used.
> > +
> > +	    MPOL_F_STATIC_NODES cannot be used with MPOL_F_RELATIVE_NODES.
> > +
> 

Andrew, please fold this into 
mempolicy-update-numa-memory-policy-documentation.patch if this patchset is 
added to -mm.

Corrects two errors in Documentation/vm/numa_memory_policy.txt as identified by 
Randy Dunlap <randy.dunlap@oracle.com>.

Cc: Randy Dunlap <randy.dunlap@oracle.com>
Cc: Paul Jackson <pj@sgi.com>
Signed-off-by: David Rientjes <rientjes@google.com>
---
 Documentation/vm/numa_memory_policy.txt |    4 ++--
 1 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/Documentation/vm/numa_memory_policy.txt b/Documentation/vm/numa_memory_policy.txt
--- a/Documentation/vm/numa_memory_policy.txt
+++ b/Documentation/vm/numa_memory_policy.txt
@@ -234,7 +234,7 @@ Components of Memory Policies
 	    the temporary interleaved system default policy works in this
 	    mode.
 
-   Linux memory policy supports the following optional mode flag:
+   Linux memory policy supports the following optional mode flags:
 
 	MPOL_F_STATIC_NODES:  This flag specifies that the nodemask passed by
 	the user should not be remapped if the task or VMA's set of allowed
@@ -246,7 +246,7 @@ Components of Memory Policies
 	    allowed nodes.  This may result in nodes being used that were
 	    previously undesired.
 
-	    With this flag, if the user specified nodes overlap with the
+	    With this flag, if the user-specified nodes overlap with the
 	    nodes allowed by the task's cpuset, then the memory policy is
 	    applied to their intersection.  If the two sets of nodes do not
 	    overlap, the Default policy is used.

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [patch 1/6] mempolicy: convert MPOL constants to enum
  2008-03-01  0:44 [patch 1/6] mempolicy: convert MPOL constants to enum David Rientjes
  2008-03-01  0:45 ` [patch 2/6] mempolicy: support optional mode flags David Rientjes
@ 2008-03-03  7:17 ` Andrew Morton
  1 sibling, 0 replies; 22+ messages in thread
From: Andrew Morton @ 2008-03-03  7:17 UTC (permalink / raw)
  To: David Rientjes
  Cc: Paul Jackson, Christoph Lameter, Lee Schermerhorn, Andi Kleen,
	linux-kernel

On Fri, 29 Feb 2008 16:44:59 -0800 (PST) David Rientjes <rientjes@google.com> wrote:

> The mempolicy mode constants, MPOL_DEFAULT, MPOL_PREFERRED, MPOL_BIND,
> and MPOL_INTERLEAVE, are better declared as part of an enum since they
> are sequentially numbered and cannot be combined.
> 
> The policy member of struct mempolicy is also converted from type short
> to type unsigned short.  A negative policy does not have any legitimate
> meaning, so it is possible to change its type in preparation for adding
> optional mode flags later.
> 
> The equivalent member of struct shmem_sb_info is also changed from int
> to unsigned short.
> 
> For compatibility, the policy formal to get_mempolicy() remains as a
> pointer to an int:
> 
> 	int get_mempolicy(int *policy, unsigned long *nmask,
> 			  unsigned long maxnode, unsigned long addr,
> 			  unsigned long flags);
> 
> although the only possible values is the range of type unsigned short.

The collisions between this work and Mel's

mm-use-zonelists-instead-of-zones-when-direct-reclaiming-pages.patch
mm-introduce-node_zonelist-for-accessing-the-zonelist-for-a-gfp-mask.patch
mm-remember-what-the-preferred-zone-is-for-zone_statistics.patch
mm-use-two-zonelist-that-are-filtered-by-gfp-mask.patch
mm-have-zonelist-contains-structs-with-both-a-zone-pointer-and-zone_idx.patch
mm-filter-based-on-a-nodemask-as-well-as-a-gfp_mask.patch

are more than I'm comfortable with fixing, sorry.

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [patch 5/6] mempolicy: add MPOL_F_RELATIVE_NODES flag
  2008-02-27 15:37           ` Lee Schermerhorn
  2008-02-27 17:09             ` Paul Jackson
@ 2008-02-28 21:08             ` David Rientjes
  1 sibling, 0 replies; 22+ messages in thread
From: David Rientjes @ 2008-02-28 21:08 UTC (permalink / raw)
  To: Lee Schermerhorn
  Cc: Paul Jackson, Christoph Lameter, Andi Kleen, linux-kernel,
	Michael Kerrisk

On Wed, 27 Feb 2008, Lee Schermerhorn wrote:

> > Here's some examples of the functional changes between the default 
> > actions of the various mempolicy modes and the new behavior with 
> > MPOL_F_STATIC_NODES or MPOL_F_RELATIVE_NODES.
> 
> Nice work.  Would you consider adding this [with the corrections you
> note below] to the memory policy doc under the "interaction with
> cpusets" section?
> 

I think it's overly verbose for adding to documentation.  Other than 
describing the effects of MPOL_F_STATIC_NODES and MPOL_F_RELATIVE_NODES, I 
don't think much more in the way of examples need to be provided (they 
were here to show that the patchset actually works on creation and 
rebind).

> > 			MPOL_INTERLEAVE
> > 			---------------
> > 	mems	nodemask	result		rebind		result
> > 	1-3	0-2		1-2[*]		4-6		4-5
> > 	1-3	1-2		1-2		0-2		0-1
> > 	1-3	1-3		1-3		4-7		4-6
> > 	1-3	2-4		2-3		0-2		1-2
> > 	1-3	2-6		2-3		4-7		5-6
> > 	1-3	4-7		EINVAL
> > 	1-3	0-7		1-3		4-7		4-6
> > 
> > 			MPOL_PREFERRED
> > 			--------------
> > 	mems	nodemask	result		rebind		result
> > 	1-3	0		EINVAL
> > 	1-3	2		2		4-7		5
> > 	1-3	5		EINVAL
> > 
> > 			MPOL_BIND
> > 			---------
> > 	mems	nodemask	result		rebind		result
> > 	1-3	0-2		1-2		0-2		0-1
> > 	1-3	1-2		1-2		2-7		2-3
> > 	1-3	1-3		1-3		0-1		0-1
> > 	1-3	2-4		2-3		3-6		4-5
> > 	1-3	2-6		2-3		5		5
> > 	1-3	4-7		EINVAL
> > 	1-3	0-7		1-3		1-3		1-3
> 
> Just a note here:  If you had used the same set of "rebind targets" for
> _BIND as you did for _INTERLEAVE, I would expect the same results,
> because were just remapping bit masks in both cases.  Do you agree?
>  

Yes, the results are the same for both interleave and bind and that's 
actually why I didn't use the same rebind targets.

> >  [*] Notice how the resulting nodemask for all of these examples when
> >      creating the mempolicy is intersected with mems_allowed.  This is
> >      the current behavior, with contextualize_policy(), and is identical
> >      to the initial result of the MPOL_F_STATIC_NODES case.
> > 
> >      Perhaps it would make more sense to remap the nodemask when it is
> >      created, as well, in the ~MPOL_F_STATIC_NODES case.  For example, in
> >      this case, the "result" would be 1-3 instead.
> > 
> >      That is a departure from what is currently implemented in HEAD (and,
> >      thus, can be used as ample justification for the above behavior) but
> >      makes more sense.  Thoughts?
> 
> Thoughts:
> 
> 1) this IS a change in behavior, right?  My first inclination is to shy
> away from this.  However, ...
> 

Yes, it's the current behavior as a result of contextualize_policy() 
before mpol_new() is ever reached.

> 2) the current interaction of mempolicies with cpusets is not well
> documented--until Paul's cpuset.4 man page hits the streets, anyway.
> That doc does say that mempolicy is not allowed to use a node outside
> the cpuset.  It does NOT say how this is enforced--reject vs masking vs
> remap.  The set_mempolicy(2) and mbind(2) man pages [in at least 2.70
> man pages] says that you get EINVAL if you specify a node outside the
> current cpuset constraints.  This was relaxed by the recent patch to
> "silently restrict" the nodes to mems allowed.
> 
> Since we update the man pages anyway, we COULD change it to say that we
> remap policy to allowed nodes.  However, the application may have chosen
> the nodes based on some knowledge of hardware topology, such as IO
> attachement, interrupt handling cpus, ...  In this case, remapping
> doesn't make so much sense to me.  
> 
> If you need/want a mode that remaps policy to mems allowed on
> installation--e.g., to provide the maximum number of interleave
> nodes--how about yet another flag, such as '_REMAP, to effect this
> behavior?
> 

MPOL_F_RELATIVE_NODES actually does map onto and fold the passed nodemask 
when the mempolicy is created.  That flag specifically refers to relative 
nodes within a task's mems_allowed all the time; MPOL_F_STATIC_NODES 
refers specifically to physical node ids.

So adding MPOL_F_REMAP doesn't make sense for MPOL_F_RELATIVE_NODES since 
its already mapped onto and folded, and doesn't make sense for 
MPOL_F_STATIC_NODES since the nodemask is explicitly never remapped.

It would make sense as an optional third flag that would be disjoint from 
the other two and would only be useful when the mempolicy is created.

> > 			MPOL_INTERLEAVE | MPOL_F_STATIC_NODES
> > 			-------------------------------------
> > 	mems	nodemask	result		rebind		result
> > 	1-3	0-2		1-2		4-6		nil
> > 	1-3	1-2		1-2		0-2		1-2
> > 	1-3	1-3		1-3		4-7		nil
> > 	1-3	2-4		2-3		0-2		2
> > 	1-3	2-6		2-3		4-7		4-6
> > 	1-3	4-7		EINVAL
> > 	1-3	0-7		1-3		4-7		4-7
> 
> 'nil' falls back to local allocation, right?
> 

Yes, it's an interleave over no nodes and is thus a local allocation 
(current->il_next is numa_node_id()).

		David

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [patch 5/6] mempolicy: add MPOL_F_RELATIVE_NODES flag
  2008-02-27 15:37           ` Lee Schermerhorn
@ 2008-02-27 17:09             ` Paul Jackson
  2008-02-28 21:08             ` David Rientjes
  1 sibling, 0 replies; 22+ messages in thread
From: Paul Jackson @ 2008-02-27 17:09 UTC (permalink / raw)
  To: Lee Schermerhorn; +Cc: rientjes, clameter, ak, linux-kernel, mtk-manpages

Lee wrote:
> 1) this IS a change in behavior, right?  My first inclination is to shy
> away from this.  However, ...
> 
> 2) the current interaction of mempolicies with cpusets is not well
> documented--until Paul's cpuset.4 man page hits the streets, anyway.
> That doc does say that mempolicy is not allowed to use a node outside
> the cpuset.  It does NOT say how this is enforced--reject vs masking vs

We should strive to minimize change in existing behaviour, especially
on points where either way could be arguably a reasonable choice in
some situations, and neither way is a clearly inescapable bug in other
situations.

One almost always breaks someone, somewhere with such changes.  You
need a compelling reason to do that, not just a "on this hand, on that
hand" design choice.

-- 
                  I won't rest till it's the best ...
                  Programmer, Linux Scalability
                  Paul Jackson <pj@sgi.com> 1.940.382.4214

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [patch 5/6] mempolicy: add MPOL_F_RELATIVE_NODES flag
  2008-02-27  1:17         ` David Rientjes
  2008-02-27  1:31           ` David Rientjes
  2008-02-27  2:30           ` Paul Jackson
@ 2008-02-27 15:37           ` Lee Schermerhorn
  2008-02-27 17:09             ` Paul Jackson
  2008-02-28 21:08             ` David Rientjes
  2 siblings, 2 replies; 22+ messages in thread
From: Lee Schermerhorn @ 2008-02-27 15:37 UTC (permalink / raw)
  To: David Rientjes
  Cc: Paul Jackson, Christoph Lameter, Andi Kleen, linux-kernel,
	Michael Kerrisk

On Tue, 2008-02-26 at 17:17 -0800, David Rientjes wrote:
> On Mon, 25 Feb 2008, David Rientjes wrote:
> 
> > Adds another optional mode flag, MPOL_F_RELATIVE_NODES, that specifies
> > nodemasks passed via set_mempolicy() or mbind() should be considered
> > relative to the current task's mems_allowed.
> > 
> 
> Here's some examples of the functional changes between the default 
> actions of the various mempolicy modes and the new behavior with 
> MPOL_F_STATIC_NODES or MPOL_F_RELATIVE_NODES.

Nice work.  Would you consider adding this [with the corrections you
note below] to the memory policy doc under the "interaction with
cpusets" section?

> 
> To read this, the logical order follows from the left-most column to the 
> right-most:
> 
>  - "mems" is the task's mems_allowed as constrained by its attached
>    cpuset,
> 
>  - "nodemask" is the mask passed with the set_mempolicy() or mbind() call 
>    for that particular policy,
> 
>  - the first "result" is the nodemask that the policy is effected over,
> 
>  - "rebind" is the nodemask of a subsequent change to the cpuset's mems,
>    and
> 
>  - the second "result" is the nodemask that the policy is now effected 
>    over.
> 
> 			MPOL_INTERLEAVE
> 			---------------
> 	mems	nodemask	result		rebind		result
> 	1-3	0-2		1-2[*]		4-6		4-5
> 	1-3	1-2		1-2		0-2		0-1
> 	1-3	1-3		1-3		4-7		4-6
> 	1-3	2-4		2-3		0-2		1-2
> 	1-3	2-6		2-3		4-7		5-6
> 	1-3	4-7		EINVAL
> 	1-3	0-7		1-3		4-7		4-6
> 
> 			MPOL_PREFERRED
> 			--------------
> 	mems	nodemask	result		rebind		result
> 	1-3	0		EINVAL
> 	1-3	2		2		4-7		5
> 	1-3	5		EINVAL
> 
> 			MPOL_BIND
> 			---------
> 	mems	nodemask	result		rebind		result
> 	1-3	0-2		1-2		0-2		0-1
> 	1-3	1-2		1-2		2-7		2-3
> 	1-3	1-3		1-3		0-1		0-1
> 	1-3	2-4		2-3		3-6		4-5
> 	1-3	2-6		2-3		5		5
> 	1-3	4-7		EINVAL
> 	1-3	0-7		1-3		1-3		1-3

Just a note here:  If you had used the same set of "rebind targets" for
_BIND as you did for _INTERLEAVE, I would expect the same results,
because were just remapping bit masks in both cases.  Do you agree?
 
> 
>  [*] Notice how the resulting nodemask for all of these examples when
>      creating the mempolicy is intersected with mems_allowed.  This is
>      the current behavior, with contextualize_policy(), and is identical
>      to the initial result of the MPOL_F_STATIC_NODES case.
> 
>      Perhaps it would make more sense to remap the nodemask when it is
>      created, as well, in the ~MPOL_F_STATIC_NODES case.  For example, in
>      this case, the "result" would be 1-3 instead.
> 
>      That is a departure from what is currently implemented in HEAD (and,
>      thus, can be used as ample justification for the above behavior) but
>      makes more sense.  Thoughts?

Thoughts:

1) this IS a change in behavior, right?  My first inclination is to shy
away from this.  However, ...

2) the current interaction of mempolicies with cpusets is not well
documented--until Paul's cpuset.4 man page hits the streets, anyway.
That doc does say that mempolicy is not allowed to use a node outside
the cpuset.  It does NOT say how this is enforced--reject vs masking vs
remap.  The set_mempolicy(2) and mbind(2) man pages [in at least 2.70
man pages] says that you get EINVAL if you specify a node outside the
current cpuset constraints.  This was relaxed by the recent patch to
"silently restrict" the nodes to mems allowed.

Since we update the man pages anyway, we COULD change it to say that we
remap policy to allowed nodes.  However, the application may have chosen
the nodes based on some knowledge of hardware topology, such as IO
attachement, interrupt handling cpus, ...  In this case, remapping
doesn't make so much sense to me.  

If you need/want a mode that remaps policy to mems allowed on
installation--e.g., to provide the maximum number of interleave
nodes--how about yet another flag, such as '_REMAP, to effect this
behavior?

Just a thought...

> 
> 			MPOL_INTERLEAVE | MPOL_F_STATIC_NODES
> 			-------------------------------------
> 	mems	nodemask	result		rebind		result
> 	1-3	0-2		1-2		4-6		nil
> 	1-3	1-2		1-2		0-2		1-2
> 	1-3	1-3		1-3		4-7		nil
> 	1-3	2-4		2-3		0-2		2
> 	1-3	2-6		2-3		4-7		4-6
> 	1-3	4-7		EINVAL
> 	1-3	0-7		1-3		4-7		4-7

'nil' falls back to local allocation, right?

> 
> 			MPOL_PREFERRED | MPOL_F_STATIC_NODES
> 			------------------------------------
> 	mems	nodemask	result		rebind		result
> 	1-3	0		EINVAL
> 	1-3	2		2		4-7		-1[**]
> 	1-3	5		EINVAL
> 
>  [**] Upon further rebind with a nodemask of 2, the preferred node would
>       again be 2.

Here, '-1' means 'local allocation'.  [Note for documentation...]

> 
> 			MPOL_BIND | MPOL_F_STATIC_NODES
> 			-------------------------------
> 	mems	nodemask	result		rebind		result
> 	1-3	0-2		1-2		0-2		0-2
> 	1-3	1-2		1-2		2-7		2
> 	1-3	1-3		1-3		0-1		1
> 	1-3	2-4		2-3		3-6		3-4
> 	1-3	2-6		2-3		5		5
> 	1-3	4-7		EINVAL
> 	1-3	0-7		1-3		1-3		1-3
> 
> 			MPOL_INTERLEAVE | MPOL_F_RELATIVE_NODES
> 			---------------------------------------
> 	mems	nodemask	result		rebind		result
> 	1-3	0-2		1-3		4-6		4-6
> 	1-3	1-2		2-3		0-2		1-2
> 	1-3	1-3		1-3		4-7		5-7
> 	1-3	2-4		1-3		0-2		0-2
> 	1-3	2-6		1-3		4-7		4-7
> 	1-3	4-7		1-3		0-1,5		0-1,5
> 	1-3	0-7		1-3		4-7		4-7
> 
> 			MPOL_PREFERRED | MPOL_F_RELATIVE_NODES
> 			--------------------------------------
> 	mems	nodemask	result		rebind		result[***]
> 	1-3	0		1		0		1
> 	1-3	2		3		4-7		3
> 	1-3	5		3		0-7		3
> 
>  [***] All of these results are wrong and will be corrected in the next
>        posting of the patchset.  They change the preferred node in some
>        cases to be a node that is expressly excluded from being accessed
>        by the cpuset mems change.
> 
> 			MPOL_BIND | MPOL_F_RELATIVE_NODES
> 			---------------------------------
> 	mems	nodemask	result		rebind		result
> 	1-3	0-2		1-3		0-2		0-2
> 	1-3	1-2		2-3		2-7		3-4
> 	1-3	1-3		1-3		0-1		0-1
> 	1-3	2-4		1-3		3-6		3,5-6
> 	1-3	2-6		1-3		5		5
> 	1-3	4-7		1-3		0-3,6		0-2,6
> 	1-3	0-7		1-3		1-3		1-3


^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [patch 5/6] mempolicy: add MPOL_F_RELATIVE_NODES flag
  2008-02-27  1:17         ` David Rientjes
  2008-02-27  1:31           ` David Rientjes
@ 2008-02-27  2:30           ` Paul Jackson
  2008-02-27 15:37           ` Lee Schermerhorn
  2 siblings, 0 replies; 22+ messages in thread
From: Paul Jackson @ 2008-02-27  2:30 UTC (permalink / raw)
  To: David Rientjes; +Cc: clameter, Lee.Schermerhorn, ak, linux-kernel

Nice presentation ... thanks.

I agree with your patch to fix the MPOL_PREFERRED | MPOL_F_RELATIVE_NODES case.

-- 
                  I won't rest till it's the best ...
                  Programmer, Linux Scalability
                  Paul Jackson <pj@sgi.com> 1.940.382.4214

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [patch 5/6] mempolicy: add MPOL_F_RELATIVE_NODES flag
  2008-02-27  1:17         ` David Rientjes
@ 2008-02-27  1:31           ` David Rientjes
  2008-02-27  2:30           ` Paul Jackson
  2008-02-27 15:37           ` Lee Schermerhorn
  2 siblings, 0 replies; 22+ messages in thread
From: David Rientjes @ 2008-02-27  1:31 UTC (permalink / raw)
  To: Paul Jackson
  Cc: Christoph Lameter, Lee Schermerhorn, Andi Kleen, linux-kernel

On Tue, 26 Feb 2008, David Rientjes wrote:

> 			MPOL_PREFERRED | MPOL_F_RELATIVE_NODES
> 			--------------------------------------
> 	mems	nodemask	result		rebind		result[***]
> 	1-3	0		1		0		1
> 	1-3	2		3		4-7		3
> 	1-3	5		3		0-7		3
> 

I'll fold the following patch into my next posting that corrects this 
error as follows:

			MPOL_PREFERRED | MPOL_F_RELATIVE_NODES
			--------------------------------------
	mems	nodemask	result		rebind		result
	1-3	0		1		0		0
	1-3	2		3		4-7		6
	1-3	5		3		0-7		5
---
 mm/mempolicy.c |    3 +--
 1 files changed, 1 insertions(+), 2 deletions(-)

diff --git a/mm/mempolicy.c b/mm/mempolicy.c
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -1809,8 +1809,7 @@ static void mpol_rebind_policy(struct mempolicy *pol,
 		} else if (relative_nodes) {
 			mpol_relative_nodemask(&tmp, &pol->w.user_nodemask,
 					       newmask);
-			pol->v.preferred_node = node_remap(pol->v.preferred_node,
-					pol->w.cpuset_mems_allowed, tmp);
+			pol->v.preferred_node = first_node(tmp);
 		} else {
 			pol->v.preferred_node = node_remap(pol->v.preferred_node,
 					pol->w.cpuset_mems_allowed, *newmask);

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [patch 5/6] mempolicy: add MPOL_F_RELATIVE_NODES flag
  2008-02-25 15:35       ` [patch 5/6] mempolicy: add MPOL_F_RELATIVE_NODES flag David Rientjes
  2008-02-26  6:12         ` Paul Jackson
  2008-02-26 17:44         ` Paul Jackson
@ 2008-02-27  1:17         ` David Rientjes
  2008-02-27  1:31           ` David Rientjes
                             ` (2 more replies)
  2 siblings, 3 replies; 22+ messages in thread
From: David Rientjes @ 2008-02-27  1:17 UTC (permalink / raw)
  To: Paul Jackson
  Cc: Christoph Lameter, Lee Schermerhorn, Andi Kleen, linux-kernel

On Mon, 25 Feb 2008, David Rientjes wrote:

> Adds another optional mode flag, MPOL_F_RELATIVE_NODES, that specifies
> nodemasks passed via set_mempolicy() or mbind() should be considered
> relative to the current task's mems_allowed.
> 

Here's some examples of the functional changes between the default 
actions of the various mempolicy modes and the new behavior with 
MPOL_F_STATIC_NODES or MPOL_F_RELATIVE_NODES.

To read this, the logical order follows from the left-most column to the 
right-most:

 - "mems" is the task's mems_allowed as constrained by its attached
   cpuset,

 - "nodemask" is the mask passed with the set_mempolicy() or mbind() call 
   for that particular policy,

 - the first "result" is the nodemask that the policy is effected over,

 - "rebind" is the nodemask of a subsequent change to the cpuset's mems,
   and

 - the second "result" is the nodemask that the policy is now effected 
   over.

			MPOL_INTERLEAVE
			---------------
	mems	nodemask	result		rebind		result
	1-3	0-2		1-2[*]		4-6		4-5
	1-3	1-2		1-2		0-2		0-1
	1-3	1-3		1-3		4-7		4-6
	1-3	2-4		2-3		0-2		1-2
	1-3	2-6		2-3		4-7		5-6
	1-3	4-7		EINVAL
	1-3	0-7		1-3		4-7		4-6

			MPOL_PREFERRED
			--------------
	mems	nodemask	result		rebind		result
	1-3	0		EINVAL
	1-3	2		2		4-7		5
	1-3	5		EINVAL

			MPOL_BIND
			---------
	mems	nodemask	result		rebind		result
	1-3	0-2		1-2		0-2		0-1
	1-3	1-2		1-2		2-7		2-3
	1-3	1-3		1-3		0-1		0-1
	1-3	2-4		2-3		3-6		4-5
	1-3	2-6		2-3		5		5
	1-3	4-7		EINVAL
	1-3	0-7		1-3		1-3		1-3

 [*] Notice how the resulting nodemask for all of these examples when
     creating the mempolicy is intersected with mems_allowed.  This is
     the current behavior, with contextualize_policy(), and is identical
     to the initial result of the MPOL_F_STATIC_NODES case.

     Perhaps it would make more sense to remap the nodemask when it is
     created, as well, in the ~MPOL_F_STATIC_NODES case.  For example, in
     this case, the "result" would be 1-3 instead.

     That is a departure from what is currently implemented in HEAD (and,
     thus, can be used as ample justification for the above behavior) but
     makes more sense.  Thoughts?

			MPOL_INTERLEAVE | MPOL_F_STATIC_NODES
			-------------------------------------
	mems	nodemask	result		rebind		result
	1-3	0-2		1-2		4-6		nil
	1-3	1-2		1-2		0-2		1-2
	1-3	1-3		1-3		4-7		nil
	1-3	2-4		2-3		0-2		2
	1-3	2-6		2-3		4-7		4-6
	1-3	4-7		EINVAL
	1-3	0-7		1-3		4-7		4-7

			MPOL_PREFERRED | MPOL_F_STATIC_NODES
			------------------------------------
	mems	nodemask	result		rebind		result
	1-3	0		EINVAL
	1-3	2		2		4-7		-1[**]
	1-3	5		EINVAL

 [**] Upon further rebind with a nodemask of 2, the preferred node would
      again be 2.

			MPOL_BIND | MPOL_F_STATIC_NODES
			-------------------------------
	mems	nodemask	result		rebind		result
	1-3	0-2		1-2		0-2		0-2
	1-3	1-2		1-2		2-7		2
	1-3	1-3		1-3		0-1		1
	1-3	2-4		2-3		3-6		3-4
	1-3	2-6		2-3		5		5
	1-3	4-7		EINVAL
	1-3	0-7		1-3		1-3		1-3

			MPOL_INTERLEAVE | MPOL_F_RELATIVE_NODES
			---------------------------------------
	mems	nodemask	result		rebind		result
	1-3	0-2		1-3		4-6		4-6
	1-3	1-2		2-3		0-2		1-2
	1-3	1-3		1-3		4-7		5-7
	1-3	2-4		1-3		0-2		0-2
	1-3	2-6		1-3		4-7		4-7
	1-3	4-7		1-3		0-1,5		0-1,5
	1-3	0-7		1-3		4-7		4-7

			MPOL_PREFERRED | MPOL_F_RELATIVE_NODES
			--------------------------------------
	mems	nodemask	result		rebind		result[***]
	1-3	0		1		0		1
	1-3	2		3		4-7		3
	1-3	5		3		0-7		3

 [***] All of these results are wrong and will be corrected in the next
       posting of the patchset.  They change the preferred node in some
       cases to be a node that is expressly excluded from being accessed
       by the cpuset mems change.

			MPOL_BIND | MPOL_F_RELATIVE_NODES
			---------------------------------
	mems	nodemask	result		rebind		result
	1-3	0-2		1-3		0-2		0-2
	1-3	1-2		2-3		2-7		3-4
	1-3	1-3		1-3		0-1		0-1
	1-3	2-4		1-3		3-6		3,5-6
	1-3	2-6		1-3		5		5
	1-3	4-7		1-3		0-3,6		0-2,6
	1-3	0-7		1-3		1-3		1-3

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [patch 5/6] mempolicy: add MPOL_F_RELATIVE_NODES flag
  2008-02-26 21:17           ` David Rientjes
@ 2008-02-26 21:30             ` Paul Jackson
  0 siblings, 0 replies; 22+ messages in thread
From: Paul Jackson @ 2008-02-26 21:30 UTC (permalink / raw)
  To: David Rientjes; +Cc: akpm, clameter, Lee.Schermerhorn, ak, linux-kernel

David wrote:
> I don't understand what Oracle has to do with anything here ...

I'm figuring that the risk is high that we broke the existing calls,
when the new MPOL_F_*_NODES flags are not used.

> If you have any examples of unexpected behavior after applying my 
> patchset, please post it and I'll fix it rather quickly.

That won't cut it for Oracle, or other such major applications
using NUMA mempolicy calls.  If we wait for them to report breakage,
it will be a year or two down the road from now, and it would be
another year or two before they could presume the fix was in the
kernel version they were supporting.

-- 
                  I won't rest till it's the best ...
                  Programmer, Linux Scalability
                  Paul Jackson <pj@sgi.com> 1.940.382.4214

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [patch 5/6] mempolicy: add MPOL_F_RELATIVE_NODES flag
  2008-02-26 17:44         ` Paul Jackson
  2008-02-26 21:17           ` David Rientjes
@ 2008-02-26 21:27           ` Lee Schermerhorn
  1 sibling, 0 replies; 22+ messages in thread
From: Lee Schermerhorn @ 2008-02-26 21:27 UTC (permalink / raw)
  To: Paul Jackson, David Rientjes; +Cc: akpm, clameter, ak, linux-kernel, Mel Gorman

On Tue, 2008-02-26 at 11:44 -0600, Paul Jackson wrote:
> David,
> 
> Perhaps I missed it, but could you elaborate on what sort of testing
> these patches for MPOL_F_RELATIVE_NODES and MPOL_F_STATIC_NODES have
> received?
> 
> The main reason I didn't push my version of these patches in December
> was I figured it would take a week or three of obsessive-compulsive
> testing to verify that we didn't break various corner cases of the
> mbind/mempolicy system call interface.
> 
> In particular, do we know that Oracle works with this?  At least in
> years past, when Andi was the primary developer here, he had some
> good and detailed awareness of what it took to keep Oracle happy
> with this NUMA memory policy apparatus.  I don't know if we still
> have that awareness.
> 

I've recently been "playing with" Andi's numactl/libnuma/mempolicy
regression tests on 25-rc2-mm1.  I found a couple of problems caused by
changes in behavior on newer kernels--not in the mempolicy area, but in
a couple of the tests themselves and in numastat, used by the primary
mempolicy regression test.  I haven't seen any discussion of these
issues on the mailing lists, or I missed it.  So, I've attempted my own
fixes.

I have placed a tarball containing patches against numactl-1.0.2 sources
at:
http://free.linux.hp.com/~lts/Patches/Numactl/numactl-1.0.2-patches-080226.tar.gz

See the patch descriptions for more info.  The individual patches and
series live in the same Numactl directory for easy browsing.

The numactl-1.0.2 sources are located in Andi's repository at:

	ftp://ftp.suse.com/pub/people/ak/numa/numactl-1.0.2.tar.gz

With [3(1) of] these patches, the regression tests pass on my x86_64
NUMA test platform [4 socket/node, dual core, 32GB] on 2.6.25-rc2-mm1
with and without Mel Gorman's "two zonelist" patch series.  I haven't
gotten around to checking out David's latest series yet, nor have I
tried Andi's tests with cpusets.  I don't know whether the tests will
pass in arbitrary cpusets--I expect not.  It's on my list.  In any case,
you can grab the patches if you want to regression test your own
patches.

Regards,
Lee

(1) patches required for regression tests:

numastat-01-fix-report-order-for-new-kernels.patch
regress2-01-fix-checkaffinity-numnodes-calc.patch
regress2-02-checktopology-numnodes-calc.patch

regress-01-reorg-and-annotate.patch is just a "beautification" [in the
eye of the beholder, of course] patch so that I could figure out what
was going on.


	


^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [patch 5/6] mempolicy: add MPOL_F_RELATIVE_NODES flag
  2008-02-26 17:44         ` Paul Jackson
@ 2008-02-26 21:17           ` David Rientjes
  2008-02-26 21:30             ` Paul Jackson
  2008-02-26 21:27           ` Lee Schermerhorn
  1 sibling, 1 reply; 22+ messages in thread
From: David Rientjes @ 2008-02-26 21:17 UTC (permalink / raw)
  To: Paul Jackson; +Cc: akpm, clameter, Lee.Schermerhorn, ak, linux-kernel

On Tue, 26 Feb 2008, Paul Jackson wrote:

> Perhaps I missed it, but could you elaborate on what sort of testing
> these patches for MPOL_F_RELATIVE_NODES and MPOL_F_STATIC_NODES have
> received?
> 

Both MPOL_F_STATIC_NODES and MPOL_F_RELATIVE_NODES only really change the 
implementation when a new mempolicy is created and on rebind.

So it's possible to test both of these areas by booting with numa=fake=8, 
setting up a cpuset, hacking numactl to allow |static or |relative options 
to be passed, changing the cpuset's mems, and checking the result.  Then 
you can repeat that process with a different set of initial cpuset mems 
and a different syscall nodemask.

	# mount -t cpuset none /dev/cpuset
	# mkdir /dev/cpuset/test
	# cd /dev/cpuset/test
	# echo 0-3 > cpus
	# echo 1-3 > mems
	# echo $$ > tasks
	# numactl --interleave|static=2-4 bash
	# numactl --show
	[should be interleave over 2-3]
	# echo 4-6 > mems
	[should be "interleave" over 4]
	# numactl --show

> In particular, do we know that Oracle works with this?  At least in
> years past, when Andi was the primary developer here, he had some
> good and detailed awareness of what it took to keep Oracle happy
> with this NUMA memory policy apparatus.  I don't know if we still
> have that awareness.
> 

I don't understand what Oracle has to do with anything here, it is up to 
the programmer to determine whether he wants to use MPOL_F_STATIC_NODES or 
MPOL_F_RELATIVE_NODES for the benefit of his or her application.  There's 
now a method for doing that, specifically with optional mempolicy mode 
flags.

If you have any examples of unexpected behavior after applying my 
patchset, please post it and I'll fix it rather quickly.

		David

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [patch 5/6] mempolicy: add MPOL_F_RELATIVE_NODES flag
  2008-02-25 15:35       ` [patch 5/6] mempolicy: add MPOL_F_RELATIVE_NODES flag David Rientjes
  2008-02-26  6:12         ` Paul Jackson
@ 2008-02-26 17:44         ` Paul Jackson
  2008-02-26 21:17           ` David Rientjes
  2008-02-26 21:27           ` Lee Schermerhorn
  2008-02-27  1:17         ` David Rientjes
  2 siblings, 2 replies; 22+ messages in thread
From: Paul Jackson @ 2008-02-26 17:44 UTC (permalink / raw)
  To: David Rientjes; +Cc: akpm, clameter, Lee.Schermerhorn, ak, linux-kernel

David,

Perhaps I missed it, but could you elaborate on what sort of testing
these patches for MPOL_F_RELATIVE_NODES and MPOL_F_STATIC_NODES have
received?

The main reason I didn't push my version of these patches in December
was I figured it would take a week or three of obsessive-compulsive
testing to verify that we didn't break various corner cases of the
mbind/mempolicy system call interface.

In particular, do we know that Oracle works with this?  At least in
years past, when Andi was the primary developer here, he had some
good and detailed awareness of what it took to keep Oracle happy
with this NUMA memory policy apparatus.  I don't know if we still
have that awareness.

-- 
                  I won't rest till it's the best ...
                  Programmer, Linux Scalability
                  Paul Jackson <pj@sgi.com> 1.940.382.4214

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [patch 5/6] mempolicy: add MPOL_F_RELATIVE_NODES flag
  2008-02-26  6:12         ` Paul Jackson
@ 2008-02-26  6:45           ` David Rientjes
  0 siblings, 0 replies; 22+ messages in thread
From: David Rientjes @ 2008-02-26  6:45 UTC (permalink / raw)
  To: Paul Jackson; +Cc: akpm, clameter, Lee.Schermerhorn, ak, linux-kernel

On Tue, 26 Feb 2008, Paul Jackson wrote:

> David wrote:
> +static nodemask_t mpol_relative_nodemask(const nodemask_t *orig,
> +					 const nodemask_t *rel)
> +{
> +	nodemask_t ret;
> +	nodemask_t tmp;
> 
> Could you avoid needing the nodemask_t 'ret' on the stack, by passing
> in a "nodemask_t *" pointer to where you want the resulting nodemask_t
> written, rather than by returning it by value?
> 
>  static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig,
> 					 const nodemask_t *rel)
> 

Done, thanks.

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [patch 5/6] mempolicy: add MPOL_F_RELATIVE_NODES flag
  2008-02-25 15:35       ` [patch 5/6] mempolicy: add MPOL_F_RELATIVE_NODES flag David Rientjes
@ 2008-02-26  6:12         ` Paul Jackson
  2008-02-26  6:45           ` David Rientjes
  2008-02-26 17:44         ` Paul Jackson
  2008-02-27  1:17         ` David Rientjes
  2 siblings, 1 reply; 22+ messages in thread
From: Paul Jackson @ 2008-02-26  6:12 UTC (permalink / raw)
  To: David Rientjes; +Cc: akpm, clameter, Lee.Schermerhorn, ak, linux-kernel

David wrote:
+static nodemask_t mpol_relative_nodemask(const nodemask_t *orig,
+					 const nodemask_t *rel)
+{
+	nodemask_t ret;
+	nodemask_t tmp;

Could you avoid needing the nodemask_t 'ret' on the stack, by passing
in a "nodemask_t *" pointer to where you want the resulting nodemask_t
written, rather than by returning it by value?

 static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig,
					 const nodemask_t *rel)

-- 
                  I won't rest till it's the best ...
                  Programmer, Linux Scalability
                  Paul Jackson <pj@sgi.com> 1.940.382.4214

^ permalink raw reply	[flat|nested] 22+ messages in thread

* [patch 5/6] mempolicy: add MPOL_F_RELATIVE_NODES flag
  2008-02-25 15:35     ` [patch 4/6] mempolicy: add bitmap_onto() and bitmap_fold() operations David Rientjes
@ 2008-02-25 15:35       ` David Rientjes
  2008-02-26  6:12         ` Paul Jackson
                           ` (2 more replies)
  0 siblings, 3 replies; 22+ messages in thread
From: David Rientjes @ 2008-02-25 15:35 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Paul Jackson, Christoph Lameter, Lee Schermerhorn, Andi Kleen,
	linux-kernel

Adds another optional mode flag, MPOL_F_RELATIVE_NODES, that specifies
nodemasks passed via set_mempolicy() or mbind() should be considered
relative to the current task's mems_allowed.

When the mempolicy is created, the passed nodemask is folded and mapped
onto the current task's mems_allowed.  For example, consider a task
using set_mempolicy() to pass MPOL_INTERLEAVE | MPOL_F_RELATIVE_NODES
with a nodemask of 1-3.  If current's mems_allowed is 4-7, the effected
nodemask is 5-7 (the second, third, and fourth node of mems_allowed).

If the same task is attached to a cpuset, the mempolicy nodemask is
rebound each time the mems are changed.  Some possible rebinds and
results are:

	mems			result
	1-3			1-3
	1-7			2-4
	1,5-6			1,5-6
	1,5-7			5-7

Likewise, the zonelist built for MPOL_BIND acts on the set of zones
assigned to the resultant nodemask from the relative remap.

In the MPOL_PREFERRED case, the preferred node is remapped from the
currently effected nodemask to the relative nodemask.

This mempolicy mode flag was conceived of by Paul Jackson <pj@sgi.com>.

Cc: Paul Jackson <pj@sgi.com>
Cc: Christoph Lameter <clameter@sgi.com>
Cc: Lee Schermerhorn <Lee.Schermerhorn@hp.com>
Cc: Andi Kleen <ak@suse.de>
Signed-off-by: David Rientjes <rientjes@google.com>
---
 include/linux/mempolicy.h |    3 ++-
 mm/mempolicy.c            |   40 ++++++++++++++++++++++++++++++++++++++--
 mm/shmem.c                |    6 ++++++
 3 files changed, 46 insertions(+), 3 deletions(-)

diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h
--- a/include/linux/mempolicy.h
+++ b/include/linux/mempolicy.h
@@ -25,12 +25,13 @@ enum {
 
 /* Flags for set_mempolicy */
 #define MPOL_F_STATIC_NODES	(1 << 15)
+#define MPOL_F_RELATIVE_NODES	(1 << 14)
 
 /*
  * MPOL_MODE_FLAGS is the union of all possible optional mode flags passed to
  * either set_mempolicy() or mbind().
  */
-#define MPOL_MODE_FLAGS		(MPOL_F_STATIC_NODES)
+#define MPOL_MODE_FLAGS		(MPOL_F_STATIC_NODES | MPOL_F_RELATIVE_NODES)
 
 /* Flags for get_mempolicy */
 #define MPOL_F_NODE	(1<<0)	/* return next IL mode instead of node mask */
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -152,7 +152,18 @@ static struct zonelist *bind_zonelist(nodemask_t *nodes)
 
 static inline int mpol_store_user_nodemask(const struct mempolicy *pol)
 {
-	return !!(pol->flags & MPOL_F_STATIC_NODES);
+	return !!(pol->flags & (MPOL_F_STATIC_NODES | MPOL_F_RELATIVE_NODES));
+}
+
+static nodemask_t mpol_relative_nodemask(const nodemask_t *orig,
+					 const nodemask_t *rel)
+{
+	nodemask_t ret;
+	nodemask_t tmp;
+
+	nodes_fold(tmp, *orig, nodes_weight(*rel));
+	nodes_onto(ret, tmp, *rel);
+	return ret;
 }
 
 /* Create a new policy */
@@ -173,7 +184,12 @@ static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
 		return ERR_PTR(-ENOMEM);
 	atomic_set(&policy->refcnt, 1);
 	cpuset_update_task_memory_state();
-	nodes_and(cpuset_context_nmask, *nodes, cpuset_current_mems_allowed);
+	if (flags & MPOL_F_RELATIVE_NODES)
+		cpuset_context_nmask = mpol_relative_nodemask(nodes,
+						&cpuset_current_mems_allowed);
+	else
+		nodes_and(cpuset_context_nmask, *nodes,
+			  cpuset_current_mems_allowed);
 	switch (mode) {
 	case MPOL_INTERLEAVE:
 		if (nodes_empty(*nodes) || nodes_empty(cpuset_context_nmask))
@@ -898,6 +914,9 @@ asmlinkage long sys_mbind(unsigned long start, unsigned long len,
 	mode &= ~MPOL_MODE_FLAGS;
 	if (mode >= MPOL_MAX)
 		return -EINVAL;
+	if ((mode_flags & MPOL_F_STATIC_NODES) &&
+	    (mode_flags & MPOL_F_RELATIVE_NODES))
+		return -EINVAL;
 	err = get_nodes(&nodes, nmask, maxnode);
 	if (err)
 		return err;
@@ -916,6 +935,8 @@ asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask,
 	mode &= ~MPOL_MODE_FLAGS;
 	if ((unsigned int)mode >= MPOL_MAX)
 		return -EINVAL;
+	if ((flags & MPOL_F_STATIC_NODES) && (flags & MPOL_F_RELATIVE_NODES))
+		return -EINVAL;
 	err = get_nodes(&nodes, nmask, maxnode);
 	if (err)
 		return err;
@@ -1747,10 +1768,12 @@ static void mpol_rebind_policy(struct mempolicy *pol,
 {
 	nodemask_t tmp;
 	int static_nodes;
+	int relative_nodes;
 
 	if (!pol)
 		return;
 	static_nodes = pol->flags & MPOL_F_STATIC_NODES;
+	relative_nodes = pol->flags & MPOL_F_RELATIVE_NODES;
 	if (!mpol_store_user_nodemask(pol) &&
 	    nodes_equal(pol->w.cpuset_mems_allowed, *newmask))
 		return;
@@ -1761,6 +1784,9 @@ static void mpol_rebind_policy(struct mempolicy *pol,
 	case MPOL_INTERLEAVE:
 		if (static_nodes)
 			nodes_and(tmp, pol->w.user_nodemask, *newmask);
+		else if (relative_nodes)
+			tmp = mpol_relative_nodemask(&pol->w.user_nodemask,
+						     newmask);
 		else {
 			nodes_remap(tmp, pol->v.nodes,
 				    pol->w.cpuset_mems_allowed, *newmask);
@@ -1783,6 +1809,11 @@ static void mpol_rebind_policy(struct mempolicy *pol,
 				pol->v.preferred_node = node;
 			else
 				pol->v.preferred_node = -1;
+		} else if (relative_nodes) {
+			tmp = mpol_relative_nodemask(&pol->w.user_nodemask,
+						     newmask);
+			pol->v.preferred_node = node_remap(pol->v.preferred_node,
+					pol->w.cpuset_mems_allowed, tmp);
 		} else {
 			pol->v.preferred_node = node_remap(pol->v.preferred_node,
 					pol->w.cpuset_mems_allowed, *newmask);
@@ -1794,6 +1825,9 @@ static void mpol_rebind_policy(struct mempolicy *pol,
 
 		if (static_nodes)
 			nodes_and(tmp, pol->w.user_nodemask, *newmask);
+		else if (relative_nodes)
+			tmp = mpol_relative_nodemask(&pol->w.user_nodemask,
+						     newmask);
 		else {
 			nodemask_t nodes;
 			struct zone **z;
@@ -1911,6 +1945,8 @@ static inline int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
 
 		if (flags & MPOL_F_STATIC_NODES)
 			p += sprintf(p, "%sstatic", need_bar++ ? "|" : "");
+		if (flags & MPOL_F_RELATIVE_NODES)
+			p += sprintf(p, "%srelative", need_bar++ ? "|" : "");
 	}
 
 	if (!nodes_empty(nodes)) {
diff --git a/mm/shmem.c b/mm/shmem.c
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1131,6 +1131,12 @@ static int shmem_parse_mpol(char *value, unsigned short *policy,
 	if (flags) {
 		if (!strcmp(flags, "static"))
 			*mode_flags |= MPOL_F_STATIC_NODES;
+		if (!strcmp(flags, "relative"))
+			*mode_flags |= MPOL_F_RELATIVE_NODES;
+
+		if ((*mode_flags & MPOL_F_STATIC_NODES) &&
+		    (*mode_flags & MPOL_F_RELATIVE_NODES))
+			err = 1;
 	}
 out:
 	/* Restore string for error message */

^ permalink raw reply	[flat|nested] 22+ messages in thread

end of thread, other threads:[~2008-03-03  7:18 UTC | newest]

Thread overview: 22+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2008-03-01  0:44 [patch 1/6] mempolicy: convert MPOL constants to enum David Rientjes
2008-03-01  0:45 ` [patch 2/6] mempolicy: support optional mode flags David Rientjes
2008-03-01  0:45   ` [patch 3/6] mempolicy: add MPOL_F_STATIC_NODES flag David Rientjes
2008-03-01  0:45     ` [patch 4/6] mempolicy: add bitmap_onto() and bitmap_fold() operations David Rientjes
2008-03-01  0:45       ` [patch 5/6] mempolicy: add MPOL_F_RELATIVE_NODES flag David Rientjes
2008-03-01  0:45         ` [patch 6/6] mempolicy: update NUMA memory policy documentation David Rientjes
2008-03-01  0:55           ` Randy Dunlap
2008-03-01  1:14             ` David Rientjes
2008-03-03  7:17 ` [patch 1/6] mempolicy: convert MPOL constants to enum Andrew Morton
  -- strict thread matches above, loose matches on Subject: below --
2008-02-25 15:35 David Rientjes
2008-02-25 15:35 ` [patch 2/6] mempolicy: support optional mode flags David Rientjes
2008-02-25 15:35   ` [patch 3/6] mempolicy: add MPOL_F_STATIC_NODES flag David Rientjes
2008-02-25 15:35     ` [patch 4/6] mempolicy: add bitmap_onto() and bitmap_fold() operations David Rientjes
2008-02-25 15:35       ` [patch 5/6] mempolicy: add MPOL_F_RELATIVE_NODES flag David Rientjes
2008-02-26  6:12         ` Paul Jackson
2008-02-26  6:45           ` David Rientjes
2008-02-26 17:44         ` Paul Jackson
2008-02-26 21:17           ` David Rientjes
2008-02-26 21:30             ` Paul Jackson
2008-02-26 21:27           ` Lee Schermerhorn
2008-02-27  1:17         ` David Rientjes
2008-02-27  1:31           ` David Rientjes
2008-02-27  2:30           ` Paul Jackson
2008-02-27 15:37           ` Lee Schermerhorn
2008-02-27 17:09             ` Paul Jackson
2008-02-28 21:08             ` David Rientjes

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).