Fix argument checking in sched_setaffinity

Message ID m3zn4bidlx.fsf@averell.firstfloor.org
State New, archived
Series
  • Fix argument checking in sched_setaffinity

Commit Message

Andi Kleen Aug. 31, 2004, 2:30 p.m. UTC
This patch fixes the argument length checking in sched_setaffinity.

Previously it would error out when the length passed was
smaller than sizeof(cpumask_t), and any bits beyond cpumask_t
would be silently ignored.

First, this assumes that the user application knows the size
of cpumask_t, which should be kernel internal. When cpumask_t
is enlarged, old applications break, and there is no good way
for an application to find out the cpumask_t size the kernel
uses.

This patch changes it to do checking similar to the NUMA API calls:

- Any length is ok as long as all online CPUs are covered
(this could still cause application breakage with more CPUs,
but there is no good way around it).

- When the user passes more than sizeof(cpumask_t) bytes, the excess
bytes are checked to be zero.
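
For illustration, a minimal userspace sketch of what the relaxed check
permits (hypothetical example, not part of the patch; it invokes the raw
syscall so the kernel sees exactly the length passed, since C library
wrappers may adjust it):

	#include <stdio.h>
	#include <string.h>
	#include <unistd.h>
	#include <sys/syscall.h>

	int main(void)
	{
		/* deliberately larger than the kernel's cpumask_t */
		unsigned long mask[32];

		memset(mask, 0, sizeof(mask));	/* excess bytes must be zero */
		mask[0] = 1UL;			/* run only on CPU 0 */

		if (syscall(SYS_sched_setaffinity, 0, sizeof(mask), mask) < 0)
			perror("sched_setaffinity");
		return 0;
	}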




Comments

Paul Jackson Sept. 1, 2004, 1:36 a.m. UTC | #1
Looks good - thanks, Andi.

I notice that you didn't bother with the fractional byte that is handled
by 'endmask' in mm/mempolicy.c:get_nodes().  But I really don't give a
hoot - either way is fine by me.

I've written a couple of code snippets that manage to intuit the size of
the kernel's cpumask dynamically from user space, by probing with
various sched_getaffinity() calls.  But since your patch only changes
the errors generated by sched_setaffinity() [that's "set", not "get"], I
will not experience any grief from this subtle change in the kernel's
API.

Should you lock hotplug before calling get_user_cpu_mask(), since
get_user_cpu_mask() depends on cpu_online_mask()?
Anton Blanchard Sept. 1, 2004, 1:59 a.m. UTC | #2
> I notice that you didn't bother with the fractional byte that is handled
> by 'endmask' in mm/mempolicy.c:get_nodes().  But I really don't give a
> hoot - either way is fine by me.
> 
> I've written a couple of code snippets that manage to intuit the size of
> the kernel's cpumask dynamically from user space, by probing with
> various sched_getaffinity() calls.  But since your patch only changes
> the errors generated by sched_setaffinity() [that's "set", not "get"], I
> will not experience any grief from this subtle change in the kernel's
> API.
> 
> Should you lock hotplug before calling get_user_cpu_mask(), since
> get_user_cpu_mask() depends on cpu_online_mask()?

FYI the NUMA API and affinity code is broken on 64bit big endian. We
really need a get/set compat bitmap and use it. How does this look?
Not well tested yet...

Anton

diff -puN kernel/compat.c~compat_bitmap kernel/compat.c
--- gr_work/kernel/compat.c~compat_bitmap	2004-06-16 10:32:11.590272927 -0500
+++ gr_work-anton/kernel/compat.c	2004-06-16 10:32:11.607270238 -0500
@@ -561,3 +561,83 @@ long compat_clock_nanosleep(clockid_t wh
 
 /* timer_create is architecture specific because it needs sigevent conversion */
 
+long compat_get_bitmap(unsigned long *mask, compat_ulong_t __user *umask,
+		       unsigned long bitmap_size)
+{
+	int i, j;
+	unsigned long m;
+	compat_ulong_t um;
+	unsigned long nr_compat_longs;
+
+	/* align bitmap up to nearest compat_long_t boundary */
+	bitmap_size = ALIGN(bitmap_size, BITS_PER_COMPAT_LONG);
+
+	if (verify_area(VERIFY_READ, umask, bitmap_size / 8))
+		return -EFAULT;
+
+	nr_compat_longs = BITS_TO_COMPAT_LONGS(bitmap_size);
+
+	for (i = 0; i < BITS_TO_LONGS(bitmap_size); i++) {
+		m = 0;
+
+		for (j = 0; j < sizeof(m)/sizeof(um); j++) {
+			/*
+			 * We dont want to read past the end of the userspace
+			 * bitmap. We must however ensure the end of the
+			 * kernel bitmap is zeroed.
+			 */
+			if (nr_compat_longs-- > 0) {
+				if (__get_user(um, umask))
+					return -EFAULT;
+			} else {
+				um = 0;
+			}
+
+			umask++;
+			m |= (long)um << (j * BITS_PER_COMPAT_LONG);
+		}
+		*mask++ = m;
+	}
+
+	return 0;
+}
+
+long compat_put_bitmap(compat_ulong_t __user *umask, unsigned long *mask,
+		       unsigned long bitmap_size)
+{
+	int i, j;
+	unsigned long m;
+	compat_ulong_t um;
+	unsigned long nr_compat_longs;
+
+	/* align bitmap up to nearest compat_long_t boundary */
+	bitmap_size = ALIGN(bitmap_size, BITS_PER_COMPAT_LONG);
+
+	if (verify_area(VERIFY_WRITE, umask, bitmap_size / 8))
+		return -EFAULT;
+
+	nr_compat_longs = BITS_TO_COMPAT_LONGS(bitmap_size);
+
+	for (i = 0; i < BITS_TO_LONGS(bitmap_size); i++) {
+		m = *mask++;
+
+		for (j = 0; j < sizeof(m)/sizeof(um); j++) {
+			um = m;
+
+			/*
+			 * We dont want to write past the end of the userspace
+			 * bitmap.
+			 */
+			if (nr_compat_longs-- > 0) {
+				if (__put_user(um, umask))
+					return -EFAULT;
+			}
+
+			umask++;
+			m >>= 4*sizeof(um);
+			m >>= 4*sizeof(um);
+		}
+	}
+
+	return 0;
+}
diff -puN include/linux/compat.h~compat_bitmap include/linux/compat.h
--- gr_work/include/linux/compat.h~compat_bitmap	2004-06-16 10:32:11.595272136 -0500
+++ gr_work-anton/include/linux/compat.h	2004-06-16 10:32:11.608270080 -0500
@@ -130,5 +130,15 @@ asmlinkage long compat_sys_select(int n,
 		compat_ulong_t __user *outp, compat_ulong_t __user *exp,
 		struct compat_timeval __user *tvp);
 
+#define BITS_PER_COMPAT_LONG    (8*sizeof(compat_long_t))
+
+#define BITS_TO_COMPAT_LONGS(bits) \
+	(((bits)+BITS_PER_COMPAT_LONG-1)/BITS_PER_COMPAT_LONG)
+
+long compat_get_bitmap(unsigned long *mask, compat_ulong_t __user *umask,
+		       unsigned long bitmap_size);
+long compat_put_bitmap(compat_ulong_t __user *umask, unsigned long *mask,
+		       unsigned long bitmap_size);
+
 #endif /* CONFIG_COMPAT */
 #endif /* _LINUX_COMPAT_H */
Paul Jackson Sept. 2, 2004, 9:33 a.m. UTC | #3
Anton wrote:
> How does this look?

I haven't developed much of an eye yet for the compat routines - so this
looks ok to me, but such means little ;).

Hopefully someone else with a history here can take a look.  Or if you
whack me again, I might be able to examine it further.

Sorry ... thanks.
Andi Kleen Sept. 4, 2004, 1:37 p.m. UTC | #4
On Tue, Aug 31, 2004 at 06:36:55PM -0700, Paul Jackson wrote:
> Looks good - thanks, Andi.
> 
> I notice that you didn't bother with the fractional byte that is handled
> by 'endmask' in mm/mempolicy.c:get_nodes().  But I really don't give a
> hoot - either way is fine by me.

It is not needed because this function takes its length in bytes instead of bits.


> I've written a couple of code snippets that manage to intuit the size of
> the kernel's cpumask dynamically from user space, by probing with
> various sched_getaffinity() calls.  But since your patch only changes
> the errors generated by sched_setaffinity() [that's "set", not "get"], I
> will not experience any grief from this subtle change in the kernel's
> API.
> 
> Should you lock hotplug before calling get_user_cpu_mask(), since
> get_user_cpu_mask() depends on cpu_online_mask()?

Good point, yes, that is missing.

However Linus has already thrown the code out and replaced
it with something more broken :-/

-Andi
Andi Kleen Sept. 4, 2004, 1:40 p.m. UTC | #5
On Wed, Sep 01, 2004 at 11:59:22AM +1000, Anton Blanchard wrote:
>  
> > I notice that you didn't bother with the fractional byte that is handled
> > by 'endmask' in mm/mempolicy.c:get_nodes().  But I really don't give a
> > hoot - either way is fine by me.
> > 
> > I've written a couple of code snippets that manage to intuit the size of
> > the kernel's cpumask dynamically from user space, by probing with
> > various sched_getaffinity() calls.  But since your patch only changes
> > the errors generated by sched_setaffinity() [that's "set", not "get"], I
> > will not experience any grief from this subtle change in the kernel's
> > API.
> > 
> > Should you lock hotplug before calling get_user_cpu_mask(), since
> > get_user_cpu_mask() depends on cpu_online_mask()?
> 
> FYI the NUMA API and affinity code is broken on 64bit big endian. We
> really need a get/set compat bitmap and use it. How does this look?
> Not well tested yet...

Looks good from a quick review. But there is nothing that calls it?

-Andi
Linus Torvalds Sept. 5, 2004, 12:18 a.m. UTC | #6
On Sat, 4 Sep 2004, Paul Jackson wrote:
> 
> How is what Linus left more broken?

It's not. If anything, we should probably remove even more.

I don't see what the problem was with just requiring the right damn size.  
User mode can trivially get the size by asking for it. But if it can't be
bothered, then Andi's code certainly just made things worse.

		Linus
Paul Jackson Sept. 5, 2004, 1:05 a.m. UTC | #7
Linus wrote:
> It's not. If anything, we should probably remove even more.
>
> I don't see what the problem was with just requiring the right damn size.  
> User mode can trivially get the size by asking for it

I'll second that motion.  Match the size, or return -EINVAL.

My understanding of "asking for it" currently requires a loop in user
code to probe for the size that works.  But my user code already does
that, and the first thing I audit in any change to this kernel code is
that it does not break my sizing loop in user space.

I'd mildly prefer adding a kernel/user API for explicitly providing the
two values:

	sizeof(cpumask_t)
	sizeof(nodemask_t)

This might help reduce the unending confusion in the user and library
code sitting on top of us.

We could two phase this:
 1) add an obvious way to size these masks, and then
 2) six months later, require sizes to match in all these calls.

I for one could live with a full and sudden change over, no phasing.
But apparently my field exposure is more limited than Andi's is, at
this time.
Linus Torvalds Sept. 5, 2004, 1:38 a.m. UTC | #8
On Sat, 4 Sep 2004, Paul Jackson wrote:
> 
> My understanding of "asking for it" requires at present a user code
> loop, to probe for the size that works.

Yeah, or just make a frigging big area and ask the kernel for it ;)

Something like

	/* We just assume that 8k CPU's aren't going to happen */
	#define MAX_CPUMASK_BYTES (1024)

	void *cpumask = malloc(MAX_CPUMASK_BYTES);
	int real_size = sched_getaffinity(0, cpumask, MAX_CPUMASK_BYTES);

and no loop needed.

>				  But my user code already does
> that, and the first thing for which I audit any changes to this kernel
> code is not breaking my sizing loop code in user space.

I don't think you can reasonably use the "setaffinity()" call for sizing, 
since that historically just refused to use anything but the exact size. 
Sure, you could loop over every byte value known to man, but it's just a 
lot easier to do the "getaffinity" thing - if it fails, you can double the 
size of your buffer and try again. O(log(n)) rather than O(n) ;)

(And the "just start high enough" approach means that you can basically 
make it O(1) if you don't care about the theoretical possibility of a 
8k-CPU monster machine).
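
A sketch of that doubling probe (hypothetical code; it uses the raw
syscall, whose (pid, len, mask) form on these kernels returns the
kernel's cpumask size in bytes on success and -EINVAL when len is too
small - the glibc wrapper behaves differently):

	#include <errno.h>
	#include <stdlib.h>
	#include <unistd.h>
	#include <sys/syscall.h>

	static long kernel_cpumask_bytes(void)
	{
		unsigned long len = sizeof(unsigned long);

		for (;;) {
			unsigned long *buf = calloc(1, len);
			long ret = syscall(SYS_sched_getaffinity, 0, len, buf);

			free(buf);
			if (ret > 0)
				return ret;	/* kernel reported its mask size */
			if (errno != EINVAL)
				return -1;	/* some other failure, give up */
			len *= 2;		/* buffer too small: double and retry */
		}
	}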

> I'd mildly prefer adding a kernel/user API for explicitly providing the
> two values:
> 
> 	sizeof(cpumask_t)
> 	sizeof(nodemask_t)
> 
> This might help reduce the unending confusions in the user and library
> code sitting on top of us.

I don't know how to sanely expose the damn things. Maybe in the vsyscall 
page or something. Adding YAEAE (yet another ELF aux entry) could be done, 
of course.

> We could two phase this:
>  1) add an obvious way to size these masks, and then
>  2) six months later, require sizes to match in all these calls.

Well, historically we _have_ required sizes to match. You can pass in 
larger sizes to the "get" functions (and they'll tell you how much you 
got), but the "set" functions required the user to know exactly what the 
size was. Which is easy, see above.

		Linus
Paul Jackson Sept. 5, 2004, 3:48 a.m. UTC | #9
Linus wrote:
>	/* We just assume that 8k CPU's aren't going to happen */

SGI doesn't so assume ;).


> but it's just a lot easier to do the "getaffinity" thing - if it fails,
> you can double the size of your buffer and try again. O(log(n)) rather
> than O(n) ;)

I agree.  That's what my cpumask sizing loop does.

Well ... did.

Now it reads /sys/devices/system/node/node0/cpumap and computes the
size of the cpumask as an arithmetic function of the number of bytes
read (the ascii format uses 9 chars for each 32 bits of mask).
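
A sketch of that arithmetic (hypothetical helper; it assumes the cpumap
file prints each 32-bit word as 8 hex digits followed by a one-character
separator - a comma between words, a newline at the end - so 9 characters
per 32 bits):

	#include <stdio.h>

	static long cpumask_bytes_from_sysfs(void)
	{
		char buf[4096];
		size_t n;
		FILE *f = fopen("/sys/devices/system/node/node0/cpumap", "r");

		if (!f)
			return -1;
		n = fread(buf, 1, sizeof(buf), f);
		fclose(f);
		/* 9 ascii chars per 32 bits of mask -> 4 bytes of mask */
		return (long)(n / 9) * 4;
	}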

Either way works ...

My nodemask sizing code loops on get_mempolicy() calls of increasing
size, until they stop failing -EINVAL.


> Well, historically we _have_ required sizes to match.

I'm not sure what history you're looking at here, Linus.

Last week's sys_sched_setaffinity didn't seem to require a matching size,
only that the user size be >= the kernel size.  The kernel ignored the
extra user bits.

For nodemask_t, well let me just say the mbind/mempolicy calls are different.

If we want to go in the direction of requiring sizes to match in the
'set' calls, then instead of this week's changes to sys_sched_setaffinity
allowing user size < kernel size, shouldn't we be going the other way,
and tightening the check in kernel/sched.c:sys_sched_setaffinity(), from
what it was a week ago:

        if (len < sizeof(new_mask))
                return -EINVAL;

to:

        if (len != sizeof(new_mask))
                return -EINVAL;

Or at least revert last week's changes back to the '<' check?


> I don't know how to sanely expose the damn things

How about:

	$ cd /proc/sys/kernel
	$ head sizeof*
	==> sizeof_cpumask <==
	64

	==> sizeof_nodemask <==
	32
Linus Torvalds Sept. 5, 2004, 3:57 a.m. UTC | #10
On Sat, 4 Sep 2004, Paul Jackson wrote:
> 
> > Well, historically we _have_ required sizes to match.
> 
> I'm not sure what history you're looking at here, Linus.

I have my personal drugged-up history.

Take a toke, man.

IOW: You're obviously right.

> > I don't know how to sanely expose the damn things
> 
> How about:
> 
> 	$ cd /proc/sys/kernel
> 	$ head sizeof*
> 	==> sizeof_cpumask <==
> 	64
> 
> 	==> sizeof_nodemask <==
> 	32

Well, that's so much slower and not any more obvious than just doing the 
iterative few system calls that I don't really see the point other than 
from a scripting standpoint, but on the other hand I can't see how you'd 
use sched_setaffinity() and friends from within a script anyway, so ;)

(yes, there's perl syscalls, but then the standard "find the size" also 
works fine ;)

		Linus
Paul Jackson Sept. 5, 2004, 4:17 a.m. UTC | #11
> Take a toke, man.

Ahh ... much better ... thanks.

> Well, that's so much slower and not any more obvious than just doing the 
> iterative few system calls that I don't really see the point other than 

Perhaps no more obvious to you, but if you had to see the confusion
I'm seeing over on the user side, this /proc/sys/kernel/sizeof_cpumask
might be a win.

But if you want to take the position that it's not the kernel's job
to keep the user's head screwed on straight, I won't argue.

Besides, when you wrote "I don't know how to sanely expose the damn
things", I instinctively took that as a challenge to present a way.

==

I still like the position I thought you took for a moment there, of
tightening, not loosening, the preconditions on setaffinity, starting
with backing out the changes made to it this week.

Are you still thinking of doing that, or would you rather just let this
dog go back to sleep, as it lies now?
Paul Jackson Sept. 5, 2004, 4:52 a.m. UTC | #12
> starting with backing out the changes made to it this week.

Andi,

Given that Linus has gutted most of your patch to sched_setaffinity,
do you have a preference between where the code started the week,
and where it ended?

If I'm reading Linus' mind right (well ... there's a first time
for everything) then your preference, either way, would likely
carry the day.
Anton Blanchard Sept. 5, 2004, 2:27 p.m. UTC | #13
> Looks good from a quick review. But there is nothing that calls it?

Here's one :) Unfortunately we have to frob 32-bit userspace bitmaps into
64-bit ones on big endian platforms. This version does extra copies but
is simple, avoids set_fs tricks, and gets things working for me on ppc64.

Anton

diff -puN mm/mempolicy.c~numa_api mm/mempolicy.c
--- gr_work/mm/mempolicy.c~numa_api	2004-09-04 21:14:44.595414365 -0500
+++ gr_work-anton/mm/mempolicy.c	2004-09-05 09:12:18.899685327 -0500
@@ -525,20 +525,82 @@ asmlinkage long sys_get_mempolicy(int __
 }
 
 #ifdef CONFIG_COMPAT
-/* The other functions are compatible */
+
 asmlinkage long compat_get_mempolicy(int __user *policy,
-				  unsigned __user *nmask, unsigned  maxnode,
-				  unsigned addr, unsigned  flags)
+				     compat_ulong_t __user *nmask,
+				     compat_ulong_t maxnode,
+				     compat_ulong_t addr, compat_ulong_t flags)
 {
 	long err;
 	unsigned long __user *nm = NULL;
+	unsigned long nr_bits, alloc_size;
+	DECLARE_BITMAP(bm, MAX_NUMNODES);
+
+	nr_bits = min(maxnode-1, MAX_NUMNODES);
+	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
+
 	if (nmask)
-		nm = compat_alloc_user_space(ALIGN(maxnode-1, 64) / 8);
-	err = sys_get_mempolicy(policy, nm, maxnode, addr, flags);
-	if (!err && copy_in_user(nmask, nm, ALIGN(maxnode-1, 32)/8))
-		err = -EFAULT;
+		nm = compat_alloc_user_space(alloc_size);
+
+	err = sys_get_mempolicy(policy, nm, nr_bits+1, addr, flags);
+
+	if (!err && nmask) {
+		err = copy_from_user(bm, nm, alloc_size);
+		/* ensure entire bitmap is zeroed */
+		err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8);
+		err |= compat_put_bitmap(nmask, bm, nr_bits);
+	}
+
 	return err;
 }
+
+asmlinkage long compat_set_mempolicy(int mode, compat_ulong_t __user *nmask,
+				     compat_ulong_t maxnode)
+{
+	long err;
+	unsigned long __user *nm = NULL;
+	unsigned long nr_bits, alloc_size;
+	DECLARE_BITMAP(bm, MAX_NUMNODES);
+
+	nr_bits = min(maxnode-1, MAX_NUMNODES);
+	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
+
+	if (nmask) {
+		err = compat_get_bitmap(bm, nmask, nr_bits);
+		nm = compat_alloc_user_space(alloc_size);
+		err |= copy_to_user(nm, bm, alloc_size);
+	}
+
+	if (err)
+		return -EFAULT;
+
+	return sys_set_mempolicy(mode, nm, nr_bits+1);
+}
+
+asmlinkage long compat_mbind(compat_ulong_t start, compat_ulong_t len,
+			     compat_ulong_t mode, compat_ulong_t __user *nmask,
+			     compat_ulong_t maxnode, compat_ulong_t flags)
+{
+	long err;
+	unsigned long __user *nm = NULL;
+	unsigned long nr_bits, alloc_size;
+	DECLARE_BITMAP(bm, MAX_NUMNODES);
+
+	nr_bits = min(maxnode-1, MAX_NUMNODES);
+	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
+
+	if (nmask) {
+		err = compat_get_bitmap(bm, nmask, nr_bits);
+		nm = compat_alloc_user_space(alloc_size);
+		err |= copy_to_user(nm, bm, alloc_size);
+	}
+
+	if (err)
+		return -EFAULT;
+
+	return sys_mbind(start, len, mode, nm, nr_bits+1, flags);
+}
+
 #endif
 
 /* Return effective policy for a VMA */
@@ -900,7 +962,7 @@ mpol_shared_policy_lookup(struct shared_
 
 static void sp_delete(struct shared_policy *sp, struct sp_node *n)
 {
-	PDprintk("deleting %lx-l%x\n", n->start, n->end);
+	PDprintk("deleting %lx-%lx\n", n->start, n->end);
 	rb_erase(&n->nd, &sp->root);
 	mpol_free(n->policy);
 	kmem_cache_free(sn_cache, n);
Andi Kleen Sept. 6, 2004, 1:16 p.m. UTC | #14
On Sat, Sep 04, 2004 at 05:18:30PM -0700, Linus Torvalds wrote:
> 
> 
> On Sat, 4 Sep 2004, Paul Jackson wrote:
> > 
> > How is what Linus left more broken?
> 
> It's not. If anything, we should probably remove even more.
> 
> I don't see what the problem was with just requiring the right damn size.  
> User mode can trivially get the size by asking for it. But if it can't be

I don't think writing a syscall loop is a good idea for this.
The main reason is that when you get an EINVAL for some other
reason, you will still keep blowing up your buffer until you
hit some arbitrary upper size.

Currently this EINVAL is the only one this syscall returns,
but that may change in some future version.

A sysctl may have worked, but it results in a lot of code
bloat in the application to handle it.

> bothered, then Andi's code certainly just made things worse.

I disagree on that. It was not perfect, but with minor fixes it
could have been a proper solution. Your current code is even worse than
what was there before my patch.

The alternative would be the sysctl and the strict check again. I don't
like it too much because it makes the application more complicated
(I prefer simple interfaces, because complex interfaces tend to
have more bugs).

-Andi
Andi Kleen Sept. 6, 2004, 6:23 p.m. UTC | #15
On Sat, Sep 04, 2004 at 09:52:05PM -0700, Paul Jackson wrote:
> > starting with backing out the changes made to it this week.
> 
> Andi,
> 
> Given that Linus has gutted most of your patch to sched_setaffinity,
> do you have a preference between where the code started the week,
> and where it ended?
> 
> If I'm reading Linus' mind right (well ... there's a first time
> for everything) then your preference, either way, would likely
> carry the day.

The only change I would like to have is to check the excess bytes
to make sure they don't contain some random value. They should
be either all 0 or all 0xff. 

-Andi

Here's a patch for bk12: 

Linus, does this look better?

--------------------------------------------------------

For excess cpumask bits passed from user space ensure
they are all zero or all one.  This minimizes binary incompatibilities
when the kernel is recompiled with a bigger cpumask_t type.

diff -u linux-2.6.8/kernel/sched.c-o linux-2.6.8/kernel/sched.c
--- linux-2.6.8/kernel/sched.c-o	2004-09-06 20:06:58.000000000 +0200
+++ linux-2.6.8/kernel/sched.c	2004-09-06 20:16:33.940579241 +0200
@@ -3368,6 +3368,19 @@
 	if (len < sizeof(cpumask_t)) {
 		memset(new_mask, 0, sizeof(cpumask_t));
 	} else if (len > sizeof(cpumask_t)) {
+		unsigned i;
+		unsigned char val, initval;
+		if (len > PAGE_SIZE)
+			return -EINVAL;
+		/* excess bytes must be all 0 or all 0xff */
+		for (i = sizeof(cpumask_t); i < len; i++) { 
+			if (get_user(val, (char *)new_mask + i))
+				return -EFAULT; 
+			if (i == sizeof(cpumask_t))
+				initval = val;
+			if (!(val == 0 || val == 0xff) || val != initval)
+				return -EINVAL; 
+		} 
 		len = sizeof(cpumask_t);
 	}
 	return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0;
Linus Torvalds Sept. 6, 2004, 6:48 p.m. UTC | #16
On Mon, 6 Sep 2004, Andi Kleen wrote:
> 
> The only change I would like to have is to check the excess bytes
> to make sure they don't contain some random value. They should
> be either all 0 or all 0xff. 

I hate the "byte at a time" interface.

That said, I think the "long at a time" interface we have now for bitmaps 
ends up being a compatibility problem, where the compat layer has to worry 
about big-endian 32-bit "long" looking different from big-endian 64-bit 
"long".

So there are other issues here.

		Linus
Paul Jackson Sept. 6, 2004, 9:11 p.m. UTC | #17
Linus wrote:
> I hate the "byte at a time" interface.
> 
> That said, I think the "long at a time" interface we have now for bitmaps 
> ends up being a compatibility problem, where the compat layer has to worry 
> about big-endian 32-bit "long" lookign different from big-endian 64-bit 
> "long".

My first preference would be to get all the binary bitmap interfaces
(affinity, mbind and mempolicy) "right":

    I think that means an array of 'u32'.  This parallels what I did for
    the ascii format, where there was less need to remain compatible
    (except that ascii is naturally big-endian, while the u32 array has
    the low order word first):

      $ cat /sys/devices/system/node/node0/cpumap
      00000000,00000000,00000000,000000ff

    No doubt Andi will veto this for mbind/mempolicy, because it breaks
    the libnuma binaries he has in the field - a reasonable concern.  We'd
    probably have to burn another couple of system calls, introducing the
    new API while keeping the old one around, as is, for a year or three.

    And this (array of u32) is different from the kernel bitmap, due to
    the reversed u32 halves of each u64 on big-endian 64-bit arches.  If I
    were God, the kernel bitmap would also be an array of u32's, not ulongs.
    Still ... might as well start somewhere, and get the kernel/user API
    "right", even if the kernel internals have an irreparable twist.

    I agree with Andi that there should be an explicit way to get the
    correct size - the loops cause too many user-level code bugs, and
    trying to accommodate user code that doesn't know the exact size is
    causing our kernel code too much grief.

    Possible ways to publish cpumask/nodemask sizes include:

     1) # an ascii field in some proc file:

	    $ grep sizeof /proc/sys/kernel

     2) sysctl

     3) overload sched_getaffinity (for sizeof cpumask) and get_mempolicy
	(for sizeof nodemask) to return the sizes if passed zero lengths
	or NULL mask pointers or some other currently useless input.

My second preference would be what we had a week ago.  Minor tweaks
(especially ones that relax the preconditions) to busted API's do more
harm than good.  Leave 'em be, or get 'em "right".  Quit putting
lipstick on a pig.

I was surprised that Andi came up with yet another tweak to this API
(his suggested patch to allow either 0x00 or 0xff fill).  Surely Andi
doesn't need this for _his_ code, since he's competent to code to the
current API.  So I guess he's trying to make life easier for others.
Eh ... doesn't seem worth it.

Leave it be, I say.  Leave it be.  Or get it right.
Andi Kleen Sept. 7, 2004, 8:07 a.m. UTC | #18
On Mon, Sep 06, 2004 at 11:48:46AM -0700, Linus Torvalds wrote:
> 
> 
> On Mon, 6 Sep 2004, Andi Kleen wrote:
> > 
> > The only change I would like to have is to check the excess bytes
> > to make sure they don't contain some random value. They should
> > be either all 0 or all 0xff. 
> 
> I hate the "byte at a time" interface.

I looked at doing it, but it would be far too complicated for such a
simple operation, with the two loops needed to handle alignment and the
leftover bytes at the end, plus other fixup code.
And this should not really be performance critical in any way.
Long-at-a-time handling would be easy if the interface had been designed
in longs, but it wasn't.

> That said, I think the "long at a time" interface we have now for bitmaps 
> ends up being a compatibility problem, where the compat layer has to worry 
> about big-endian 32-bit "long" looking different from big-endian 64-bit 
> "long".
> 
> So there are other issues here.

In this special case it isn't - 0 and -1 look identical on both big
endian and little endian :-)

-Andi

Here's the byte at a time code again in case you change your mind.

--------------------------------------------------------------

Check that excess bytes passed by the user process to 
sched_setaffinity contain all 0 (no cpus) or all ones (all cpus)

diff -u linux-2.6.8/kernel/sched.c-o linux-2.6.8/kernel/sched.c
--- linux-2.6.8/kernel/sched.c-o	2004-09-06 20:06:58.000000000 +0200
+++ linux-2.6.8/kernel/sched.c	2004-09-06 20:16:33.940579241 +0200
@@ -3368,6 +3368,19 @@
 	if (len < sizeof(cpumask_t)) {
 		memset(new_mask, 0, sizeof(cpumask_t));
 	} else if (len > sizeof(cpumask_t)) {
+		unsigned i;
+		unsigned char val, initval;
+		if (len > PAGE_SIZE)
+			return -EINVAL;
+		/* excess bytes must be all 0 or all 0xff */
+		for (i = sizeof(cpumask_t); i < len; i++) { 
+			if (get_user(val, (char *)new_mask + i))
+				return -EFAULT; 
+			if (i == sizeof(cpumask_t))
+				initval = val;
+			if (!(val == 0 || val == 0xff) || val != initval)
+				return -EINVAL; 
+		} 
 		len = sizeof(cpumask_t);
 	}
 	return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0;



Patch

diff -u linux-2.6.8-work/kernel/sched.c-AFFINITY linux-2.6.8-work/kernel/sched.c
--- linux-2.6.8-work/kernel/sched.c-AFFINITY	2004-08-05 04:31:11.000000000 +0200
+++ linux-2.6.8-work/kernel/sched.c	2004-08-31 15:36:38.000000000 +0200
@@ -2891,6 +2891,34 @@ 
 	return retval;
 }
 
+static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len,
+			     cpumask_t *new_mask)
+{
+	if (len < sizeof(cpumask_t)) {
+		/* Smaller is ok as long as all online CPUs are covered */
+		int i, max = 0;
+		for_each_online_cpu(i) 
+			max = i; 
+		if (len < (max + 7)/8)
+			return -EINVAL;
+		memset(new_mask, 0, sizeof(cpumask_t)); 
+	} else if (len > sizeof(cpumask_t)) { 
+		/* Longer is ok as long as all high bits are 0 */
+		int i;
+		if (len > PAGE_SIZE)
+			return -EINVAL;
+		for (i = sizeof(cpumask_t); i < len; i++) { 
+			unsigned char val;
+			if (get_user(val, (unsigned char *)user_mask_ptr + i))
+				return -EFAULT; 
+			if (val)
+				return -EINVAL;
+		} 
+		len = sizeof(cpumask_t);			
+	}
+	return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0;
+}
+
 /**
  * sys_sched_setaffinity - set the cpu affinity of a process
  * @pid: pid of the process
@@ -2903,12 +2931,10 @@ 
 	cpumask_t new_mask;
 	int retval;
 	task_t *p;
-
-	if (len < sizeof(new_mask))
-		return -EINVAL;
-
-	if (copy_from_user(&new_mask, user_mask_ptr, sizeof(new_mask)))
-		return -EFAULT;
+	
+	retval = get_user_cpu_mask(user_mask_ptr, len, &new_mask);
+	if (retval)
+		return retval;
 
 	lock_cpu_hotplug();
 	read_lock(&tasklist_lock);