linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [RFC] syscalls, x86: Add __NR_kcmp syscall
@ 2012-01-17 14:27 Cyrill Gorcunov
  2012-01-17 14:38 ` Alexey Dobriyan
  0 siblings, 1 reply; 27+ messages in thread
From: Cyrill Gorcunov @ 2012-01-17 14:27 UTC (permalink / raw)
  To: LKML
  Cc: Eric W. Biederman, Pavel Emelyanov, Andrey Vagin, Ingo Molnar,
	H. Peter Anvin, Thomas Gleixner, Glauber Costa, Andi Kleen,
	Tejun Heo, Matt Helsley, Pekka Enberg, Eric Dumazet,
	Vasiliy Kulikov, Andrew Morton, Alexey Dobriyan,
	Valdis.Kletnieks

Hi all,

while general-object-id patch series were reviewed and commented
(see https://lkml.org/lkml/2012/1/11/220 the thread) it seems we have
ended up in conclusion that a syscall which do compare kernel members
internally and provide the result back to user space is a way more
secure than exporting (even strongly encrypted) kernel pointers in form
of some general IDs.

So here is a really early and draft version of how can it be done I think.
Please review and comment. I've tested it on x86-64 platform only for a while.
And it seems I missed to export header to user-space but it could be fixed
later once things are settle.

p.s. hope I didn't miss anyone in CC who were involved in previous discussion ;)

Thanks,
	Cyrill
---
syscalls, x86: Add __NR_kcmp syscall

While doing the checkpoint-restore in the userspace one need to determine
whether various kernel objects (like mm_struct-s of file_struct-s) are shared
between tasks and restore this state.

The 2nd step can be solved by using appropriate CLONE_ flags and the unshare
syscall, while there's currently no ways for solving the 1st one.

One of the ways for checking whether two tasks share e.g. mm_struct is to
provide some mm_struct ID of a task to its proc file, but showing such
info considered to be not that good for security reasons.

Thus after some debates we end up in conclusion that using that named
'comparision' syscall might be the best candidate. So here is it --
__NR_kcmp.

It takes up to 5 agruments - the pids of the two tasks (which
characteristics should be compared), the comparision type and
(in case of comparision of files) two file descriptors.

At moment only x86 is supported.

Signed-off-by: Cyrill Gorcunov <gorcunov@openvz.org>
CC: "Eric W. Biederman" <ebiederm@xmission.com>
CC: Pavel Emelyanov <xemul@parallels.com>
CC: Andrey Vagin <avagin@openvz.org>
CC: Ingo Molnar <mingo@elte.hu>
CC: H. Peter Anvin <hpa@zytor.com>
CC: Thomas Gleixner <tglx@linutronix.de>
CC: Glauber Costa <glommer@parallels.com>
CC: Andi Kleen <andi@firstfloor.org>
CC: Tejun Heo <tj@kernel.org>
CC: Matt Helsley <matthltc@us.ibm.com>
CC: Pekka Enberg <penberg@kernel.org>
CC: Eric Dumazet <eric.dumazet@gmail.com>
CC: Vasiliy Kulikov <segoon@openwall.com>
CC: Andrew Morton <akpm@linux-foundation.org>
CC: Alexey Dobriyan <adobriyan@gmail.com>
CC: Valdis.Kletnieks@vt.edu
---
 arch/x86/include/asm/kcmp.h        |   21 ++++++
 arch/x86/include/asm/syscalls.h    |    4 +
 arch/x86/include/asm/unistd_32.h   |    1 
 arch/x86/include/asm/unistd_64.h   |    2 
 arch/x86/kernel/Makefile           |    1 
 arch/x86/kernel/kcmp.c             |  127 +++++++++++++++++++++++++++++++++++++
 arch/x86/kernel/syscall_table_32.S |    1 
 7 files changed, 157 insertions(+)

Index: linux-2.6.git/arch/x86/include/asm/kcmp.h
===================================================================
--- /dev/null
+++ linux-2.6.git/arch/x86/include/asm/kcmp.h
@@ -0,0 +1,21 @@
+#ifndef _LINUX_KCMP_H
+#define _LINUX_KCMP_H
+
+/* Comparision type */
+enum {
+	KCMP_FILE,
+	KCMP_VM,
+	KCMP_FILES,
+	KCMP_FS,
+	KCMP_SIGHAND,
+	KCMP_IO,
+	KCMP_SYSVSEM,
+
+	KCMP_TYPES,
+};
+
+#define KCMP_EQ		0
+#define KCMP_LT		1
+#define KCMP_GT		2
+
+#endif /* _LINUX_KCMP_H */
Index: linux-2.6.git/arch/x86/include/asm/syscalls.h
===================================================================
--- linux-2.6.git.orig/arch/x86/include/asm/syscalls.h
+++ linux-2.6.git/arch/x86/include/asm/syscalls.h
@@ -42,6 +42,10 @@ long sys_sigaltstack(const stack_t __use
 asmlinkage int sys_set_thread_area(struct user_desc __user *);
 asmlinkage int sys_get_thread_area(struct user_desc __user *);
 
+/* kenrel/kcmp.c */
+asmlinkage long sys_kcmp(pid_t pid1, pid_t pid2, int type,
+			 unsigned long idx1, unsigned long idx2);
+
 /* X86_32 only */
 #ifdef CONFIG_X86_32
 
Index: linux-2.6.git/arch/x86/include/asm/unistd_32.h
===================================================================
--- linux-2.6.git.orig/arch/x86/include/asm/unistd_32.h
+++ linux-2.6.git/arch/x86/include/asm/unistd_32.h
@@ -354,6 +354,7 @@
 #define __NR_setns		346
 #define __NR_process_vm_readv	347
 #define __NR_process_vm_writev	348
+#define __NR_kcmp		349
 
 #ifdef __KERNEL__
 
Index: linux-2.6.git/arch/x86/include/asm/unistd_64.h
===================================================================
--- linux-2.6.git.orig/arch/x86/include/asm/unistd_64.h
+++ linux-2.6.git/arch/x86/include/asm/unistd_64.h
@@ -686,6 +686,8 @@ __SYSCALL(__NR_getcpu, sys_getcpu)
 __SYSCALL(__NR_process_vm_readv, sys_process_vm_readv)
 #define __NR_process_vm_writev			311
 __SYSCALL(__NR_process_vm_writev, sys_process_vm_writev)
+#define __NR_kcmp				312
+__SYSCALL(__NR_kcmp, sys_kcmp)
 
 #ifndef __NO_STUBS
 #define __ARCH_WANT_OLD_READDIR
Index: linux-2.6.git/arch/x86/kernel/Makefile
===================================================================
--- linux-2.6.git.orig/arch/x86/kernel/Makefile
+++ linux-2.6.git/arch/x86/kernel/Makefile
@@ -33,6 +33,7 @@ obj-y			+= alternative.o i8253.o pci-nom
 obj-y			+= tsc.o io_delay.o rtc.o
 obj-y			+= pci-iommu_table.o
 obj-y			+= resource.o
+obj-y			+= kcmp.o
 
 obj-y				+= trampoline.o trampoline_$(BITS).o
 obj-y				+= process.o
Index: linux-2.6.git/arch/x86/kernel/kcmp.c
===================================================================
--- /dev/null
+++ linux-2.6.git/arch/x86/kernel/kcmp.c
@@ -0,0 +1,127 @@
+#include <linux/kernel.h>
+#include <linux/syscalls.h>
+#include <linux/fdtable.h>
+#include <linux/string.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/cache.h>
+#include <linux/bug.h>
+#include <linux/err.h>
+
+#include <linux/syscalls.h>
+#include <asm/unistd.h>
+#include <asm/kcmp.h>
+
+#define KCMP_PTR(ptr1, ptr2)						\
+({									\
+	long ___r = (long)ptr1 - (long)ptr2;				\
+	(___r == 0 ? KCMP_EQ : (___r < 0 ? KCMP_LT : KCMP_GT));		\
+})
+
+#define KCMP_TASK_PTR(task1, task2, member)				\
+	KCMP_PTR((task1)->member, (task2)->member)
+
+/* A caller must be sure the task is presented in memory */
+static struct file *
+get_file_raw_ptr(struct task_struct *task, unsigned int idx)
+{
+	struct fdtable *fdt;
+	struct file *file;
+
+	spin_lock(&task->files->file_lock);
+	fdt = files_fdtable(task->files);
+	if (idx < fdt->max_fds)
+		file = fdt->fd[idx];
+	else
+		file = NULL;
+	spin_unlock(&task->files->file_lock);
+
+	return file;
+}
+
+SYSCALL_DEFINE5(kcmp, pid_t, pid1, pid_t, pid2, int, type,
+		unsigned long, idx1, unsigned long, idx2)
+{
+	struct task_struct *task1;
+	struct task_struct *task2;
+	int ret = 0;
+
+	rcu_read_lock();
+
+	task1 = find_task_by_vpid(pid1);
+	if (!task1) {
+		rcu_read_unlock();
+		return -ESRCH;
+	}
+
+	task2 = find_task_by_vpid(pid2);
+	if (!task2) {
+		put_task_struct(task1);
+		rcu_read_unlock();
+		return -ESRCH;
+	}
+
+	get_task_struct(task1);
+	get_task_struct(task2);
+
+	rcu_read_unlock();
+
+	if (!ptrace_may_access(task1, PTRACE_MODE_READ) ||
+	    !ptrace_may_access(task2, PTRACE_MODE_READ)) {
+		ret = -EACCES;
+		goto err;
+	}
+
+	/*
+	 * Note for all cases but the KCMP_FILE we
+	 * don't take any locks and do a plain pointer
+	 * comparision in a sake of speed.
+	 */
+
+	switch (type) {
+	case KCMP_FILE: {
+		struct file *filp1, *filp2;
+
+		filp1 = get_file_raw_ptr(task1, idx1);
+		filp2 = get_file_raw_ptr(task2, idx2);
+
+		if (filp1 && filp2)
+			ret = KCMP_PTR(filp1, filp2);
+		else
+			ret = -ENOENT;
+		break;
+	}
+	case KCMP_VM:
+		ret = KCMP_TASK_PTR(task1, task2, mm);
+		break;
+	case KCMP_FILES:
+		ret = KCMP_TASK_PTR(task1, task2, files);
+		break;
+	case KCMP_FS:
+		ret = KCMP_TASK_PTR(task1, task2, fs);
+		break;
+	case KCMP_SIGHAND:
+		ret = KCMP_TASK_PTR(task1, task2, sighand);
+		break;
+	case KCMP_IO:
+		ret = KCMP_TASK_PTR(task1, task2, io_context);
+		break;
+	case KCMP_SYSVSEM:
+#ifdef CONFIG_SYSVIPC
+		ret = KCMP_TASK_PTR(task1, task2, sysvsem.undo_list);
+#else
+		ret = -ENOENT;
+		goto err;
+#endif
+		break;
+	default:
+		ret = -EINVAL;
+		goto err;
+	}
+
+err:
+	put_task_struct(task1);
+	put_task_struct(task2);
+
+	return ret;
+}
Index: linux-2.6.git/arch/x86/kernel/syscall_table_32.S
===================================================================
--- linux-2.6.git.orig/arch/x86/kernel/syscall_table_32.S
+++ linux-2.6.git/arch/x86/kernel/syscall_table_32.S
@@ -348,3 +348,4 @@ ENTRY(sys_call_table)
 	.long sys_setns
 	.long sys_process_vm_readv
 	.long sys_process_vm_writev
+	.long sys_kcmp

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [RFC] syscalls, x86: Add __NR_kcmp syscall
  2012-01-17 14:27 [RFC] syscalls, x86: Add __NR_kcmp syscall Cyrill Gorcunov
@ 2012-01-17 14:38 ` Alexey Dobriyan
  2012-01-17 14:44   ` Cyrill Gorcunov
  0 siblings, 1 reply; 27+ messages in thread
From: Alexey Dobriyan @ 2012-01-17 14:38 UTC (permalink / raw)
  To: Cyrill Gorcunov
  Cc: LKML, Eric W. Biederman, Pavel Emelyanov, Andrey Vagin,
	Ingo Molnar, H. Peter Anvin, Thomas Gleixner, Glauber Costa,
	Andi Kleen, Tejun Heo, Matt Helsley, Pekka Enberg, Eric Dumazet,
	Vasiliy Kulikov, Andrew Morton, Valdis.Kletnieks

On 1/17/12, Cyrill Gorcunov <gorcunov@gmail.com> wrote:
> +#define KCMP_EQ		0
> +#define KCMP_LT		1
> +#define KCMP_GT		2

LT and GT are meaningless.

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [RFC] syscalls, x86: Add __NR_kcmp syscall
  2012-01-17 14:38 ` Alexey Dobriyan
@ 2012-01-17 14:44   ` Cyrill Gorcunov
  2012-01-17 18:47     ` H. Peter Anvin
  0 siblings, 1 reply; 27+ messages in thread
From: Cyrill Gorcunov @ 2012-01-17 14:44 UTC (permalink / raw)
  To: Alexey Dobriyan
  Cc: LKML, Eric W. Biederman, Pavel Emelyanov, Andrey Vagin,
	Ingo Molnar, H. Peter Anvin, Thomas Gleixner, Glauber Costa,
	Andi Kleen, Tejun Heo, Matt Helsley, Pekka Enberg, Eric Dumazet,
	Vasiliy Kulikov, Andrew Morton, Valdis.Kletnieks

On Tue, Jan 17, 2012 at 04:38:14PM +0200, Alexey Dobriyan wrote:
> On 1/17/12, Cyrill Gorcunov <gorcunov@gmail.com> wrote:
> > +#define KCMP_EQ		0
> > +#define KCMP_LT		1
> > +#define KCMP_GT		2
> 
> LT and GT are meaningless.
> 

I found symbolic names better than open-coded values. But sure,
if this is problem it could be dropped.

Or you mean that in general anything but 'equal' is useless?

	Cyrill

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [RFC] syscalls, x86: Add __NR_kcmp syscall
  2012-01-17 14:44   ` Cyrill Gorcunov
@ 2012-01-17 18:47     ` H. Peter Anvin
  2012-01-17 21:15       ` Cyrill Gorcunov
  2012-01-17 21:35       ` Eric W. Biederman
  0 siblings, 2 replies; 27+ messages in thread
From: H. Peter Anvin @ 2012-01-17 18:47 UTC (permalink / raw)
  To: Cyrill Gorcunov
  Cc: Alexey Dobriyan, LKML, Eric W. Biederman, Pavel Emelyanov,
	Andrey Vagin, Ingo Molnar, Thomas Gleixner, Glauber Costa,
	Andi Kleen, Tejun Heo, Matt Helsley, Pekka Enberg, Eric Dumazet,
	Vasiliy Kulikov, Andrew Morton, Valdis.Kletnieks

On 01/17/2012 06:44 AM, Cyrill Gorcunov wrote:
> On Tue, Jan 17, 2012 at 04:38:14PM +0200, Alexey Dobriyan wrote:
>> On 1/17/12, Cyrill Gorcunov <gorcunov@gmail.com> wrote:
>>> +#define KCMP_EQ		0
>>> +#define KCMP_LT		1
>>> +#define KCMP_GT		2
>>
>> LT and GT are meaningless.
>>
> 
> I found symbolic names better than open-coded values. But sure,
> if this is problem it could be dropped.
> 
> Or you mean that in general anything but 'equal' is useless?
> 

Why on Earth would user space need to know which order in memory certain
kernel objects are?

Keep in mind that this is *exactly* the kind of information which makes
rootkits easier.

	-hpa

-- 
H. Peter Anvin, Intel Open Source Technology Center
I work for Intel.  I don't speak on their behalf.


^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [RFC] syscalls, x86: Add __NR_kcmp syscall
  2012-01-17 18:47     ` H. Peter Anvin
@ 2012-01-17 21:15       ` Cyrill Gorcunov
  2012-01-17 21:40         ` Eric W. Biederman
  2012-01-17 21:35       ` Eric W. Biederman
  1 sibling, 1 reply; 27+ messages in thread
From: Cyrill Gorcunov @ 2012-01-17 21:15 UTC (permalink / raw)
  To: H. Peter Anvin
  Cc: Alexey Dobriyan, LKML, Eric W. Biederman, Pavel Emelyanov,
	Andrey Vagin, Ingo Molnar, Thomas Gleixner, Glauber Costa,
	Andi Kleen, Tejun Heo, Matt Helsley, Pekka Enberg, Eric Dumazet,
	Vasiliy Kulikov, Andrew Morton, Valdis.Kletnieks

On Tue, Jan 17, 2012 at 10:47:37AM -0800, H. Peter Anvin wrote:
> On 01/17/2012 06:44 AM, Cyrill Gorcunov wrote:
> > On Tue, Jan 17, 2012 at 04:38:14PM +0200, Alexey Dobriyan wrote:
> >> On 1/17/12, Cyrill Gorcunov <gorcunov@gmail.com> wrote:
> >>> +#define KCMP_EQ		0
> >>> +#define KCMP_LT		1
> >>> +#define KCMP_GT		2
> >>
> >> LT and GT are meaningless.
> >>
> > 
> > I found symbolic names better than open-coded values. But sure,
> > if this is problem it could be dropped.
> > 
> > Or you mean that in general anything but 'equal' is useless?
> > 
> 
> Why on Earth would user space need to know which order in memory certain
> kernel objects are?
> 
> Keep in mind that this is *exactly* the kind of information which makes
> rootkits easier.
> 

Hmm, indeed this might help narrow down the target address I fear. So
after some conversation with Pavel I think we can try to live with just
one result -- is objects are at same location in kernel memory or not.
The updated version is below. Please review if you get a chance. Thanks
a lot for comments!

	Cyrill
---
From: Cyrill Gorcunov <gorcunov@openvz.org>
Subject: [RFC] syscalls, x86: Add __NR_kcmp syscall v2

While doing the checkpoint-restore in the userspace one need to determine
whether various kernel objects (like mm_struct-s of file_struct-s) are shared
between tasks and restore this state.

The 2nd step can be solved by using appropriate CLONE_ flags and the unshare
syscall, while there's currently no ways for solving the 1st one.

One of the ways for checking whether two tasks share e.g. mm_struct is to
provide some mm_struct ID of a task to its proc file, but showing such
info considered to be not that good for security reasons.

Thus after some debates we end up in conclusion that using that named
'comparision' syscall might be the best candidate. So here is it --
__NR_kcmp.

It takes up to 5 agruments - the pids of the two tasks (which
characteristics should be compared), the comparision type and
(in case of comparision of files) two file descriptors.

At moment only x86 is supported.

v2: The result of syscall is restricted to only two variants:
    either objects match and lay at same memory address (0),
    either not and (1) is returned.

Signed-off-by: Cyrill Gorcunov <gorcunov@openvz.org>
CC: "Eric W. Biederman" <ebiederm@xmission.com>
CC: Pavel Emelyanov <xemul@parallels.com>
CC: Andrey Vagin <avagin@openvz.org>
CC: Ingo Molnar <mingo@elte.hu>
CC: H. Peter Anvin <hpa@zytor.com>
CC: Thomas Gleixner <tglx@linutronix.de>
CC: Glauber Costa <glommer@parallels.com>
CC: Andi Kleen <andi@firstfloor.org>
CC: Tejun Heo <tj@kernel.org>
CC: Matt Helsley <matthltc@us.ibm.com>
CC: Pekka Enberg <penberg@kernel.org>
CC: Eric Dumazet <eric.dumazet@gmail.com>
CC: Vasiliy Kulikov <segoon@openwall.com>
CC: Andrew Morton <akpm@linux-foundation.org>
CC: Alexey Dobriyan <adobriyan@gmail.com>
CC: Valdis.Kletnieks@vt.edu
---
 arch/x86/include/asm/kcmp.h        |   20 ++++++
 arch/x86/include/asm/syscalls.h    |    4 +
 arch/x86/include/asm/unistd_32.h   |    1 
 arch/x86/include/asm/unistd_64.h   |    2 
 arch/x86/kernel/Makefile           |    1 
 arch/x86/kernel/kcmp.c             |  121 +++++++++++++++++++++++++++++++++++++
 arch/x86/kernel/syscall_table_32.S |    1 
 7 files changed, 150 insertions(+)

Index: linux-2.6.git/arch/x86/include/asm/kcmp.h
===================================================================
--- /dev/null
+++ linux-2.6.git/arch/x86/include/asm/kcmp.h
@@ -0,0 +1,20 @@
+#ifndef _LINUX_KCMP_H
+#define _LINUX_KCMP_H
+
+/* Comparision type */
+enum {
+	KCMP_FILE,
+	KCMP_VM,
+	KCMP_FILES,
+	KCMP_FS,
+	KCMP_SIGHAND,
+	KCMP_IO,
+	KCMP_SYSVSEM,
+
+	KCMP_TYPES,
+};
+
+#define KCMP_EQ		0
+#define KCMP_NE		1
+
+#endif /* _LINUX_KCMP_H */
Index: linux-2.6.git/arch/x86/include/asm/syscalls.h
===================================================================
--- linux-2.6.git.orig/arch/x86/include/asm/syscalls.h
+++ linux-2.6.git/arch/x86/include/asm/syscalls.h
@@ -42,6 +42,10 @@ long sys_sigaltstack(const stack_t __use
 asmlinkage int sys_set_thread_area(struct user_desc __user *);
 asmlinkage int sys_get_thread_area(struct user_desc __user *);
 
+/* kenrel/kcmp.c */
+asmlinkage long sys_kcmp(pid_t pid1, pid_t pid2, int type,
+			 unsigned long idx1, unsigned long idx2);
+
 /* X86_32 only */
 #ifdef CONFIG_X86_32
 
Index: linux-2.6.git/arch/x86/include/asm/unistd_32.h
===================================================================
--- linux-2.6.git.orig/arch/x86/include/asm/unistd_32.h
+++ linux-2.6.git/arch/x86/include/asm/unistd_32.h
@@ -354,6 +354,7 @@
 #define __NR_setns		346
 #define __NR_process_vm_readv	347
 #define __NR_process_vm_writev	348
+#define __NR_kcmp		349
 
 #ifdef __KERNEL__
 
Index: linux-2.6.git/arch/x86/include/asm/unistd_64.h
===================================================================
--- linux-2.6.git.orig/arch/x86/include/asm/unistd_64.h
+++ linux-2.6.git/arch/x86/include/asm/unistd_64.h
@@ -686,6 +686,8 @@ __SYSCALL(__NR_getcpu, sys_getcpu)
 __SYSCALL(__NR_process_vm_readv, sys_process_vm_readv)
 #define __NR_process_vm_writev			311
 __SYSCALL(__NR_process_vm_writev, sys_process_vm_writev)
+#define __NR_kcmp				312
+__SYSCALL(__NR_kcmp, sys_kcmp)
 
 #ifndef __NO_STUBS
 #define __ARCH_WANT_OLD_READDIR
Index: linux-2.6.git/arch/x86/kernel/Makefile
===================================================================
--- linux-2.6.git.orig/arch/x86/kernel/Makefile
+++ linux-2.6.git/arch/x86/kernel/Makefile
@@ -33,6 +33,7 @@ obj-y			+= alternative.o i8253.o pci-nom
 obj-y			+= tsc.o io_delay.o rtc.o
 obj-y			+= pci-iommu_table.o
 obj-y			+= resource.o
+obj-y			+= kcmp.o
 
 obj-y				+= trampoline.o trampoline_$(BITS).o
 obj-y				+= process.o
Index: linux-2.6.git/arch/x86/kernel/kcmp.c
===================================================================
--- /dev/null
+++ linux-2.6.git/arch/x86/kernel/kcmp.c
@@ -0,0 +1,121 @@
+#include <linux/kernel.h>
+#include <linux/syscalls.h>
+#include <linux/fdtable.h>
+#include <linux/err.h>
+
+#include <asm/unistd.h>
+#include <asm/kcmp.h>
+
+#define KCMP_PTR(ptr1, ptr2)						\
+({									\
+	long ___r = (long)ptr1 - (long)ptr2;				\
+	(___r == 0 ? KCMP_EQ : KCMP_NE);				\
+})
+
+#define KCMP_TASK_PTR(task1, task2, member)				\
+	KCMP_PTR((task1)->member, (task2)->member)
+
+/* A caller must be sure the task is presented in memory */
+static struct file *
+get_file_raw_ptr(struct task_struct *task, unsigned int idx)
+{
+	struct fdtable *fdt;
+	struct file *file;
+
+	spin_lock(&task->files->file_lock);
+	fdt = files_fdtable(task->files);
+	if (idx < fdt->max_fds)
+		file = fdt->fd[idx];
+	else
+		file = NULL;
+	spin_unlock(&task->files->file_lock);
+
+	return file;
+}
+
+SYSCALL_DEFINE5(kcmp, pid_t, pid1, pid_t, pid2, int, type,
+		unsigned long, idx1, unsigned long, idx2)
+{
+	struct task_struct *task1;
+	struct task_struct *task2;
+	int ret = 0;
+
+	rcu_read_lock();
+
+	task1 = find_task_by_vpid(pid1);
+	if (!task1) {
+		rcu_read_unlock();
+		return -ESRCH;
+	}
+
+	task2 = find_task_by_vpid(pid2);
+	if (!task2) {
+		put_task_struct(task1);
+		rcu_read_unlock();
+		return -ESRCH;
+	}
+
+	get_task_struct(task1);
+	get_task_struct(task2);
+
+	rcu_read_unlock();
+
+	if (!ptrace_may_access(task1, PTRACE_MODE_READ) ||
+	    !ptrace_may_access(task2, PTRACE_MODE_READ)) {
+		ret = -EACCES;
+		goto err;
+	}
+
+	/*
+	 * Note for all cases but the KCMP_FILE we
+	 * don't take any locks and do a plain pointer
+	 * comparision in a sake of speed.
+	 */
+
+	switch (type) {
+	case KCMP_FILE: {
+		struct file *filp1, *filp2;
+
+		filp1 = get_file_raw_ptr(task1, idx1);
+		filp2 = get_file_raw_ptr(task2, idx2);
+
+		if (filp1 && filp2)
+			ret = KCMP_PTR(filp1, filp2);
+		else
+			ret = -ENOENT;
+		break;
+	}
+	case KCMP_VM:
+		ret = KCMP_TASK_PTR(task1, task2, mm);
+		break;
+	case KCMP_FILES:
+		ret = KCMP_TASK_PTR(task1, task2, files);
+		break;
+	case KCMP_FS:
+		ret = KCMP_TASK_PTR(task1, task2, fs);
+		break;
+	case KCMP_SIGHAND:
+		ret = KCMP_TASK_PTR(task1, task2, sighand);
+		break;
+	case KCMP_IO:
+		ret = KCMP_TASK_PTR(task1, task2, io_context);
+		break;
+	case KCMP_SYSVSEM:
+#ifdef CONFIG_SYSVIPC
+		ret = KCMP_TASK_PTR(task1, task2, sysvsem.undo_list);
+#else
+		ret = -ENOENT;
+		goto err;
+#endif
+		break;
+	default:
+		ret = -EINVAL;
+		goto err;
+	}
+
+err:
+	put_task_struct(task1);
+	put_task_struct(task2);
+
+	return ret;
+}
Index: linux-2.6.git/arch/x86/kernel/syscall_table_32.S
===================================================================
--- linux-2.6.git.orig/arch/x86/kernel/syscall_table_32.S
+++ linux-2.6.git/arch/x86/kernel/syscall_table_32.S
@@ -348,3 +348,4 @@ ENTRY(sys_call_table)
 	.long sys_setns
 	.long sys_process_vm_readv
 	.long sys_process_vm_writev
+	.long sys_kcmp

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [RFC] syscalls, x86: Add __NR_kcmp syscall
  2012-01-17 18:47     ` H. Peter Anvin
  2012-01-17 21:15       ` Cyrill Gorcunov
@ 2012-01-17 21:35       ` Eric W. Biederman
  2012-01-18  8:01         ` Cyrill Gorcunov
  2012-01-18 22:05         ` david
  1 sibling, 2 replies; 27+ messages in thread
From: Eric W. Biederman @ 2012-01-17 21:35 UTC (permalink / raw)
  To: H. Peter Anvin
  Cc: Cyrill Gorcunov, Alexey Dobriyan, LKML, Pavel Emelyanov,
	Andrey Vagin, Ingo Molnar, Thomas Gleixner, Glauber Costa,
	Andi Kleen, Tejun Heo, Matt Helsley, Pekka Enberg, Eric Dumazet,
	Vasiliy Kulikov, Andrew Morton, Valdis.Kletnieks

"H. Peter Anvin" <hpa@zytor.com> writes:

> On 01/17/2012 06:44 AM, Cyrill Gorcunov wrote:
>> On Tue, Jan 17, 2012 at 04:38:14PM +0200, Alexey Dobriyan wrote:
>>> On 1/17/12, Cyrill Gorcunov <gorcunov@gmail.com> wrote:
>>>> +#define KCMP_EQ		0
>>>> +#define KCMP_LT		1
>>>> +#define KCMP_GT		2
>>>
>>> LT and GT are meaningless.
>>>
>> 
>> I found symbolic names better than open-coded values. But sure,
>> if this is problem it could be dropped.
>> 
>> Or you mean that in general anything but 'equal' is useless?
>> 
>
> Why on Earth would user space need to know which order in memory certain
> kernel objects are?

For checkpoint restart and for some other kinds of introspection what is
needed is a comparison function to see if two processes share the same
object.  The most interesting of these objects from a checkpoint restart case
are file descriptors, and there can be a lot of file descriptors.

The order in memory does not matter.  What does matter is that the
comparison function return some ordering between objects.  The algorithm
for figuring out of N items which of them are duplicates is O(N^2) if
the comparison function can only return equal or not equal.  The
algorithm for finding duplications is only O(NlogN) if the comparison
function will return an ordering among the objects.

> Keep in mind that this is *exactly* the kind of information which makes
> rootkits easier.

I would be very surprised if basic in memory ordering information was
not already available from simple creation ordering.

If using the in memory ordering is a problem in practice there are a lot
of other possible ways to order the kernel objects.  Allocating sequence
numbers for the kernel objects, passing the pointers through a
cryptographically secure hash before comparing them, etc.

It does look like Cyrill's patch description lacked the important bit of
information about the algorithm complexity requiring an ordering among
kernel objects.  Cyrill you probably want to describe more prominently
what is happening now and why in your patch description rather than give
the history of different approaches.

Eric

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [RFC] syscalls, x86: Add __NR_kcmp syscall
  2012-01-17 21:15       ` Cyrill Gorcunov
@ 2012-01-17 21:40         ` Eric W. Biederman
  2012-01-18  5:07           ` Pavel Emelyanov
  0 siblings, 1 reply; 27+ messages in thread
From: Eric W. Biederman @ 2012-01-17 21:40 UTC (permalink / raw)
  To: Cyrill Gorcunov
  Cc: H. Peter Anvin, Alexey Dobriyan, LKML, Pavel Emelyanov,
	Andrey Vagin, Ingo Molnar, Thomas Gleixner, Glauber Costa,
	Andi Kleen, Tejun Heo, Matt Helsley, Pekka Enberg, Eric Dumazet,
	Vasiliy Kulikov, Andrew Morton, Valdis.Kletnieks

Cyrill Gorcunov <gorcunov@gmail.com> writes:

> On Tue, Jan 17, 2012 at 10:47:37AM -0800, H. Peter Anvin wrote:
>> On 01/17/2012 06:44 AM, Cyrill Gorcunov wrote:
>> > On Tue, Jan 17, 2012 at 04:38:14PM +0200, Alexey Dobriyan wrote:
>> >> On 1/17/12, Cyrill Gorcunov <gorcunov@gmail.com> wrote:
>> >>> +#define KCMP_EQ		0
>> >>> +#define KCMP_LT		1
>> >>> +#define KCMP_GT		2
>> >>
>> >> LT and GT are meaningless.
>> >>
>> > 
>> > I found symbolic names better than open-coded values. But sure,
>> > if this is problem it could be dropped.
>> > 
>> > Or you mean that in general anything but 'equal' is useless?
>> > 
>> 
>> Why on Earth would user space need to know which order in memory certain
>> kernel objects are?
>> 
>> Keep in mind that this is *exactly* the kind of information which makes
>> rootkits easier.
>> 
>
> Hmm, indeed this might help narrow down the target address I fear. So
> after some conversation with Pavel I think we can try to live with just
> one result -- is objects are at same location in kernel memory or not.
> The updated version is below. Please review if you get a chance. Thanks
> a lot for comments!

Seriously?

Or is this a case where you get something in then when people start
seriously using it and the performance is sucks badly you go back to
something like the current system call?

How are you going to ensure the performance does not degrade badly when
looking across a large number of processes?

Eric

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [RFC] syscalls, x86: Add __NR_kcmp syscall
  2012-01-17 21:40         ` Eric W. Biederman
@ 2012-01-18  5:07           ` Pavel Emelyanov
  0 siblings, 0 replies; 27+ messages in thread
From: Pavel Emelyanov @ 2012-01-18  5:07 UTC (permalink / raw)
  To: Eric W. Biederman
  Cc: Cyrill Gorcunov, H. Peter Anvin, Alexey Dobriyan, LKML,
	Andrey Vagin, Ingo Molnar, Thomas Gleixner, Glauber Costa,
	Andi Kleen, Tejun Heo, Matt Helsley, Pekka Enberg, Eric Dumazet,
	Vasiliy Kulikov, Andrew Morton, Valdis.Kletnieks

On 01/18/2012 01:40 AM, Eric W. Biederman wrote:
> Cyrill Gorcunov <gorcunov@gmail.com> writes:
> 
>> On Tue, Jan 17, 2012 at 10:47:37AM -0800, H. Peter Anvin wrote:
>>> On 01/17/2012 06:44 AM, Cyrill Gorcunov wrote:
>>>> On Tue, Jan 17, 2012 at 04:38:14PM +0200, Alexey Dobriyan wrote:
>>>>> On 1/17/12, Cyrill Gorcunov <gorcunov@gmail.com> wrote:
>>>>>> +#define KCMP_EQ		0
>>>>>> +#define KCMP_LT		1
>>>>>> +#define KCMP_GT		2
>>>>>
>>>>> LT and GT are meaningless.
>>>>>
>>>>
>>>> I found symbolic names better than open-coded values. But sure,
>>>> if this is problem it could be dropped.
>>>>
>>>> Or you mean that in general anything but 'equal' is useless?
>>>>
>>>
>>> Why on Earth would user space need to know which order in memory certain
>>> kernel objects are?
>>>
>>> Keep in mind that this is *exactly* the kind of information which makes
>>> rootkits easier.
>>>
>>
>> Hmm, indeed this might help narrow down the target address I fear. So
>> after some conversation with Pavel I think we can try to live with just
>> one result -- is objects are at same location in kernel memory or not.
>> The updated version is below. Please review if you get a chance. Thanks
>> a lot for comments!
> 
> Seriously?
> 
> Or is this a case where you get something in then when people start
> seriously using it and the performance is sucks badly you go back to
> something like the current system call?
> 
> How are you going to ensure the performance does not degrade badly when
> looking across a large number of processes?

We can compare the e.g. files' target inodes (ino + dev) and positions and
comparing each-to-each only for those having these pairs equal. Looking at
the existing large containers with tens thousands of fd-s we have this 
gives us maximum 6 files to compare, and performing 15 syscalls for this suits 
us for now.

Of course, if you manage to persuade Peter that his memory ordering concerns
are not real problems _now_, that would be great, but, yet again -- simple
{eq, ne} suit us for now, providing we can extend this API on {eq, le, gt}
in the future.

> Eric
> .
> 


^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [RFC] syscalls, x86: Add __NR_kcmp syscall
  2012-01-17 21:35       ` Eric W. Biederman
@ 2012-01-18  8:01         ` Cyrill Gorcunov
  2012-01-18  9:12           ` KOSAKI Motohiro
  2012-01-18 22:05         ` david
  1 sibling, 1 reply; 27+ messages in thread
From: Cyrill Gorcunov @ 2012-01-18  8:01 UTC (permalink / raw)
  To: Eric W. Biederman
  Cc: H. Peter Anvin, Alexey Dobriyan, LKML, Pavel Emelyanov,
	Andrey Vagin, Ingo Molnar, Thomas Gleixner, Glauber Costa,
	Andi Kleen, Tejun Heo, Matt Helsley, Pekka Enberg, Eric Dumazet,
	Vasiliy Kulikov, Andrew Morton, Valdis.Kletnieks

On Tue, Jan 17, 2012 at 01:35:00PM -0800, Eric W. Biederman wrote:
> "H. Peter Anvin" <hpa@zytor.com> writes:
> 
> > On 01/17/2012 06:44 AM, Cyrill Gorcunov wrote:
> >> On Tue, Jan 17, 2012 at 04:38:14PM +0200, Alexey Dobriyan wrote:
> >>> On 1/17/12, Cyrill Gorcunov <gorcunov@gmail.com> wrote:
> >>>> +#define KCMP_EQ		0
> >>>> +#define KCMP_LT		1
> >>>> +#define KCMP_GT		2
> >>>
> >>> LT and GT are meaningless.
> >>>
> >> 
> >> I found symbolic names better than open-coded values. But sure,
> >> if this is problem it could be dropped.
> >> 
> >> Or you mean that in general anything but 'equal' is useless?
> >> 
> >
> > Why on Earth would user space need to know which order in memory certain
> > kernel objects are?
> 
> For checkpoint restart and for some other kinds of introspection what is
> needed is a comparison function to see if two processes share the same
> object.  The most interesting of these objects from a checkpoint restart case
> are file descriptors, and there can be a lot of file descriptors.
> 
> The order in memory does not matter.  What does matter is that the
> comparison function return some ordering between objects.  The algorithm
> for figuring out of N items which of them are duplicates is O(N^2) if
> the comparison function can only return equal or not equal.  The
> algorithm for finding duplications is only O(NlogN) if the comparison
> function will return an ordering among the objects.
> 

Yes, thanks Eric, I missed this text in patch description, my bad. And
yes, performance will degrade with plain eq/ne approach. But as Pavel
stated in another email

 | We can compare the e.g. files' target inodes (ino + dev) and positions and
 | comparing each-to-each only for those having these pairs equal. Looking at
 | the existing large containers with tens thousands of fd-s we have this
 | gives us maximum 6 files to compare, and performing 15 syscalls for this suits
 | us for now.

> > Keep in mind that this is *exactly* the kind of information which makes
> > rootkits easier.
> 
> I would be very surprised if basic in memory ordering information was
> not already available from simple creation ordering.
> 

I think Peter means the scenario where we say have some bug in slab/slub
code which happens on say some Nth allocation and attacker somehow reveal
at least one memory address of struct file, then using such syscall an
attacker might inspect a series of fd (and associated struct file) and guess
which addresses the rest of "struct file" are. In most cases this wont help
(if a system is under more/less high load and open/close files fast enough
 'cause "struct file" comes from kmem caches) but on some non-heavy loaded
machine this might do a trick and narrow addresses (if say there only 10
fds which allocated from cache in a row and you somehow know address of
one associated struct file).

In short -- I don't know if it's indeed really serious issue or not
(since from my POV it'll require at least a couple of bugs in a row
 to happen before the attacker might use this information). OTOH, shit
happens exactly in 'impossible' scenarios ;)

> If using the in memory ordering is a problem in practice there are a lot
> of other possible ways to order the kernel objects.  Allocating sequence
> numbers for the kernel objects, passing the pointers through a
> cryptographically secure hash before comparing them, etc.
> 

We've been trying this already ;)

> It does look like Cyrill's patch description lacked the important bit of
> information about the algorithm complexity requiring an ordering among
> kernel objects.  Cyrill you probably want to describe more prominently
> what is happening now and why in your patch description rather than give
> the history of different approaches.
> 

Yeah, i'll write detailed change log, gimme some time. Thanks Eric!

Btw, extending this syscall to lt/ge variant will be easy, so this is
not a problem I think. At moment we guarantee to return 0/1 on succes,
and < 0 on error, so if we start returing 2/3 in a sake of ordering
the applications which were using only 0/1 values wont crash (if they
are not crappy written ones).

	Cyrill

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [RFC] syscalls, x86: Add __NR_kcmp syscall
  2012-01-18  8:01         ` Cyrill Gorcunov
@ 2012-01-18  9:12           ` KOSAKI Motohiro
  2012-01-18  9:19             ` Pavel Emelyanov
  0 siblings, 1 reply; 27+ messages in thread
From: KOSAKI Motohiro @ 2012-01-18  9:12 UTC (permalink / raw)
  To: Cyrill Gorcunov
  Cc: Eric W. Biederman, H. Peter Anvin, Alexey Dobriyan, LKML,
	Pavel Emelyanov, Andrey Vagin, Ingo Molnar, Thomas Gleixner,
	Glauber Costa, Andi Kleen, Tejun Heo, Matt Helsley, Pekka Enberg,
	Eric Dumazet, Vasiliy Kulikov, Andrew Morton, Valdis.Kletnieks

(1/18/12 3:01 AM), Cyrill Gorcunov wrote:
> On Tue, Jan 17, 2012 at 01:35:00PM -0800, Eric W. Biederman wrote:
>> "H. Peter Anvin"<hpa@zytor.com>  writes:
>>
>>> On 01/17/2012 06:44 AM, Cyrill Gorcunov wrote:
>>>> On Tue, Jan 17, 2012 at 04:38:14PM +0200, Alexey Dobriyan wrote:
>>>>> On 1/17/12, Cyrill Gorcunov<gorcunov@gmail.com>  wrote:
>>>>>> +#define KCMP_EQ		0
>>>>>> +#define KCMP_LT		1
>>>>>> +#define KCMP_GT		2
>>>>>
>>>>> LT and GT are meaningless.
>>>>>
>>>>
>>>> I found symbolic names better than open-coded values. But sure,
>>>> if this is problem it could be dropped.
>>>>
>>>> Or you mean that in general anything but 'equal' is useless?
>>>>
>>>
>>> Why on Earth would user space need to know which order in memory certain
>>> kernel objects are?
>>
>> For checkpoint restart and for some other kinds of introspection what is
>> needed is a comparison function to see if two processes share the same
>> object.  The most interesting of these objects from a checkpoint restart case
>> are file descriptors, and there can be a lot of file descriptors.
>>
>> The order in memory does not matter.  What does matter is that the
>> comparison function return some ordering between objects.  The algorithm
>> for figuring out of N items which of them are duplicates is O(N^2) if
>> the comparison function can only return equal or not equal.  The
>> algorithm for finding duplications is only O(NlogN) if the comparison
>> function will return an ordering among the objects.
>
> Yes, thanks Eric, I missed this text in patch description, my bad. And
> yes, performance will degrade with plain eq/ne approach. But as Pavel
> stated in another email

I think Eric only said gt/lt compare is useful. We don't need to expose bare
pointer order. example, kcmp(rotate(ptr, per-task-random-value)) is enough
hide the critical information. I think.


^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [RFC] syscalls, x86: Add __NR_kcmp syscall
  2012-01-18  9:12           ` KOSAKI Motohiro
@ 2012-01-18  9:19             ` Pavel Emelyanov
  2012-01-18  9:23               ` KOSAKI Motohiro
  0 siblings, 1 reply; 27+ messages in thread
From: Pavel Emelyanov @ 2012-01-18  9:19 UTC (permalink / raw)
  To: KOSAKI Motohiro
  Cc: Cyrill Gorcunov, Eric W. Biederman, H. Peter Anvin,
	Alexey Dobriyan, LKML, Andrey Vagin, Ingo Molnar,
	Thomas Gleixner, Glauber Costa, Andi Kleen, Tejun Heo,
	Matt Helsley, Pekka Enberg, Eric Dumazet, Vasiliy Kulikov,
	Andrew Morton, Valdis.Kletnieks

> I think Eric only said gt/lt compare is useful. We don't need to expose bare
> pointer order. example, kcmp(rotate(ptr, per-task-random-value)) is enough
> hide the critical information. I think.

The per-task might break thinks up in case

(tsk1->file != tsk2->file) && (rotate(tsk1->file, tsk1->random) == rotate(tsk2->file, tsk2->rotate))

but I agree, that the overall idea of comparing not bare pointers, but those poisoned with
some global value can address the Peter's concerns about rootkits.

Thanks,
Pavel

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [RFC] syscalls, x86: Add __NR_kcmp syscall
  2012-01-18  9:19             ` Pavel Emelyanov
@ 2012-01-18  9:23               ` KOSAKI Motohiro
  2012-01-18 11:57                 ` Cyrill Gorcunov
  0 siblings, 1 reply; 27+ messages in thread
From: KOSAKI Motohiro @ 2012-01-18  9:23 UTC (permalink / raw)
  To: Pavel Emelyanov
  Cc: Cyrill Gorcunov, Eric W. Biederman, H. Peter Anvin,
	Alexey Dobriyan, LKML, Andrey Vagin, Ingo Molnar,
	Thomas Gleixner, Glauber Costa, Andi Kleen, Tejun Heo,
	Matt Helsley, Pekka Enberg, Eric Dumazet, Vasiliy Kulikov,
	Andrew Morton, Valdis.Kletnieks

(1/18/12 4:19 AM), Pavel Emelyanov wrote:
>> I think Eric only said gt/lt compare is useful. We don't need to expose bare
>> pointer order. example, kcmp(rotate(ptr, per-task-random-value)) is enough
>> hide the critical information. I think.
>
> The per-task might break thinks up in case
>
> (tsk1->file != tsk2->file)&&  (rotate(tsk1->file, tsk1->random) == rotate(tsk2->file, tsk2->rotate))

I meant,

(tsk1->file != tsk2->file) && (rotate(tsk1->file, caller_task->random) == rotate(tsk2->file, caller_task->random))



>
> but I agree, that the overall idea of comparing not bare pointers, but those poisoned with
> some global value can address the Peter's concerns about rootkits.



^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [RFC] syscalls, x86: Add __NR_kcmp syscall
  2012-01-18  9:23               ` KOSAKI Motohiro
@ 2012-01-18 11:57                 ` Cyrill Gorcunov
  2012-01-18 16:46                   ` KOSAKI Motohiro
  0 siblings, 1 reply; 27+ messages in thread
From: Cyrill Gorcunov @ 2012-01-18 11:57 UTC (permalink / raw)
  To: KOSAKI Motohiro, Pavel Emelyanov, Eric W. Biederman, H. Peter Anvin
  Cc: Alexey Dobriyan, LKML, Andrey Vagin, Ingo Molnar,
	Thomas Gleixner, Glauber Costa, Andi Kleen, Tejun Heo,
	Matt Helsley, Pekka Enberg, Eric Dumazet, Vasiliy Kulikov,
	Andrew Morton, Valdis.Kletnieks

On Wed, Jan 18, 2012 at 04:23:24AM -0500, KOSAKI Motohiro wrote:
> (1/18/12 4:19 AM), Pavel Emelyanov wrote:
> >>I think Eric only said gt/lt compare is useful. We don't need to expose bare
> >>pointer order. example, kcmp(rotate(ptr, per-task-random-value)) is enough
> >>hide the critical information. I think.
> >
> >The per-task might break thinks up in case
> >
> >(tsk1->file != tsk2->file)&&  (rotate(tsk1->file, tsk1->random) == rotate(tsk2->file, tsk2->rotate))
> 
> I meant,
> 
> (tsk1->file != tsk2->file) && (rotate(tsk1->file, caller_task->random) == rotate(tsk2->file, caller_task->random))
> 
> 
> >
> >but I agree, that the overall idea of comparing not bare pointers, but those poisoned with
> >some global value can address the Peter's concerns about rootkits.
> 
> 

Guys, can we stick with something simplier? I could use hashes here (again?!) or
even aes encoded pointers extended to 128 bits as it was proposed too. But
maybe we can live with something more simplier?

We could export EQ/NE for regular users (which might be usefull for less
frequently used objects such as namespaces I guess). And GT/LT for root
only.

Does it look better? Does the change log tells enough?

	Cyrill
---
From: Cyrill Gorcunov <gorcunov@openvz.org>
Subject: [RFC] syscalls, x86: Add __NR_kcmp syscall v3

While doing the checkpoint-restore in the userspace one need to determine
whether various kernel objects (like mm_struct-s of file_struct-s) are shared
between tasks and restore this state.

The 2nd step can be solved by using appropriate CLONE_ flags and the unshare
syscall, while there's currently no ways for solving the 1st one.

One of the ways for checking whether two tasks share e.g. mm_struct is to
provide some mm_struct ID of a task to its proc file, but showing such
info considered to be not that good for security reasons.

Thus after some debates we end up in conclusion that using that named
'comparision' syscall might be the best candidate. So here is it --
__NR_kcmp.

It takes up to 5 agruments - the pids of the two tasks (which
characteristics should be compared), the comparision type and
(in case of comparision of files) two file descriptors.

Only two results are supported at moment -- if the objects
are the same or not. So there is no way to restore in-memory
order of objects.

At moment only x86 is supported.

v2: Drop ordered results.

Signed-off-by: Cyrill Gorcunov <gorcunov@openvz.org>
CC: "Eric W. Biederman" <ebiederm@xmission.com>
CC: Pavel Emelyanov <xemul@parallels.com>
CC: Andrey Vagin <avagin@openvz.org>
CC: Ingo Molnar <mingo@elte.hu>
CC: H. Peter Anvin <hpa@zytor.com>
CC: Thomas Gleixner <tglx@linutronix.de>
CC: Glauber Costa <glommer@parallels.com>
CC: Andi Kleen <andi@firstfloor.org>
CC: Tejun Heo <tj@kernel.org>
CC: Matt Helsley <matthltc@us.ibm.com>
CC: Pekka Enberg <penberg@kernel.org>
CC: Eric Dumazet <eric.dumazet@gmail.com>
CC: Vasiliy Kulikov <segoon@openwall.com>
CC: Andrew Morton <akpm@linux-foundation.org>
CC: Alexey Dobriyan <adobriyan@gmail.com>
CC: Valdis.Kletnieks@vt.edu
---
 arch/x86/include/asm/kcmp.h        |   20 +++++
 arch/x86/include/asm/syscalls.h    |    4 +
 arch/x86/include/asm/unistd_32.h   |    1 
 arch/x86/include/asm/unistd_64.h   |    2 
 arch/x86/kernel/Makefile           |    1 
 arch/x86/kernel/kcmp.c             |  124 +++++++++++++++++++++++++++++++++++++
 arch/x86/kernel/syscall_table_32.S |    1 
 7 files changed, 153 insertions(+)

Index: linux-2.6.git/arch/x86/include/asm/kcmp.h
===================================================================
--- /dev/null
+++ linux-2.6.git/arch/x86/include/asm/kcmp.h
@@ -0,0 +1,20 @@
+#ifndef _LINUX_KCMP_H
+#define _LINUX_KCMP_H
+
+/* Comparision type */
+enum {
+	KCMP_FILE,
+	KCMP_VM,
+	KCMP_FILES,
+	KCMP_FS,
+	KCMP_SIGHAND,
+	KCMP_IO,
+	KCMP_SYSVSEM,
+
+	KCMP_TYPES,
+};
+
+#define KCMP_EQ		0
+#define KCMP_NE		1
+
+#endif /* _LINUX_KCMP_H */
Index: linux-2.6.git/arch/x86/include/asm/syscalls.h
===================================================================
--- linux-2.6.git.orig/arch/x86/include/asm/syscalls.h
+++ linux-2.6.git/arch/x86/include/asm/syscalls.h
@@ -42,6 +42,10 @@ long sys_sigaltstack(const stack_t __use
 asmlinkage int sys_set_thread_area(struct user_desc __user *);
 asmlinkage int sys_get_thread_area(struct user_desc __user *);
 
+/* kenrel/kcmp.c */
+asmlinkage long sys_kcmp(pid_t pid1, pid_t pid2, int type,
+			 unsigned long idx1, unsigned long idx2);
+
 /* X86_32 only */
 #ifdef CONFIG_X86_32
 
Index: linux-2.6.git/arch/x86/include/asm/unistd_32.h
===================================================================
--- linux-2.6.git.orig/arch/x86/include/asm/unistd_32.h
+++ linux-2.6.git/arch/x86/include/asm/unistd_32.h
@@ -354,6 +354,7 @@
 #define __NR_setns		346
 #define __NR_process_vm_readv	347
 #define __NR_process_vm_writev	348
+#define __NR_kcmp		349
 
 #ifdef __KERNEL__
 
Index: linux-2.6.git/arch/x86/include/asm/unistd_64.h
===================================================================
--- linux-2.6.git.orig/arch/x86/include/asm/unistd_64.h
+++ linux-2.6.git/arch/x86/include/asm/unistd_64.h
@@ -686,6 +686,8 @@ __SYSCALL(__NR_getcpu, sys_getcpu)
 __SYSCALL(__NR_process_vm_readv, sys_process_vm_readv)
 #define __NR_process_vm_writev			311
 __SYSCALL(__NR_process_vm_writev, sys_process_vm_writev)
+#define __NR_kcmp				312
+__SYSCALL(__NR_kcmp, sys_kcmp)
 
 #ifndef __NO_STUBS
 #define __ARCH_WANT_OLD_READDIR
Index: linux-2.6.git/arch/x86/kernel/Makefile
===================================================================
--- linux-2.6.git.orig/arch/x86/kernel/Makefile
+++ linux-2.6.git/arch/x86/kernel/Makefile
@@ -33,6 +33,7 @@ obj-y			+= alternative.o i8253.o pci-nom
 obj-y			+= tsc.o io_delay.o rtc.o
 obj-y			+= pci-iommu_table.o
 obj-y			+= resource.o
+obj-y			+= kcmp.o
 
 obj-y				+= trampoline.o trampoline_$(BITS).o
 obj-y				+= process.o
Index: linux-2.6.git/arch/x86/kernel/kcmp.c
===================================================================
--- /dev/null
+++ linux-2.6.git/arch/x86/kernel/kcmp.c
@@ -0,0 +1,124 @@
+#include <linux/kernel.h>
+#include <linux/syscalls.h>
+#include <linux/fdtable.h>
+#include <linux/err.h>
+
+#include <asm/unistd.h>
+#include <asm/kcmp.h>
+
+static int kcmp_ptr(long v1, long v2)
+{
+	long ret = v1 - v2;
+	return ret == 0 ? KCMP_EQ : KCMP_NE;
+}
+
+#define KCMP_TASK_PTR(task1, task2, member)	\
+	kcmp_ptr((long)(task1)->member, (long)(task2)->member)
+
+#define KCMP_PTR(ptr1, ptr2)			\
+	kcmp_ptr((long)ptr1, (long)ptr2)
+
+/* A caller must be sure the task is presented in memory */
+static struct file *
+get_file_raw_ptr(struct task_struct *task, unsigned int idx)
+{
+	struct fdtable *fdt;
+	struct file *file;
+
+	spin_lock(&task->files->file_lock);
+	fdt = files_fdtable(task->files);
+	if (idx < fdt->max_fds)
+		file = fdt->fd[idx];
+	else
+		file = NULL;
+	spin_unlock(&task->files->file_lock);
+
+	return file;
+}
+
+SYSCALL_DEFINE5(kcmp, pid_t, pid1, pid_t, pid2, int, type,
+		unsigned long, idx1, unsigned long, idx2)
+{
+	struct task_struct *task1;
+	struct task_struct *task2;
+	int ret = 0;
+
+	rcu_read_lock();
+
+	task1 = find_task_by_vpid(pid1);
+	if (!task1) {
+		rcu_read_unlock();
+		return -ESRCH;
+	}
+
+	task2 = find_task_by_vpid(pid2);
+	if (!task2) {
+		put_task_struct(task1);
+		rcu_read_unlock();
+		return -ESRCH;
+	}
+
+	get_task_struct(task1);
+	get_task_struct(task2);
+
+	rcu_read_unlock();
+
+	if (!ptrace_may_access(task1, PTRACE_MODE_READ) ||
+	    !ptrace_may_access(task2, PTRACE_MODE_READ)) {
+		ret = -EACCES;
+		goto err;
+	}
+
+	/*
+	 * Note for all cases but the KCMP_FILE we
+	 * don't take any locks and do a plain pointer
+	 * comparision in a sake of speed.
+	 */
+
+	switch (type) {
+	case KCMP_FILE: {
+		struct file *filp1, *filp2;
+
+		filp1 = get_file_raw_ptr(task1, idx1);
+		filp2 = get_file_raw_ptr(task2, idx2);
+
+		if (filp1 && filp2)
+			ret = KCMP_PTR(filp1, filp2);
+		else
+			ret = -ENOENT;
+		break;
+	}
+	case KCMP_VM:
+		ret = KCMP_TASK_PTR(task1, task2, mm);
+		break;
+	case KCMP_FILES:
+		ret = KCMP_TASK_PTR(task1, task2, files);
+		break;
+	case KCMP_FS:
+		ret = KCMP_TASK_PTR(task1, task2, fs);
+		break;
+	case KCMP_SIGHAND:
+		ret = KCMP_TASK_PTR(task1, task2, sighand);
+		break;
+	case KCMP_IO:
+		ret = KCMP_TASK_PTR(task1, task2, io_context);
+		break;
+	case KCMP_SYSVSEM:
+#ifdef CONFIG_SYSVIPC
+		ret = KCMP_TASK_PTR(task1, task2, sysvsem.undo_list);
+#else
+		ret = -ENOENT;
+		goto err;
+#endif
+		break;
+	default:
+		ret = -EINVAL;
+		goto err;
+	}
+
+err:
+	put_task_struct(task1);
+	put_task_struct(task2);
+
+	return ret;
+}
Index: linux-2.6.git/arch/x86/kernel/syscall_table_32.S
===================================================================
--- linux-2.6.git.orig/arch/x86/kernel/syscall_table_32.S
+++ linux-2.6.git/arch/x86/kernel/syscall_table_32.S
@@ -348,3 +348,4 @@ ENTRY(sys_call_table)
 	.long sys_setns
 	.long sys_process_vm_readv
 	.long sys_process_vm_writev
+	.long sys_kcmp
---
From: Cyrill Gorcunov <gorcunov@openvz.org>
Subject: [RFC] syscalls, x86: Report objects ordering for CAP_SYS_ADMIN in __NR_kcmp syscall

For checkpoint-restore procedure we would like to increase performance
over comparision test if huge number of file descriptros is involved,
i.e. being able to re-create at least parial order of kernel objects
compared.

Thus __NR_kcmp syscall is extended to return values representing
the objects order ("greater" and "less").

Such approach allows us to sort file descriptors and don't compare
every file with every other, increasing performance from O(n^2)
to about O(NlogN).

In a sake of safety this interface is allowed for CAP_SYS_ADMIN
only. A regular user still get EQ/NE results only.

Signed-off-by: Cyrill Gorcunov <gorcunov@openvz.org>
CC: "Eric W. Biederman" <ebiederm@xmission.com>
CC: Pavel Emelyanov <xemul@parallels.com>
CC: Andrey Vagin <avagin@openvz.org>
CC: Ingo Molnar <mingo@elte.hu>
CC: H. Peter Anvin <hpa@zytor.com>
CC: Thomas Gleixner <tglx@linutronix.de>
CC: Glauber Costa <glommer@parallels.com>
CC: Andi Kleen <andi@firstfloor.org>
CC: Tejun Heo <tj@kernel.org>
CC: Matt Helsley <matthltc@us.ibm.com>
CC: Pekka Enberg <penberg@kernel.org>
CC: Eric Dumazet <eric.dumazet@gmail.com>
CC: Vasiliy Kulikov <segoon@openwall.com>
CC: Andrew Morton <akpm@linux-foundation.org>
CC: Alexey Dobriyan <adobriyan@gmail.com>
CC: Valdis.Kletnieks@vt.edu
---
 arch/x86/include/asm/kcmp.h |    6 ++++--
 arch/x86/kernel/kcmp.c      |   13 ++++++++++++-
 2 files changed, 16 insertions(+), 3 deletions(-)

Index: linux-2.6.git/arch/x86/include/asm/kcmp.h
===================================================================
--- linux-2.6.git.orig/arch/x86/include/asm/kcmp.h
+++ linux-2.6.git/arch/x86/include/asm/kcmp.h
@@ -14,7 +14,9 @@ enum {
 	KCMP_TYPES,
 };
 
-#define KCMP_EQ		0
-#define KCMP_NE		1
+#define KCMP_EQ		0	/* objects are equal */
+#define KCMP_NE		1	/* objects are not equal */
+#define KCMP_GT		2	/* 1st is greater than 2nd */
+#define KCMP_LT		3	/* 1st is less than 2nd */
 
 #endif /* _LINUX_KCMP_H */
Index: linux-2.6.git/arch/x86/kernel/kcmp.c
===================================================================
--- linux-2.6.git.orig/arch/x86/kernel/kcmp.c
+++ linux-2.6.git/arch/x86/kernel/kcmp.c
@@ -9,7 +9,18 @@
 static int kcmp_ptr(long v1, long v2)
 {
 	long ret = v1 - v2;
-	return ret == 0 ? KCMP_EQ : KCMP_NE;
+
+	if (ret == 0) {
+		return KCMP_EQ;
+	} else {
+		/* More detailed result for root only */
+		if (capable(CAP_SYS_ADMIN))
+			ret = ret < 0 ? KCMP_LT : KCMP_GT;
+		else
+			ret = KCMP_NE;
+	}
+
+	return ret;
 }
 
 #define KCMP_TASK_PTR(task1, task2, member)	\

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [RFC] syscalls, x86: Add __NR_kcmp syscall
  2012-01-18 11:57                 ` Cyrill Gorcunov
@ 2012-01-18 16:46                   ` KOSAKI Motohiro
  2012-01-18 17:20                     ` Cyrill Gorcunov
  0 siblings, 1 reply; 27+ messages in thread
From: KOSAKI Motohiro @ 2012-01-18 16:46 UTC (permalink / raw)
  To: Cyrill Gorcunov
  Cc: Pavel Emelyanov, Eric W. Biederman, H. Peter Anvin,
	Alexey Dobriyan, LKML, Andrey Vagin, Ingo Molnar,
	Thomas Gleixner, Glauber Costa, Andi Kleen, Tejun Heo,
	Matt Helsley, Pekka Enberg, Eric Dumazet, Vasiliy Kulikov,
	Andrew Morton, Valdis.Kletnieks

(1/18/12 6:57 AM), Cyrill Gorcunov wrote:
> On Wed, Jan 18, 2012 at 04:23:24AM -0500, KOSAKI Motohiro wrote:
>> (1/18/12 4:19 AM), Pavel Emelyanov wrote:
>>>> I think Eric only said gt/lt compare is useful. We don't need to expose bare
>>>> pointer order. example, kcmp(rotate(ptr, per-task-random-value)) is enough
>>>> hide the critical information. I think.
>>>
>>> The per-task might break thinks up in case
>>>
>>> (tsk1->file != tsk2->file)&&   (rotate(tsk1->file, tsk1->random) == rotate(tsk2->file, tsk2->rotate))
>>
>> I meant,
>>
>> (tsk1->file != tsk2->file)&&  (rotate(tsk1->file, caller_task->random) == rotate(tsk2->file, caller_task->random))
>>
>>
>>>
>>> but I agree, that the overall idea of comparing not bare pointers, but those poisoned with
>>> some global value can address the Peter's concerns about rootkits.
>
> Guys, can we stick with something simplier? I could use hashes here (again?!) or
> even aes encoded pointers extended to 128 bits as it was proposed too. But
> maybe we can live with something more simplier?

The problem of hashes is,

  - SHA1 didn't provide correct "equal or not" policy. (and I don't think sha1 is faster than kcmp)
  - Poisoned pointer can be used to restore original bare pointer.

Do this have the same issue?


> We could export EQ/NE for regular users (which might be usefull for less
> frequently used objects such as namespaces I guess). And GT/LT for root
> only.
>
> Does it look better? Does the change log tells enough?

I dislike. Just EQ/NE is better than "root only" behavior change. it's misleading.
If you dislike GT/LT, please just delete it.


^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [RFC] syscalls, x86: Add __NR_kcmp syscall
  2012-01-18 16:46                   ` KOSAKI Motohiro
@ 2012-01-18 17:20                     ` Cyrill Gorcunov
  0 siblings, 0 replies; 27+ messages in thread
From: Cyrill Gorcunov @ 2012-01-18 17:20 UTC (permalink / raw)
  To: KOSAKI Motohiro
  Cc: Pavel Emelyanov, Eric W. Biederman, H. Peter Anvin,
	Alexey Dobriyan, LKML, Andrey Vagin, Ingo Molnar,
	Thomas Gleixner, Glauber Costa, Andi Kleen, Tejun Heo,
	Matt Helsley, Pekka Enberg, Eric Dumazet, Vasiliy Kulikov,
	Andrew Morton, Valdis.Kletnieks

On Wed, Jan 18, 2012 at 11:46:57AM -0500, KOSAKI Motohiro wrote:
> (1/18/12 6:57 AM), Cyrill Gorcunov wrote:
> >On Wed, Jan 18, 2012 at 04:23:24AM -0500, KOSAKI Motohiro wrote:
> >>(1/18/12 4:19 AM), Pavel Emelyanov wrote:
> >>>>I think Eric only said gt/lt compare is useful. We don't need to expose bare
> >>>>pointer order. example, kcmp(rotate(ptr, per-task-random-value)) is enough
> >>>>hide the critical information. I think.
> >>>
> >>>The per-task might break thinks up in case
> >>>
> >>>(tsk1->file != tsk2->file) && (rotate(tsk1->file, tsk1->random) == rotate(tsk2->file, tsk2->rotate))
> >>
> >>I meant,
> >>
> >>(tsk1->file != tsk2->file) && (rotate(tsk1->file, caller_task->random) == rotate(tsk2->file, caller_task->random))
> >>
> >>>
> >>>but I agree, that the overall idea of comparing not bare pointers, but those poisoned with
> >>>some global value can address the Peter's concerns about rootkits.
> >
> >Guys, can we stick with something simplier? I could use hashes here (again?!) or
> >even aes encoded pointers extended to 128 bits as it was proposed too. But
> >maybe we can live with something more simplier?
> 
> The problem of hashes is,
> 
>  - SHA1 didn't provide correct "equal or not" policy. (and I don't think sha1 is faster than kcmp)
>  - Poisoned pointer can be used to restore original bare pointer.
> 
> Do this have the same issue?

No, this rorate() helper seems to not have such problems (still sha1 provided
pretty well equal or not policy, aes with internals random too). The thing is
the ->random you choose here (which I suppose will be the number of bits to
rotate in former pointer and this way break order -- weak option too, you
will be rotating in modulo field).

> 
> >We could export EQ/NE for regular users (which might be usefull for less
> >frequently used objects such as namespaces I guess). And GT/LT for root
> >only.
> >
> >Does it look better? Does the change log tells enough?
> 
> I dislike. Just EQ/NE is better than "root only" behavior change. it's misleading.
> If you dislike GT/LT, please just delete it.
> 

EQ/NE remains here for everyone and behaves constantly for all users. For safety reason
only root can restore in-memory order, so I must admit I don't understand the problem.

If I'm root on a machine already, the memory order is least interesting thing for me,
really, but getting the root rights is really a problem for most cases in turn.

So we would preferred to have gt/lt ability at least for root. If there
absolutely no way to do so -- eq/ne is admisable and we can try to optimise
sorting somehow (still not sure if we will success) but it's not desirable.

	Cyrill

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [RFC] syscalls, x86: Add __NR_kcmp syscall
  2012-01-17 21:35       ` Eric W. Biederman
  2012-01-18  8:01         ` Cyrill Gorcunov
@ 2012-01-18 22:05         ` david
  2012-01-18 22:49           ` Cyrill Gorcunov
  1 sibling, 1 reply; 27+ messages in thread
From: david @ 2012-01-18 22:05 UTC (permalink / raw)
  To: Eric W. Biederman
  Cc: H. Peter Anvin, Cyrill Gorcunov, Alexey Dobriyan, LKML,
	Pavel Emelyanov, Andrey Vagin, Ingo Molnar, Thomas Gleixner,
	Glauber Costa, Andi Kleen, Tejun Heo, Matt Helsley, Pekka Enberg,
	Eric Dumazet, Vasiliy Kulikov, Andrew Morton, Valdis.Kletnieks

On Tue, 17 Jan 2012, Eric W. Biederman wrote:

> "H. Peter Anvin" <hpa@zytor.com> writes:
>
>> On 01/17/2012 06:44 AM, Cyrill Gorcunov wrote:
>>> On Tue, Jan 17, 2012 at 04:38:14PM +0200, Alexey Dobriyan wrote:
>>>> On 1/17/12, Cyrill Gorcunov <gorcunov@gmail.com> wrote:
>>>>> +#define KCMP_EQ		0
>>>>> +#define KCMP_LT		1
>>>>> +#define KCMP_GT		2
>>>>
>>>> LT and GT are meaningless.
>>>>
>>>
>>> I found symbolic names better than open-coded values. But sure,
>>> if this is problem it could be dropped.
>>>
>>> Or you mean that in general anything but 'equal' is useless?
>>>
>>
>> Why on Earth would user space need to know which order in memory certain
>> kernel objects are?
>
> For checkpoint restart and for some other kinds of introspection what is
> needed is a comparison function to see if two processes share the same
> object.  The most interesting of these objects from a checkpoint restart case
> are file descriptors, and there can be a lot of file descriptors.
>
> The order in memory does not matter.  What does matter is that the
> comparison function return some ordering between objects.  The algorithm
> for figuring out of N items which of them are duplicates is O(N^2) if
> the comparison function can only return equal or not equal.  The
> algorithm for finding duplications is only O(NlogN) if the comparison
> function will return an ordering among the objects.

so what you really want is a syscall that can take a list of objects 
instead of having to do a syscall per object. right?

David Lang

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [RFC] syscalls, x86: Add __NR_kcmp syscall
  2012-01-18 22:05         ` david
@ 2012-01-18 22:49           ` Cyrill Gorcunov
  2012-01-18 23:29             ` Eric W. Biederman
  0 siblings, 1 reply; 27+ messages in thread
From: Cyrill Gorcunov @ 2012-01-18 22:49 UTC (permalink / raw)
  To: david
  Cc: Eric W. Biederman, H. Peter Anvin, Alexey Dobriyan, LKML,
	Pavel Emelyanov, Andrey Vagin, Ingo Molnar, Thomas Gleixner,
	Glauber Costa, Andi Kleen, Tejun Heo, Matt Helsley, Pekka Enberg,
	Eric Dumazet, Vasiliy Kulikov, Andrew Morton, Valdis.Kletnieks

On Wed, Jan 18, 2012 at 02:05:31PM -0800, david@lang.hm wrote:
...
> >>
> >>Why on Earth would user space need to know which order in memory certain
> >>kernel objects are?
> >
> >For checkpoint restart and for some other kinds of introspection what is
> >needed is a comparison function to see if two processes share the same
> >object.  The most interesting of these objects from a checkpoint restart case
> >are file descriptors, and there can be a lot of file descriptors.
> >
> >The order in memory does not matter.  What does matter is that the
> >comparison function return some ordering between objects.  The algorithm
> >for figuring out of N items which of them are duplicates is O(N^2) if
> >the comparison function can only return equal or not equal.  The
> >algorithm for finding duplications is only O(NlogN) if the comparison
> >function will return an ordering among the objects.
> 
> so what you really want is a syscall that can take a list of objects
> instead of having to do a syscall per object. right?
> 

It doesn't matter. Even if we take a list of objects the kernel either
should return us some ordering info or find duplicates, in any case it
makes things more complex i think. So we wanted to bring some minimum
into kernel leaving the rest of work to user-space.

(and, btw, Eric, I'm really sorry that I didn't mentioned this sorting
 problem at the very first email).

	Cyrill

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [RFC] syscalls, x86: Add __NR_kcmp syscall
  2012-01-18 22:49           ` Cyrill Gorcunov
@ 2012-01-18 23:29             ` Eric W. Biederman
  2012-01-19  6:55               ` Cyrill Gorcunov
  0 siblings, 1 reply; 27+ messages in thread
From: Eric W. Biederman @ 2012-01-18 23:29 UTC (permalink / raw)
  To: Cyrill Gorcunov
  Cc: david, H. Peter Anvin, Alexey Dobriyan, LKML, Pavel Emelyanov,
	Andrey Vagin, Ingo Molnar, Thomas Gleixner, Glauber Costa,
	Andi Kleen, Tejun Heo, Matt Helsley, Pekka Enberg, Eric Dumazet,
	Vasiliy Kulikov, Andrew Morton, Valdis.Kletnieks

Cyrill Gorcunov <gorcunov@gmail.com> writes:

> On Wed, Jan 18, 2012 at 02:05:31PM -0800, david@lang.hm wrote:
> ...
>> >>
>> >>Why on Earth would user space need to know which order in memory certain
>> >>kernel objects are?
>> >
>> >For checkpoint restart and for some other kinds of introspection what is
>> >needed is a comparison function to see if two processes share the same
>> >object.  The most interesting of these objects from a checkpoint restart case
>> >are file descriptors, and there can be a lot of file descriptors.
>> >
>> >The order in memory does not matter.  What does matter is that the
>> >comparison function return some ordering between objects.  The algorithm
>> >for figuring out of N items which of them are duplicates is O(N^2) if
>> >the comparison function can only return equal or not equal.  The
>> >algorithm for finding duplications is only O(NlogN) if the comparison
>> >function will return an ordering among the objects.
>> 
>> so what you really want is a syscall that can take a list of objects
>> instead of having to do a syscall per object. right?
>> 
>
> It doesn't matter. Even if we take a list of objects the kernel either
> should return us some ordering info or find duplicates, in any case it
> makes things more complex i think. So we wanted to bring some minimum
> into kernel leaving the rest of work to user-space.

Agreed a syscall does the duplication is probably not the way to go.

 A syscall that takes a huge list of objects would solve any security
concerns that we have with returning the object order to user space if
done carefully, but it would require a bunch of additional user space
and kernel memory.

Sometimes taking a data structure transforming it into a weird form for
a specific task and then transforming the data structure back to it's
original form is a useful way to go.  So I think a general kernel object
deduplicating system call is an interesting plan B, but a straight
comparison function if we can make it work is a lot more flexible and
useful.

Eric

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [RFC] syscalls, x86: Add __NR_kcmp syscall
  2012-01-18 23:29             ` Eric W. Biederman
@ 2012-01-19  6:55               ` Cyrill Gorcunov
  2012-01-20  3:16                 ` Eric W. Biederman
  0 siblings, 1 reply; 27+ messages in thread
From: Cyrill Gorcunov @ 2012-01-19  6:55 UTC (permalink / raw)
  To: Eric W. Biederman
  Cc: david, H. Peter Anvin, Alexey Dobriyan, LKML, Pavel Emelyanov,
	Andrey Vagin, Ingo Molnar, Thomas Gleixner, Glauber Costa,
	Andi Kleen, Tejun Heo, Matt Helsley, Pekka Enberg, Eric Dumazet,
	Vasiliy Kulikov, Andrew Morton, Valdis.Kletnieks

On Wed, Jan 18, 2012 at 03:29:50PM -0800, Eric W. Biederman wrote:
> >
> > It doesn't matter. Even if we take a list of objects the kernel either
> > should return us some ordering info or find duplicates, in any case it
> > makes things more complex i think. So we wanted to bring some minimum
> > into kernel leaving the rest of work to user-space.
> 
> Agreed a syscall does the duplication is probably not the way to go.
> 
>  A syscall that takes a huge list of objects would solve any security
> concerns that we have with returning the object order to user space if
> done carefully, but it would require a bunch of additional user space
> and kernel memory.
> 

yes, an it increase syscall time itself since we will have to provide
this memory dynamically

> Sometimes taking a data structure transforming it into a weird form for
> a specific task and then transforming the data structure back to it's
> original form is a useful way to go.  So I think a general kernel object
> deduplicating system call is an interesting plan B, but a straight
> comparison function if we can make it work is a lot more flexible and
> useful.
> 

I hope the root-only restriction would resolve the potential security
problem, since as I mentioned if I've hijacked the machine and already
goot root -- mem order is not that interesting info I could obtain from
such computer :)

	Cyrill

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [RFC] syscalls, x86: Add __NR_kcmp syscall
  2012-01-19  6:55               ` Cyrill Gorcunov
@ 2012-01-20  3:16                 ` Eric W. Biederman
  2012-01-20  8:40                   ` Cyrill Gorcunov
  0 siblings, 1 reply; 27+ messages in thread
From: Eric W. Biederman @ 2012-01-20  3:16 UTC (permalink / raw)
  To: Cyrill Gorcunov
  Cc: david, H. Peter Anvin, Alexey Dobriyan, LKML, Pavel Emelyanov,
	Andrey Vagin, Ingo Molnar, Thomas Gleixner, Glauber Costa,
	Andi Kleen, Tejun Heo, Matt Helsley, Pekka Enberg, Eric Dumazet,
	Vasiliy Kulikov, Andrew Morton, Valdis.Kletnieks

Cyrill Gorcunov <gorcunov@gmail.com> writes:

> On Wed, Jan 18, 2012 at 03:29:50PM -0800, Eric W. Biederman wrote:
>> >
>> > It doesn't matter. Even if we take a list of objects the kernel either
>> > should return us some ordering info or find duplicates, in any case it
>> > makes things more complex i think. So we wanted to bring some minimum
>> > into kernel leaving the rest of work to user-space.
>> 
>> Agreed a syscall does the duplication is probably not the way to go.
>> 
>>  A syscall that takes a huge list of objects would solve any security
>> concerns that we have with returning the object order to user space if
>> done carefully, but it would require a bunch of additional user space
>> and kernel memory.
>> 
>
> yes, an it increase syscall time itself since we will have to provide
> this memory dynamically

I just did a back of the napkin calculation.
struct kobj_id {
	pid_t pid;
        size_t descriptor;
	size_t first_idx;
        void *kernel_ignore_this_pointer;
};

int find_kobject_dups(int type, struct kobj_id __user *ids, size_t count);

Looks pretty reasonable on a 64bit machine for 100,000 file
descriptors.

3 Meg of input data.
4 Meg of an internal rbtree that remembers the first entry where
we saw an item.
struct {
       struct rb_node node;
       void *key;
       size_t idx;
};

And the code is very straight forward.  Insert each pointer to a kernel
object we find into an rbtree, and return the index we find.  Then
finally tear down the rbtree.

8Meg worst case does not seem like a lot of memory to me.  Especially
since half of it is userspace memory.

A simple implementation plus a guarantee that we will never ever
leak information that we don't intend to seem very attractive to me.

>> Sometimes taking a data structure transforming it into a weird form for
>> a specific task and then transforming the data structure back to it's
>> original form is a useful way to go.  So I think a general kernel object
>> deduplicating system call is an interesting plan B, but a straight
>> comparison function if we can make it work is a lot more flexible and
>> useful.
>> 
>
> I hope the root-only restriction would resolve the potential security
> problem, since as I mentioned if I've hijacked the machine and already
> goot root -- mem order is not that interesting info I could obtain from
> such computer :)

We either need a full comparison operator or we don't.  Root-only is a
solution just looking to get abused.

Eric

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [RFC] syscalls, x86: Add __NR_kcmp syscall
  2012-01-20  3:16                 ` Eric W. Biederman
@ 2012-01-20  8:40                   ` Cyrill Gorcunov
  2012-01-20  9:02                     ` Cyrill Gorcunov
  0 siblings, 1 reply; 27+ messages in thread
From: Cyrill Gorcunov @ 2012-01-20  8:40 UTC (permalink / raw)
  To: Eric W. Biederman, H. Peter Anvin, Pavel Emelyanov, KOSAKI Motohiro
  Cc: david, Alexey Dobriyan, LKML, Andrey Vagin, Ingo Molnar,
	Thomas Gleixner, Glauber Costa, Andi Kleen, Tejun Heo,
	Matt Helsley, Pekka Enberg, Eric Dumazet, Vasiliy Kulikov,
	Andrew Morton, Valdis.Kletnieks

On Thu, Jan 19, 2012 at 07:16:02PM -0800, Eric W. Biederman wrote:
> Cyrill Gorcunov <gorcunov@gmail.com> writes:
> 
> > On Wed, Jan 18, 2012 at 03:29:50PM -0800, Eric W. Biederman wrote:
> >> >
> >> > It doesn't matter. Even if we take a list of objects the kernel either
> >> > should return us some ordering info or find duplicates, in any case it
> >> > makes things more complex i think. So we wanted to bring some minimum
> >> > into kernel leaving the rest of work to user-space.
> >> 
> >> Agreed a syscall does the duplication is probably not the way to go.
> >> 
> >>  A syscall that takes a huge list of objects would solve any security
> >> concerns that we have with returning the object order to user space if
> >> done carefully, but it would require a bunch of additional user space
> >> and kernel memory.
> >> 
> >
> > yes, an it increase syscall time itself since we will have to provide
> > this memory dynamically
> 
> I just did a back of the napkin calculation.
> struct kobj_id {
> 	pid_t pid;
>         size_t descriptor;
> 	size_t first_idx;
>         void *kernel_ignore_this_pointer;
> };
> 
> int find_kobject_dups(int type, struct kobj_id __user *ids, size_t count);
> 
> Looks pretty reasonable on a 64bit machine for 100,000 file
> descriptors.
> 
> 3 Meg of input data.
> 4 Meg of an internal rbtree that remembers the first entry where
> we saw an item.
> struct {
>        struct rb_node node;
>        void *key;
>        size_t idx;
> };
> 
> And the code is very straight forward.  Insert each pointer to a kernel
> object we find into an rbtree, and return the index we find.  Then
> finally tear down the rbtree.
> 
> 8Meg worst case does not seem like a lot of memory to me.  Especially
> since half of it is userspace memory.
> 
> A simple implementation plus a guarantee that we will never ever
> leak information that we don't intend to seem very attractive to me.
> 
> >> Sometimes taking a data structure transforming it into a weird form for
> >> a specific task and then transforming the data structure back to it's
> >> original form is a useful way to go.  So I think a general kernel object
> >> deduplicating system call is an interesting plan B, but a straight
> >> comparison function if we can make it work is a lot more flexible and
> >> useful.
> >> 
> >
> > I hope the root-only restriction would resolve the potential security
> > problem, since as I mentioned if I've hijacked the machine and already
> > goot root -- mem order is not that interesting info I could obtain from
> > such computer :)
> 
> We either need a full comparison operator or we don't.  Root-only is a
> solution just looking to get abused.
> 
> Eric
> 

Hi Eric, this approach sounds like a plan for me too. But what if
we make some cummulative approach and shuffle pointers randomly
(ie I think this is based on what Kosaki proposed in slightly
 different manner).

The patch is below. Eric, Peter, Kosaki, Pavel, could you please
check if this finally suits us from any point of view -- and
security and usability. Did I miss something?

Thanks!

	Cyrill
---
From: Cyrill Gorcunov <gorcunov@openvz.org>
Subject: [RFC] syscalls, x86: Add __NR_kcmp syscall v3


While doing the checkpoint-restore in the userspace one need to determine
whether various kernel objects (like mm_struct-s of file_struct-s) are shared
between tasks and restore this state.

The 2nd step can be solved by using appropriate CLONE_ flags and the unshare
syscall, while there's currently no ways for solving the 1st one.

One of the ways for checking whether two tasks share e.g. mm_struct is to
provide some mm_struct ID of a task to its proc file, but showing such
info considered to be not that good for security reasons.

Thus after some debates we end up in conclusion that using that named
'comparision' syscall might be the best candidate. So here is it --
__NR_kcmp.

It takes up to 5 agruments - the pids of the two tasks (which
characteristics should be compared), the comparision type and
(in case of comparision of files) two file descriptors.

At moment only x86 is supported.

v3:
 - Obfuscate kernel pointer addresses so still sorting
   is possible but order is not the same as in memory.

Signed-off-by: Cyrill Gorcunov <gorcunov@openvz.org>
CC: "Eric W. Biederman" <ebiederm@xmission.com>
CC: Pavel Emelyanov <xemul@parallels.com>
CC: Andrey Vagin <avagin@openvz.org>
CC: KOSAKI Motohiro <kosaki.motohiro@gmail.com>
CC: Ingo Molnar <mingo@elte.hu>
CC: H. Peter Anvin <hpa@zytor.com>
CC: Thomas Gleixner <tglx@linutronix.de>
CC: Glauber Costa <glommer@parallels.com>
CC: Andi Kleen <andi@firstfloor.org>
CC: Tejun Heo <tj@kernel.org>
CC: Matt Helsley <matthltc@us.ibm.com>
CC: Pekka Enberg <penberg@kernel.org>
CC: Eric Dumazet <eric.dumazet@gmail.com>
CC: Vasiliy Kulikov <segoon@openwall.com>
CC: Andrew Morton <akpm@linux-foundation.org>
CC: Alexey Dobriyan <adobriyan@gmail.com>
CC: Valdis.Kletnieks@vt.edu
---
 arch/x86/include/asm/kcmp.h      |   17 +++
 arch/x86/include/asm/syscalls.h  |    4 
 arch/x86/kernel/Makefile         |    1 
 arch/x86/kernel/kcmp.c           |  182 +++++++++++++++++++++++++++++++++++++++
 arch/x86/syscalls/syscall_32.tbl |    1 
 arch/x86/syscalls/syscall_64.tbl |    1 
 6 files changed, 206 insertions(+)

Index: linux-2.6.git/arch/x86/include/asm/kcmp.h
===================================================================
--- /dev/null
+++ linux-2.6.git/arch/x86/include/asm/kcmp.h
@@ -0,0 +1,17 @@
+#ifndef _LINUX_KCMP_H
+#define _LINUX_KCMP_H
+
+/* Comparision type */
+enum {
+	KCMP_FILE,
+	KCMP_VM,
+	KCMP_FILES,
+	KCMP_FS,
+	KCMP_SIGHAND,
+	KCMP_IO,
+	KCMP_SYSVSEM,
+
+	KCMP_TYPES,
+};
+
+#endif /* _LINUX_KCMP_H */
Index: linux-2.6.git/arch/x86/include/asm/syscalls.h
===================================================================
--- linux-2.6.git.orig/arch/x86/include/asm/syscalls.h
+++ linux-2.6.git/arch/x86/include/asm/syscalls.h
@@ -42,6 +42,10 @@ long sys_sigaltstack(const stack_t __use
 asmlinkage int sys_set_thread_area(struct user_desc __user *);
 asmlinkage int sys_get_thread_area(struct user_desc __user *);
 
+/* kenrel/kcmp.c */
+asmlinkage long sys_kcmp(pid_t pid1, pid_t pid2, int type,
+			 unsigned long idx1, unsigned long idx2);
+
 /* X86_32 only */
 #ifdef CONFIG_X86_32
 
Index: linux-2.6.git/arch/x86/kernel/Makefile
===================================================================
--- linux-2.6.git.orig/arch/x86/kernel/Makefile
+++ linux-2.6.git/arch/x86/kernel/Makefile
@@ -34,6 +34,7 @@ obj-y			+= alternative.o i8253.o pci-nom
 obj-y			+= tsc.o io_delay.o rtc.o
 obj-y			+= pci-iommu_table.o
 obj-y			+= resource.o
+obj-y			+= kcmp.o
 
 obj-y				+= trampoline.o trampoline_$(BITS).o
 obj-y				+= process.o
Index: linux-2.6.git/arch/x86/kernel/kcmp.c
===================================================================
--- /dev/null
+++ linux-2.6.git/arch/x86/kernel/kcmp.c
@@ -0,0 +1,182 @@
+#include <linux/kernel.h>
+#include <linux/syscalls.h>
+#include <linux/fdtable.h>
+#include <linux/string.h>
+#include <linux/random.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/cache.h>
+#include <linux/bug.h>
+#include <linux/err.h>
+
+#include <asm/unistd.h>
+#include <asm/kcmp.h>
+
+static unsigned long cookies[KCMP_TYPES][2] __read_mostly;
+static bool cookies_valid __read_mostly;
+
+static long kptr_obfuscate(long v, int type)
+{
+	return (v + cookies[type][0]) ^ cookies[type][1];
+}
+
+/*
+ * 0 - equal
+ * 1 - less than
+ * 2 - greater than
+ * 3 - not equal but ordering unavailable
+ */
+static int kcmp_ptr(long v1, long v2, int type)
+{
+	long ret;
+
+	ret = kptr_obfuscate(v1, type) - kptr_obfuscate(v2, type);
+
+	if (likely(cookies_valid))
+		return (ret < 0) | ((ret > 0) << 1);
+
+	/*
+	 * Cookies failed to init,
+	 * still it's safe to point if
+	 * objects are equal.
+	 */
+	return ret ? 3 : 0;
+}
+
+#define KCMP_TASK_PTR(task1, task2, member, type)	\
+	kcmp_ptr((long)(task1)->member,			\
+		 (long)(task2)->member,			\
+		 type)
+
+#define KCMP_PTR(ptr1, ptr2, type)			\
+	kcmp_ptr((long)ptr1, (long)ptr2, type)
+
+/* A caller must be sure the task is presented in memory */
+static struct file *
+get_file_raw_ptr(struct task_struct *task, unsigned int idx)
+{
+	struct fdtable *fdt;
+	struct file *file;
+
+	spin_lock(&task->files->file_lock);
+	fdt = files_fdtable(task->files);
+	if (idx < fdt->max_fds)
+		file = fdt->fd[idx];
+	else
+		file = NULL;
+	spin_unlock(&task->files->file_lock);
+
+	return file;
+}
+
+SYSCALL_DEFINE5(kcmp, pid_t, pid1, pid_t, pid2, int, type,
+		unsigned long, idx1, unsigned long, idx2)
+{
+	struct task_struct *task1;
+	struct task_struct *task2;
+	int ret = 0;
+
+	rcu_read_lock();
+
+	task1 = find_task_by_vpid(pid1);
+	if (!task1) {
+		rcu_read_unlock();
+		return -ESRCH;
+	}
+
+	task2 = find_task_by_vpid(pid2);
+	if (!task2) {
+		put_task_struct(task1);
+		rcu_read_unlock();
+		return -ESRCH;
+	}
+
+	get_task_struct(task1);
+	get_task_struct(task2);
+
+	rcu_read_unlock();
+
+	if (!ptrace_may_access(task1, PTRACE_MODE_READ) ||
+	    !ptrace_may_access(task2, PTRACE_MODE_READ)) {
+		ret = -EACCES;
+		goto err;
+	}
+
+	/*
+	 * Note for all cases but the KCMP_FILE we
+	 * don't take any locks and do a plain pointer
+	 * comparision in a sake of speed.
+	 */
+
+	switch (type) {
+	case KCMP_FILE: {
+		struct file *filp1, *filp2;
+
+		filp1 = get_file_raw_ptr(task1, idx1);
+		filp2 = get_file_raw_ptr(task2, idx2);
+
+		if (filp1 && filp2)
+			ret = KCMP_PTR(filp1, filp2, KCMP_FILE);
+		else
+			ret = -ENOENT;
+		break;
+	}
+	case KCMP_VM:
+		ret = KCMP_TASK_PTR(task1, task2, mm, KCMP_VM);
+		break;
+	case KCMP_FILES:
+		ret = KCMP_TASK_PTR(task1, task2, files, KCMP_FILES);
+		break;
+	case KCMP_FS:
+		ret = KCMP_TASK_PTR(task1, task2, fs, KCMP_FS);
+		break;
+	case KCMP_SIGHAND:
+		ret = KCMP_TASK_PTR(task1, task2, sighand, KCMP_SIGHAND);
+		break;
+	case KCMP_IO:
+		ret = KCMP_TASK_PTR(task1, task2, io_context, KCMP_IO);
+		break;
+	case KCMP_SYSVSEM:
+#ifdef CONFIG_SYSVIPC
+		ret = KCMP_TASK_PTR(task1, task2, sysvsem.undo_list, KCMP_SYSVSEM);
+#else
+		ret = -ENOENT;
+		goto err;
+#endif
+		break;
+	default:
+		ret = -EINVAL;
+		goto err;
+	}
+
+err:
+	put_task_struct(task1);
+	put_task_struct(task2);
+
+	return ret;
+}
+
+static __init int kcmp_cookie_init(void)
+{
+	int i, j;
+
+	for (i = 0; i < KCMP_TYPES; i++) {
+		for (j = 0; j < 2; j++) {
+			get_random_bytes(&cookies[i][j], sizeof(cookies[i][j]));
+
+			if (cookies[i][j])
+				continue;
+
+			/*
+			 * This is impossible case but just to be sure.
+			 */
+			cookies_valid = false;
+			WARN_ONCE(1, "Can't get random bytes for k-pointers\n");
+		}
+	}
+
+	cookies_valid = true;
+
+	return 0;
+}
+late_initcall(kcmp_cookie_init);
Index: linux-2.6.git/arch/x86/syscalls/syscall_32.tbl
===================================================================
--- linux-2.6.git.orig/arch/x86/syscalls/syscall_32.tbl
+++ linux-2.6.git/arch/x86/syscalls/syscall_32.tbl
@@ -355,3 +355,4 @@
 346	i386	setns			sys_setns
 347	i386	process_vm_readv	sys_process_vm_readv		compat_sys_process_vm_readv
 348	i386	process_vm_writev	sys_process_vm_writev		compat_sys_process_vm_writev
+349	i386	kcmp			sys_kcmp
Index: linux-2.6.git/arch/x86/syscalls/syscall_64.tbl
===================================================================
--- linux-2.6.git.orig/arch/x86/syscalls/syscall_64.tbl
+++ linux-2.6.git/arch/x86/syscalls/syscall_64.tbl
@@ -318,3 +318,4 @@
 309	64	getcpu			sys_getcpu
 310	64	process_vm_readv	sys_process_vm_readv
 311	64	process_vm_writev	sys_process_vm_writev
+312	64	kcmp			sys_kcmp

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [RFC] syscalls, x86: Add __NR_kcmp syscall
  2012-01-20  8:40                   ` Cyrill Gorcunov
@ 2012-01-20  9:02                     ` Cyrill Gorcunov
  2012-01-20 14:51                       ` H. Peter Anvin
  0 siblings, 1 reply; 27+ messages in thread
From: Cyrill Gorcunov @ 2012-01-20  9:02 UTC (permalink / raw)
  To: Eric W. Biederman, H. Peter Anvin, Pavel Emelyanov, KOSAKI Motohiro
  Cc: david, Alexey Dobriyan, LKML, Andrey Vagin, Ingo Molnar,
	Thomas Gleixner, Glauber Costa, Andi Kleen, Tejun Heo,
	Matt Helsley, Pekka Enberg, Eric Dumazet, Vasiliy Kulikov,
	Andrew Morton, Valdis.Kletnieks

On Fri, Jan 20, 2012 at 12:40:08PM +0400, Cyrill Gorcunov wrote:
...
> +
> +static __init int kcmp_cookie_init(void)
> +{
> +	int i, j;
> +
> +	for (i = 0; i < KCMP_TYPES; i++) {
> +		for (j = 0; j < 2; j++) {
> +			get_random_bytes(&cookies[i][j], sizeof(cookies[i][j]));
> +
> +			if (cookies[i][j])
> +				continue;
> +
> +			/*
> +			 * This is impossible case but just to be sure.
> +			 */
> +			cookies_valid = false;
> +			WARN_ONCE(1, "Can't get random bytes for k-pointers\n");
> +		}
> +	}
> +
> +	cookies_valid = true;

darn, this string of course should be on top,
i'll update don't complain on this nit.

> +
> +	return 0;
> +}

	Cyrill

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [RFC] syscalls, x86: Add __NR_kcmp syscall
  2012-01-20  9:02                     ` Cyrill Gorcunov
@ 2012-01-20 14:51                       ` H. Peter Anvin
  2012-01-20 16:29                         ` Cyrill Gorcunov
  0 siblings, 1 reply; 27+ messages in thread
From: H. Peter Anvin @ 2012-01-20 14:51 UTC (permalink / raw)
  To: Cyrill Gorcunov
  Cc: Eric W. Biederman, Pavel Emelyanov, KOSAKI Motohiro, david,
	Alexey Dobriyan, LKML, Andrey Vagin, Ingo Molnar,
	Thomas Gleixner, Glauber Costa, Andi Kleen, Tejun Heo,
	Matt Helsley, Pekka Enberg, Eric Dumazet, Vasiliy Kulikov,
	Andrew Morton, Valdis.Kletnieks

On 01/20/2012 01:02 AM, Cyrill Gorcunov wrote:
> On Fri, Jan 20, 2012 at 12:40:08PM +0400, Cyrill Gorcunov wrote:
> ...
>> +
>> +static __init int kcmp_cookie_init(void)
>> +{
>> +	int i, j;
>> +
>> +	for (i = 0; i < KCMP_TYPES; i++) {
>> +		for (j = 0; j < 2; j++) {
>> +			get_random_bytes(&cookies[i][j], sizeof(cookies[i][j]));
>> +
>> +			if (cookies[i][j])
>> +				continue;
>> +
>> +			/*
>> +			 * This is impossible case but just to be sure.
>> +			 */
>> +			cookies_valid = false;
>> +			WARN_ONCE(1, "Can't get random bytes for k-pointers\n");
>> +		}
>> +	}
>> +
>> +	cookies_valid = true;
> 
> darn, this string of course should be on top,
> i'll update don't complain on this nit.
> 

This code is wrong.  You will have a zero cookie, legitimately, once in
2^32 or 2^64 attempts, depending on the bitness.

The other thing is that for the multiplicative cookie you should OR in
the value (~(~0UL >> 1) | 1) in order to make sure that the value is (a)
large and (b) odd.

	-hpa

-- 
H. Peter Anvin, Intel Open Source Technology Center
I work for Intel.  I don't speak on their behalf.


^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [RFC] syscalls, x86: Add __NR_kcmp syscall
  2012-01-20 14:51                       ` H. Peter Anvin
@ 2012-01-20 16:29                         ` Cyrill Gorcunov
  2012-01-20 16:57                           ` H. Peter Anvin
  0 siblings, 1 reply; 27+ messages in thread
From: Cyrill Gorcunov @ 2012-01-20 16:29 UTC (permalink / raw)
  To: H. Peter Anvin
  Cc: Eric W. Biederman, Pavel Emelyanov, KOSAKI Motohiro, david,
	Alexey Dobriyan, LKML, Andrey Vagin, Ingo Molnar,
	Thomas Gleixner, Glauber Costa, Andi Kleen, Tejun Heo,
	Matt Helsley, Pekka Enberg, Eric Dumazet, Vasiliy Kulikov,
	Andrew Morton, Valdis.Kletnieks

On Fri, Jan 20, 2012 at 06:51:31AM -0800, H. Peter Anvin wrote:
> On 01/20/2012 01:02 AM, Cyrill Gorcunov wrote:
> > On Fri, Jan 20, 2012 at 12:40:08PM +0400, Cyrill Gorcunov wrote:
> > ...
> >> +
> >> +static __init int kcmp_cookie_init(void)
> >> +{
> >> +	int i, j;
> >> +
> >> +	for (i = 0; i < KCMP_TYPES; i++) {
> >> +		for (j = 0; j < 2; j++) {
> >> +			get_random_bytes(&cookies[i][j], sizeof(cookies[i][j]));
> >> +
> >> +			if (cookies[i][j])
> >> +				continue;
> >> +
> >> +			/*
> >> +			 * This is impossible case but just to be sure.
> >> +			 */
> >> +			cookies_valid = false;
> >> +			WARN_ONCE(1, "Can't get random bytes for k-pointers\n");
> >> +		}
> >> +	}
> >> +
> >> +	cookies_valid = true;
> > 
> > darn, this string of course should be on top,
> > i'll update don't complain on this nit.
> > 
> 
> This code is wrong.  You will have a zero cookie, legitimately, once in
> 2^32 or 2^64 attempts, depending on the bitness.
>

We've had kind of emergency pool before which were used in such cases.
These legitime cases can happen not that frequently, so I guess -- can
we use it again?

	if (cookies[i][j])
		continue;
	else
		cookies[i][j] = some-value;

One wont know which exactly values we were using.

> The other thing is that for the multiplicative cookie you should OR in
> the value (~(~0UL >> 1) | 1) in order to make sure that the value is (a)
> large and (b) odd.
> 

I see, i'll update (at first, I occasionally translated odd as 'even' which
made me really scratching the head, until I realized it's 'odd'! :)

	Cyrill

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [RFC] syscalls, x86: Add __NR_kcmp syscall
  2012-01-20 16:29                         ` Cyrill Gorcunov
@ 2012-01-20 16:57                           ` H. Peter Anvin
  2012-01-20 18:19                             ` Cyrill Gorcunov
  0 siblings, 1 reply; 27+ messages in thread
From: H. Peter Anvin @ 2012-01-20 16:57 UTC (permalink / raw)
  To: Cyrill Gorcunov
  Cc: Eric W. Biederman, Pavel Emelyanov, KOSAKI Motohiro, david,
	Alexey Dobriyan, LKML, Andrey Vagin, Ingo Molnar,
	Thomas Gleixner, Glauber Costa, Andi Kleen, Tejun Heo,
	Matt Helsley, Pekka Enberg, Eric Dumazet, Vasiliy Kulikov,
	Andrew Morton, Valdis.Kletnieks

On 01/20/2012 08:29 AM, Cyrill Gorcunov wrote:
>
> We've had kind of emergency pool before which were used in such cases.
> These legitime cases can happen not that frequently, so I guess -- can
> we use it again?
>
> 	if (cookies[i][j])
> 		continue;
> 	else
> 		cookies[i][j] = some-value;
>
> One wont know which exactly values we were using.
>

I would not worry about get_random_bytes() returning nothing... if so, a 
lot of other places in the kernel would be broken.

>> The other thing is that for the multiplicative cookie you should OR in
>> the value (~(~0UL>>  1) | 1) in order to make sure that the value is (a)
>> large and (b) odd.
>>
>
> I see, i'll update (at first, I occasionally translated odd as 'even' which
> made me really scratching the head, until I realized it's 'odd'! :)
>
> 	Cyrill


^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [RFC] syscalls, x86: Add __NR_kcmp syscall
  2012-01-20 16:57                           ` H. Peter Anvin
@ 2012-01-20 18:19                             ` Cyrill Gorcunov
  2012-01-20 18:22                               ` Cyrill Gorcunov
  0 siblings, 1 reply; 27+ messages in thread
From: Cyrill Gorcunov @ 2012-01-20 18:19 UTC (permalink / raw)
  To: H. Peter Anvin
  Cc: Eric W. Biederman, Pavel Emelyanov, KOSAKI Motohiro, david,
	Alexey Dobriyan, LKML, Andrey Vagin, Ingo Molnar,
	Thomas Gleixner, Glauber Costa, Andi Kleen, Tejun Heo,
	Matt Helsley, Pekka Enberg, Eric Dumazet, Vasiliy Kulikov,
	Andrew Morton, Valdis.Kletnieks

On Fri, Jan 20, 2012 at 08:57:11AM -0800, H. Peter Anvin wrote:
> On 01/20/2012 08:29 AM, Cyrill Gorcunov wrote:
> 
> I would not worry about get_random_bytes() returning nothing... if
> so, a lot of other places in the kernel would be broken.
> 

Something like below?

	Cyrill
---
From: Cyrill Gorcunov <gorcunov@openvz.org>
Subject: [RFC] syscalls, x86: Add __NR_kcmp syscall v4

While doing the checkpoint-restore in the userspace one need to determine
whether various kernel objects (like mm_struct-s of file_struct-s) are shared
between tasks and restore this state.

The 2nd step can be solved by using appropriate CLONE_ flags and the unshare
syscall, while there's currently no ways for solving the 1st one.

One of the ways for checking whether two tasks share e.g. mm_struct is to
provide some mm_struct ID of a task to its proc file, but showing such
info considered to be not that good for security reasons.

Thus after some debates we end up in conclusion that using that named
'comparision' syscall might be the best candidate. So here is it --
__NR_kcmp.

It takes up to 5 agruments - the pids of the two tasks (which
characteristics should be compared), the comparision type and
(in case of comparision of files) two file descriptors.

At moment only x86 is supported.

Signed-off-by: Cyrill Gorcunov <gorcunov@openvz.org>
CC: "Eric W. Biederman" <ebiederm@xmission.com>
CC: Pavel Emelyanov <xemul@parallels.com>
CC: Andrey Vagin <avagin@openvz.org>
CC: KOSAKI Motohiro <kosaki.motohiro@gmail.com>
CC: Ingo Molnar <mingo@elte.hu>
CC: H. Peter Anvin <hpa@zytor.com>
CC: Thomas Gleixner <tglx@linutronix.de>
CC: Glauber Costa <glommer@parallels.com>
CC: Andi Kleen <andi@firstfloor.org>
CC: Tejun Heo <tj@kernel.org>
CC: Matt Helsley <matthltc@us.ibm.com>
CC: Pekka Enberg <penberg@kernel.org>
CC: Eric Dumazet <eric.dumazet@gmail.com>
CC: Vasiliy Kulikov <segoon@openwall.com>
CC: Andrew Morton <akpm@linux-foundation.org>
CC: Alexey Dobriyan <adobriyan@gmail.com>
CC: Valdis.Kletnieks@vt.edu
---
 arch/x86/include/asm/kcmp.h      |   17 ++++
 arch/x86/include/asm/syscalls.h  |    4 
 arch/x86/kernel/Makefile         |    1 
 arch/x86/kernel/kcmp.c           |  163 +++++++++++++++++++++++++++++++++++++++
 arch/x86/syscalls/syscall_32.tbl |    1 
 arch/x86/syscalls/syscall_64.tbl |    1 
 6 files changed, 187 insertions(+)

Index: linux-2.6.git/arch/x86/include/asm/kcmp.h
===================================================================
--- /dev/null
+++ linux-2.6.git/arch/x86/include/asm/kcmp.h
@@ -0,0 +1,17 @@
+#ifndef _LINUX_KCMP_H
+#define _LINUX_KCMP_H
+
+/* Comparision type */
+enum {
+	KCMP_FILE,
+	KCMP_VM,
+	KCMP_FILES,
+	KCMP_FS,
+	KCMP_SIGHAND,
+	KCMP_IO,
+	KCMP_SYSVSEM,
+
+	KCMP_TYPES,
+};
+
+#endif /* _LINUX_KCMP_H */
Index: linux-2.6.git/arch/x86/include/asm/syscalls.h
===================================================================
--- linux-2.6.git.orig/arch/x86/include/asm/syscalls.h
+++ linux-2.6.git/arch/x86/include/asm/syscalls.h
@@ -42,6 +42,10 @@ long sys_sigaltstack(const stack_t __use
 asmlinkage int sys_set_thread_area(struct user_desc __user *);
 asmlinkage int sys_get_thread_area(struct user_desc __user *);
 
+/* kenrel/kcmp.c */
+asmlinkage long sys_kcmp(pid_t pid1, pid_t pid2, int type,
+			 unsigned long idx1, unsigned long idx2);
+
 /* X86_32 only */
 #ifdef CONFIG_X86_32
 
Index: linux-2.6.git/arch/x86/kernel/Makefile
===================================================================
--- linux-2.6.git.orig/arch/x86/kernel/Makefile
+++ linux-2.6.git/arch/x86/kernel/Makefile
@@ -34,6 +34,7 @@ obj-y			+= alternative.o i8253.o pci-nom
 obj-y			+= tsc.o io_delay.o rtc.o
 obj-y			+= pci-iommu_table.o
 obj-y			+= resource.o
+obj-y			+= kcmp.o
 
 obj-y				+= trampoline.o trampoline_$(BITS).o
 obj-y				+= process.o
Index: linux-2.6.git/arch/x86/kernel/kcmp.c
===================================================================
--- /dev/null
+++ linux-2.6.git/arch/x86/kernel/kcmp.c
@@ -0,0 +1,163 @@
+#include <linux/kernel.h>
+#include <linux/syscalls.h>
+#include <linux/fdtable.h>
+#include <linux/string.h>
+#include <linux/random.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/cache.h>
+#include <linux/bug.h>
+#include <linux/err.h>
+
+#include <asm/unistd.h>
+#include <asm/kcmp.h>
+
+static unsigned long cookies[KCMP_TYPES][2] __read_mostly;
+
+static long kptr_obfuscate(long v, int type)
+{
+	return (v + cookies[type][0]) ^ cookies[type][1];
+}
+
+/*
+ * 0 - equal
+ * 1 - less than
+ * 2 - greater than
+ * 3 - not equal but ordering unavailable
+ */
+static int kcmp_ptr(long v1, long v2, int type)
+{
+	long ret;
+
+	ret = kptr_obfuscate(v1, type) - kptr_obfuscate(v2, type);
+
+	return (ret < 0) | ((ret > 0) << 1);
+}
+
+#define KCMP_TASK_PTR(task1, task2, member, type)	\
+	kcmp_ptr((long)(task1)->member,			\
+		 (long)(task2)->member,			\
+		 type)
+
+#define KCMP_PTR(ptr1, ptr2, type)			\
+	kcmp_ptr((long)ptr1, (long)ptr2, type)
+
+/* A caller must be sure the task is presented in memory */
+static struct file *
+get_file_raw_ptr(struct task_struct *task, unsigned int idx)
+{
+	struct fdtable *fdt;
+	struct file *file;
+
+	spin_lock(&task->files->file_lock);
+	fdt = files_fdtable(task->files);
+	if (idx < fdt->max_fds)
+		file = fdt->fd[idx];
+	else
+		file = NULL;
+	spin_unlock(&task->files->file_lock);
+
+	return file;
+}
+
+SYSCALL_DEFINE5(kcmp, pid_t, pid1, pid_t, pid2, int, type,
+		unsigned long, idx1, unsigned long, idx2)
+{
+	struct task_struct *task1;
+	struct task_struct *task2;
+	int ret = 0;
+
+	rcu_read_lock();
+
+	task1 = find_task_by_vpid(pid1);
+	if (!task1) {
+		rcu_read_unlock();
+		return -ESRCH;
+	}
+
+	task2 = find_task_by_vpid(pid2);
+	if (!task2) {
+		put_task_struct(task1);
+		rcu_read_unlock();
+		return -ESRCH;
+	}
+
+	get_task_struct(task1);
+	get_task_struct(task2);
+
+	rcu_read_unlock();
+
+	if (!ptrace_may_access(task1, PTRACE_MODE_READ) ||
+	    !ptrace_may_access(task2, PTRACE_MODE_READ)) {
+		ret = -EACCES;
+		goto err;
+	}
+
+	/*
+	 * Note for all cases but the KCMP_FILE we
+	 * don't take any locks in a sake of speed.
+	 */
+
+	switch (type) {
+	case KCMP_FILE: {
+		struct file *filp1, *filp2;
+
+		filp1 = get_file_raw_ptr(task1, idx1);
+		filp2 = get_file_raw_ptr(task2, idx2);
+
+		if (filp1 && filp2)
+			ret = KCMP_PTR(filp1, filp2, KCMP_FILE);
+		else
+			ret = -ENOENT;
+		break;
+	}
+	case KCMP_VM:
+		ret = KCMP_TASK_PTR(task1, task2, mm, KCMP_VM);
+		break;
+	case KCMP_FILES:
+		ret = KCMP_TASK_PTR(task1, task2, files, KCMP_FILES);
+		break;
+	case KCMP_FS:
+		ret = KCMP_TASK_PTR(task1, task2, fs, KCMP_FS);
+		break;
+	case KCMP_SIGHAND:
+		ret = KCMP_TASK_PTR(task1, task2, sighand, KCMP_SIGHAND);
+		break;
+	case KCMP_IO:
+		ret = KCMP_TASK_PTR(task1, task2, io_context, KCMP_IO);
+		break;
+	case KCMP_SYSVSEM:
+#ifdef CONFIG_SYSVIPC
+		ret = KCMP_TASK_PTR(task1, task2, sysvsem.undo_list, KCMP_SYSVSEM);
+#else
+		ret = -ENOENT;
+		goto err;
+#endif
+		break;
+	default:
+		ret = -EINVAL;
+		goto err;
+	}
+
+err:
+	put_task_struct(task1);
+	put_task_struct(task2);
+
+	return ret;
+}
+
+static __init int kcmp_cookie_init(void)
+{
+	int i, j;
+
+	for (i = 0; i < KCMP_TYPES; i++) {
+		for (j = 0; j < 2; j++) {
+			get_random_bytes(&cookies[i][j],
+					 sizeof(cookies[i][j]));
+			cookies[i][0] |= (~(~0UL >>  1) | 1);
+		}
+	}
+
+	return 0;
+}
+late_initcall(kcmp_cookie_init);
Index: linux-2.6.git/arch/x86/syscalls/syscall_32.tbl
===================================================================
--- linux-2.6.git.orig/arch/x86/syscalls/syscall_32.tbl
+++ linux-2.6.git/arch/x86/syscalls/syscall_32.tbl
@@ -355,3 +355,4 @@
 346	i386	setns			sys_setns
 347	i386	process_vm_readv	sys_process_vm_readv		compat_sys_process_vm_readv
 348	i386	process_vm_writev	sys_process_vm_writev		compat_sys_process_vm_writev
+349	i386	kcmp			sys_kcmp
Index: linux-2.6.git/arch/x86/syscalls/syscall_64.tbl
===================================================================
--- linux-2.6.git.orig/arch/x86/syscalls/syscall_64.tbl
+++ linux-2.6.git/arch/x86/syscalls/syscall_64.tbl
@@ -318,3 +318,4 @@
 309	64	getcpu			sys_getcpu
 310	64	process_vm_readv	sys_process_vm_readv
 311	64	process_vm_writev	sys_process_vm_writev
+312	64	kcmp			sys_kcmp

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [RFC] syscalls, x86: Add __NR_kcmp syscall
  2012-01-20 18:19                             ` Cyrill Gorcunov
@ 2012-01-20 18:22                               ` Cyrill Gorcunov
  0 siblings, 0 replies; 27+ messages in thread
From: Cyrill Gorcunov @ 2012-01-20 18:22 UTC (permalink / raw)
  To: H. Peter Anvin
  Cc: Eric W. Biederman, Pavel Emelyanov, KOSAKI Motohiro, david,
	Alexey Dobriyan, LKML, Andrey Vagin, Ingo Molnar,
	Thomas Gleixner, Glauber Costa, Andi Kleen, Tejun Heo,
	Matt Helsley, Pekka Enberg, Eric Dumazet, Vasiliy Kulikov,
	Andrew Morton, Valdis.Kletnieks

On Fri, Jan 20, 2012 at 10:19:54PM +0400, Cyrill Gorcunov wrote:
> On Fri, Jan 20, 2012 at 08:57:11AM -0800, H. Peter Anvin wrote:
> > On 01/20/2012 08:29 AM, Cyrill Gorcunov wrote:
> > 
> > I would not worry about get_random_bytes() returning nothing... if
> > so, a lot of other places in the kernel would be broken.
> > 
> 
> Something like below?
> 
> 	Cyrill
> ---
...
> +
> +static __init int kcmp_cookie_init(void)
> +{
> +	int i, j;
> +
> +	for (i = 0; i < KCMP_TYPES; i++) {
> +		for (j = 0; j < 2; j++) {
> +			get_random_bytes(&cookies[i][j],
> +					 sizeof(cookies[i][j]));
> +			cookies[i][0] |= (~(~0UL >>  1) | 1);

Sigh, what a day. cookies[i][j]!

> +		}
> +	}
> +
> +	return 0;
> +}

	Cyrill

^ permalink raw reply	[flat|nested] 27+ messages in thread

end of thread, other threads:[~2012-01-20 18:22 UTC | newest]

Thread overview: 27+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2012-01-17 14:27 [RFC] syscalls, x86: Add __NR_kcmp syscall Cyrill Gorcunov
2012-01-17 14:38 ` Alexey Dobriyan
2012-01-17 14:44   ` Cyrill Gorcunov
2012-01-17 18:47     ` H. Peter Anvin
2012-01-17 21:15       ` Cyrill Gorcunov
2012-01-17 21:40         ` Eric W. Biederman
2012-01-18  5:07           ` Pavel Emelyanov
2012-01-17 21:35       ` Eric W. Biederman
2012-01-18  8:01         ` Cyrill Gorcunov
2012-01-18  9:12           ` KOSAKI Motohiro
2012-01-18  9:19             ` Pavel Emelyanov
2012-01-18  9:23               ` KOSAKI Motohiro
2012-01-18 11:57                 ` Cyrill Gorcunov
2012-01-18 16:46                   ` KOSAKI Motohiro
2012-01-18 17:20                     ` Cyrill Gorcunov
2012-01-18 22:05         ` david
2012-01-18 22:49           ` Cyrill Gorcunov
2012-01-18 23:29             ` Eric W. Biederman
2012-01-19  6:55               ` Cyrill Gorcunov
2012-01-20  3:16                 ` Eric W. Biederman
2012-01-20  8:40                   ` Cyrill Gorcunov
2012-01-20  9:02                     ` Cyrill Gorcunov
2012-01-20 14:51                       ` H. Peter Anvin
2012-01-20 16:29                         ` Cyrill Gorcunov
2012-01-20 16:57                           ` H. Peter Anvin
2012-01-20 18:19                             ` Cyrill Gorcunov
2012-01-20 18:22                               ` Cyrill Gorcunov

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).