* [RFC][PATCH] Cross Memory Attach
@ 2010-09-15  1:18 Christopher Yeoh
  2010-09-15  8:02   ` Ingo Molnar
                   ` (3 more replies)
  0 siblings, 4 replies; 62+ messages in thread
From: Christopher Yeoh @ 2010-09-15  1:18 UTC (permalink / raw)
  To: linux-kernel


The basic idea behind cross memory attach is to allow MPI programs doing
intra-node communication to do a single copy of the message rather than
a double copy of the message via shared memory.

The following patch attempts to achieve this by allowing a
destination process, given an address and size from a source process, to
copy memory directly from the source process into its own address space
via a system call. There is also a symmetrical ability to copy from 
the current process's address space into a destination process's
address space.
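To make the interface concrete, a receiving process would use it roughly
like this (illustrative sketch only: the syscall number is the x86-32 one
added by this patch, and the peer's pid and buffer address are assumed to
have been exchanged out of band, e.g. over shared memory):

#include <stdio.h>
#include <stdlib.h>
#include <sys/syscall.h>
#include <sys/types.h>
#include <unistd.h>

/* Syscall number from the x86-32 hunk of this patch; it only exists
 * in a kernel with the patch applied. */
#ifndef __NR_copy_from_process
#define __NR_copy_from_process 338
#endif

static long copy_from_process(pid_t pid, unsigned long addr,
			      unsigned long len, char *buf, int flags)
{
	return syscall(__NR_copy_from_process, pid, addr, len, buf, flags);
}

int main(int argc, char **argv)
{
	char buf[4096];
	long n;

	if (argc < 3)
		return 1;

	/* In an MPI library the peer pid and the address of its send
	 * buffer would arrive via metadata exchanged over shared memory;
	 * here they are just command-line arguments. */
	n = copy_from_process(atoi(argv[1]), strtoul(argv[2], NULL, 0),
			      sizeof(buf), buf, 0);
	if (n < 0)
		perror("copy_from_process");
	else
		printf("copied %ld bytes\n", n);
	return 0;
}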

Use of vmsplice was considered instead, but it has problems. Since it
needs the reader and writer to work co-operatively, you block if the
pipe is not drained, which requires either wrapping the send side to be
non-blocking or polling on the receive side. In all-to-all communication
it requires careful ordering, otherwise you can deadlock. And when many
MPI tasks write to one MPI task, vmsplice serialises the copying.

I've added the use of this capability to OpenMPI and run some MPI
benchmarks on a 64-way (with SMT off) Power6 machine which see
improvements in the following areas:

HPCC results:
=============

MB/s			Num Processes	
Naturally Ordered	4	8	16	32
Base			1235	935	622	419
CMA			4741	3769	1977	703

			
MB/s			Num Processes	
Randomly Ordered	4	8	16	32
Base			1227	947	638	412
CMA			4666	3682	1978	710
				
MB/s			Num Processes	
Max Ping Pong		4	8	16	32
Base			2028	1938	1928	1882
CMA			7424	7510	7598	7708


NPB:
====
BT - 12% improvement
FT - 15% improvement
IS - 30% improvement
SP - 34% improvement

IMB:
====
		
Ping Pong - ~30% improvement
Ping Ping - ~120% improvement
SendRecv - ~100% improvement
Exchange - ~150% improvement
Gather(v) - ~20% improvement
Scatter(v) - ~20% improvement
AlltoAll(v) - 30-50% improvement

Patch is as below. Any comments?

Regards,

Chris
-- 
cyeoh@au.ibm.com


Signed-off-by: Chris Yeoh <cyeoh@au1.ibm.com>
--- 
 arch/powerpc/include/asm/systbl.h  |    2 
 arch/powerpc/include/asm/unistd.h  |    5 -
 arch/x86/include/asm/unistd_32.h   |    4 
 arch/x86/kernel/syscall_table_32.S |    2 
 include/linux/syscalls.h           |    6 +
 mm/memory.c                        |  184 +++++++++++++++++++++++++++++++++++++
 6 files changed, 200 insertions(+), 3 deletions(-)

diff --git a/arch/powerpc/include/asm/systbl.h b/arch/powerpc/include/asm/systbl.h
index a5ee345..d82a6be 100644
--- a/arch/powerpc/include/asm/systbl.h
+++ b/arch/powerpc/include/asm/systbl.h
@@ -326,3 +326,5 @@ SYSCALL_SPU(perf_event_open)
 COMPAT_SYS_SPU(preadv)
 COMPAT_SYS_SPU(pwritev)
 COMPAT_SYS(rt_tgsigqueueinfo)
+SYSCALL(copy_from_process)
+SYSCALL(copy_to_process)
diff --git a/arch/powerpc/include/asm/unistd.h b/arch/powerpc/include/asm/unistd.h
index f0a1026..40d46fc 100644
--- a/arch/powerpc/include/asm/unistd.h
+++ b/arch/powerpc/include/asm/unistd.h
@@ -345,10 +345,11 @@
 #define __NR_preadv		320
 #define __NR_pwritev		321
 #define __NR_rt_tgsigqueueinfo	322
-
+#define __NR_copy_from_process  323
+#define __NR_copy_to_process    324
 #ifdef __KERNEL__
 
-#define __NR_syscalls		323
+#define __NR_syscalls		325
 
 #define __NR__exit __NR_exit
 #define NR_syscalls	__NR_syscalls
diff --git a/arch/x86/include/asm/unistd_32.h b/arch/x86/include/asm/unistd_32.h
index beb9b5f..9c90a65 100644
--- a/arch/x86/include/asm/unistd_32.h
+++ b/arch/x86/include/asm/unistd_32.h
@@ -343,10 +343,12 @@
 #define __NR_rt_tgsigqueueinfo	335
 #define __NR_perf_event_open	336
 #define __NR_recvmmsg		337
+#define __NR_copy_from_process	338
+#define __NR_copy_to_process	339
 
 #ifdef __KERNEL__
 
-#define NR_syscalls 338
+#define NR_syscalls 340
 
 #define __ARCH_WANT_IPC_PARSE_VERSION
 #define __ARCH_WANT_OLD_READDIR
diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S
index 8b37293..984b766 100644
--- a/arch/x86/kernel/syscall_table_32.S
+++ b/arch/x86/kernel/syscall_table_32.S
@@ -337,3 +337,5 @@ ENTRY(sys_call_table)
 	.long sys_rt_tgsigqueueinfo	/* 335 */
 	.long sys_perf_event_open
 	.long sys_recvmmsg
+	.long sys_copy_from_process
+	.long sys_copy_to_process
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 13ebb54..64b64c3 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -825,5 +825,11 @@ asmlinkage long sys_mmap_pgoff(unsigned long addr, unsigned long len,
 			unsigned long prot, unsigned long flags,
 			unsigned long fd, unsigned long pgoff);
 asmlinkage long sys_old_mmap(struct mmap_arg_struct __user *arg);
+asmlinkage long sys_copy_from_process(pid_t pid, unsigned long addr,
+				      unsigned long len,
+				      char __user *buf, int flags);
+asmlinkage long sys_copy_to_process(pid_t pid, unsigned long addr,
+				    unsigned long len,
+				    char __user *buf, int flags);
 
 #endif
diff --git a/mm/memory.c b/mm/memory.c
index 119b7cc..64a6d7b 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -57,6 +57,7 @@
 #include <linux/swapops.h>
 #include <linux/elf.h>
 #include <linux/gfp.h>
+#include <linux/syscalls.h>
 
 #include <asm/io.h>
 #include <asm/pgalloc.h>
@@ -3487,6 +3488,189 @@ void print_vma_addr(char *prefix, unsigned long ip)
 	up_read(&current->mm->mmap_sem);
 }
 
+int copy_to_from_process_allowed(struct task_struct *task)
+{
+	/* Allow copy_to_from_process to access another process using
+	   the same criteria as a process would be allowed to ptrace
+	   that same process */
+	const struct cred *cred = current_cred(), *tcred;
+
+	rcu_read_lock();
+	tcred = __task_cred(task);
+	if ((cred->uid != tcred->euid ||
+	     cred->uid != tcred->suid ||
+	     cred->uid != tcred->uid  ||
+	     cred->gid != tcred->egid ||
+	     cred->gid != tcred->sgid ||
+	     cred->gid != tcred->gid) &&
+	    !capable(CAP_SYS_PTRACE)) {
+		rcu_read_unlock();
+		return 0;
+	}
+	rcu_read_unlock();
+	return 1;
+}
+
+
+
+static int copy_to_from_process_pages(struct task_struct *task,
+				      struct page **process_pages,
+				      unsigned long pa,
+				      unsigned long *bytes_copied,
+				      unsigned long start_offset,
+				      unsigned long len,
+				      char *user_buf,
+				      int copy_to,
+				      int nr_pages_remain)
+{
+	int pages_pinned;
+	void *target_kaddr;
+	int i;
+	int ret;
+	unsigned long bytes_to_copy;
+	int max_pages_per_loop = (PAGE_SIZE * 2) / sizeof(struct page *);
+	int nr_pages_to_copy = min(nr_pages_remain, max_pages_per_loop);
+	int rc = -EFAULT;
+	
+	/* Get the pages we're interested in */
+	pages_pinned = get_user_pages(task, task->mm, pa,
+				      nr_pages_to_copy,
+				      copy_to, 0, process_pages, NULL);
+
+	if (pages_pinned != nr_pages_to_copy)
+		goto end;
+
+	/* Do the copy for each page */
+	for (i = 0; i < nr_pages_to_copy; i++) {
+		target_kaddr = kmap(process_pages[i]) + start_offset;
+		bytes_to_copy = min(PAGE_SIZE - start_offset,
+				    len - *bytes_copied);
+		if (start_offset)
+			start_offset = 0;
+
+		if (copy_to) {
+			ret = copy_from_user(target_kaddr,
+					     user_buf + *bytes_copied,
+					     bytes_to_copy);
+			if (ret) {
+				kunmap(process_pages[i]);
+				goto end;
+			}
+		} else {
+			ret = copy_to_user(user_buf + *bytes_copied,
+					   target_kaddr, bytes_to_copy);
+			if (ret) {
+				kunmap(process_pages[i]);
+				goto end;
+			}
+		}
+		kunmap(process_pages[i]);
+		*bytes_copied += bytes_to_copy;
+	}
+
+	rc = nr_pages_to_copy;
+
+end:
+	for (i = 0; i < pages_pinned; i++) {
+		if (copy_to)
+			set_page_dirty_lock(process_pages[i]);
+		put_page(process_pages[i]);
+	}
+
+	return rc;
+}
+
+static int copy_to_from_process(pid_t pid, unsigned long addr,
+				unsigned long len,
+				char *user_buf, int flags, int copy_to)
+{
+	unsigned long pa = addr & PAGE_MASK;
+	unsigned long start_offset = addr - pa;
+	int nr_pages;
+	struct task_struct *task;
+	struct page **process_pages;
+	unsigned long bytes_copied = 0;
+	int rc;
+	int nr_pages_copied = 0;
+
+	/* Work out address and page range required */
+	if (len == 0)
+		return 0;
+	nr_pages = (addr + len - 1) / PAGE_SIZE - addr / PAGE_SIZE + 1;
+
+	/* Get process information */
+	rcu_read_lock();
+	task = find_task_by_vpid(pid); /* pid namespace?!? */
+	if (task)
+		get_task_struct(task);
+	rcu_read_unlock();
+	if (!task)
+		return -ESRCH;
+
+	task_lock(task);
+	if (!copy_to_from_process_allowed(task)) {
+		task_unlock(task);
+		rc = -EPERM;
+		goto end;
+	}
+	task_unlock(task);
+
+
+	/* For reliability don't try to kmalloc more than 2 pages worth */
+	process_pages = kmalloc(min(PAGE_SIZE * 2,
+				    sizeof(struct page *) * nr_pages),
+				GFP_KERNEL);
+
+	if (!process_pages) {
+		rc = -ENOMEM;
+		goto end;
+	}
+
+	down_read(&task->mm->mmap_sem);
+	while (nr_pages_copied < nr_pages) {
+		rc = copy_to_from_process_pages(task, process_pages,
+						pa,
+						&bytes_copied,
+						start_offset,
+						len,
+						user_buf,
+						copy_to,
+						nr_pages - nr_pages_copied);
+		start_offset = 0;
+
+		if (rc == -EFAULT)
+			goto free_mem;
+		else {
+			nr_pages_copied += rc;
+			pa += rc * PAGE_SIZE;
+		}
+	}
+
+	rc = bytes_copied;
+
+free_mem:
+	up_read(&task->mm->mmap_sem);
+	kfree(process_pages);
+
+end:
+	put_task_struct(task);
+	return rc;
+}
+
+SYSCALL_DEFINE5(copy_from_process, pid_t, pid, unsigned long, addr,
+		unsigned long, len, char __user *, buf, int, flags)
+{
+	return copy_to_from_process(pid, addr, len, buf, flags, 0);
+}
+
+
+SYSCALL_DEFINE5(copy_to_process, pid_t, pid, unsigned long, addr,
+		unsigned long, len, char __user *, buf, int, flags)
+{
+	return copy_to_from_process(pid, addr, len, buf, flags, 1);
+}
+
+
 #ifdef CONFIG_PROVE_LOCKING
 void might_fault(void)
 {


-- 
cyeoh@au.ibm.com


* Re: [RFC][PATCH] Cross Memory Attach
  2010-09-15  1:18 [RFC][PATCH] Cross Memory Attach Christopher Yeoh
@ 2010-09-15  8:02   ` Ingo Molnar
  2010-09-15 10:58   ` Avi Kivity
                     ` (2 subsequent siblings)
  3 siblings, 0 replies; 62+ messages in thread
From: Ingo Molnar @ 2010-09-15  8:02 UTC (permalink / raw)
  To: Christopher Yeoh
  Cc: linux-kernel, Andrew Morton, Linus Torvalds, Peter Zijlstra, linux-mm


(Interesting patch found on lkml, more folks Cc:-ed)

* Christopher Yeoh <cyeoh@au1.ibm.com> wrote:

> The basic idea behind cross memory attach is to allow MPI programs 
> doing intra-node communication to do a single copy of the message 
> rather than a double copy of the message via shared memory.
> 
> The following patch attempts to achieve this by allowing a destination 
> process, given an address and size from a source process, to copy 
> memory directly from the source process into its own address space via 
> a system call. There is also a symmetrical ability to copy from the 
> current process's address space into a destination process's address 
> space.
> 
> Use of vmsplice instead was considered, but has problems. Since you 
> need the reader and writer working co-operatively if the pipe is not 
> drained then you block. Which requires some wrapping to do non 
> blocking on the send side or polling on the receive. In all to all 
> communication it requires ordering otherwise you can deadlock. And in 
> the example of many MPI tasks writing to one MPI task vmsplice 
> serialises the copying.
> 
> I've added the use of this capability to OpenMPI and run some MPI 
> benchmarks on a 64-way (with SMT off) Power6 machine which see 
> improvements in the following areas:
> 
> HPCC results:
> =============
> 
> MB/s			Num Processes	
> Naturally Ordered	4	8	16	32
> Base			1235	935	622	419
> CMA			4741	3769	1977	703
> 
> 			
> MB/s			Num Processes	
> Randomly Ordered	4	8	16	32
> Base			1227	947	638	412
> CMA			4666	3682	1978	710
> 				
> MB/s			Num Processes	
> Max Ping Pong		4	8	16	32
> Base			2028	1938	1928	1882
> CMA			7424	7510	7598	7708
> 
> 
> NPB:
> ====
> BT - 12% improvement
> FT - 15% improvement
> IS - 30% improvement
> SP - 34% improvement
> 
> IMB:
> ===
> 		
> Ping Pong - ~30% improvement
> Ping Ping - ~120% improvement
> SendRecv - ~100% improvement
> Exchange - ~150% improvement
> Gather(v) - ~20% improvement
> Scatter(v) - ~20% improvement
> AlltoAll(v) - 30-50% improvement
> 
> Patch is as below. Any comments?

Impressive numbers!

What did those OpenMPI facilities use before your patch - shared memory 
or sockets?

I have an observation about the interface:

> +asmlinkage long sys_copy_from_process(pid_t pid, unsigned long addr,
> +				      unsigned long len,
> +				      char __user *buf, int flags);
> +asmlinkage long sys_copy_to_process(pid_t pid, unsigned long addr,
> +				    unsigned long len,
> +				    char __user *buf, int flags);

A small detail: 'int flags' should probably be 'unsigned long flags' - 
it leaves more space.

Also, note that there is a further performance optimization possible 
here: if the other task's ->mm is the same as this task's (they share 
the MM), then the copy can be done straight in this process context, 
without GUP. User-space might not necessarily be aware of this so it 
might make sense to express this special case in the kernel too.
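A minimal sketch of that special case, reusing the names from the patch
and assuming an architecture that provides copy_in_user() for direct
user-to-user copies (it would sit after the permission check in
copy_to_from_process()):

	/* Same address space: no need to pin pages with GUP, the remote
	 * buffer is directly addressable from this context.  Sketch only;
	 * copy_in_user() exists on a subset of architectures. */
	if (task->mm == current->mm) {
		unsigned long left;

		if (copy_to)
			left = copy_in_user((void __user *)addr,
					    (void __user *)user_buf, len);
		else
			left = copy_in_user((void __user *)user_buf,
					    (void __user *)addr, len);
		rc = left ? -EFAULT : len;
		goto end;
	}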

More fundamentally, wouldn't it make sense to create an iovec interface 
here? If the Gather(v) / Scatter(v) / AlltoAll(v) workloads have any 
fragmentation on the user-space buffer side then the copy of multiple 
areas could be done in a single syscall. (the MM lock has to be touched 
only once, the target task has to be looked up only once, etc.)
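One possible shape for such an interface, purely illustrative (the names,
argument layout and _v suffix are hypothetical, not anything proposed in
the patch):

	/* Copy between 'liovcnt' local areas and 'riovcnt' areas in
	 * process 'pid' with a single syscall. */
	asmlinkage long sys_copy_from_process_v(pid_t pid,
				const struct iovec __user *local_iov,
				unsigned long liovcnt,
				const struct iovec __user *remote_iov,
				unsigned long riovcnt,
				unsigned long flags);
	asmlinkage long sys_copy_to_process_v(pid_t pid,
				const struct iovec __user *local_iov,
				unsigned long liovcnt,
				const struct iovec __user *remote_iov,
				unsigned long riovcnt,
				unsigned long flags);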

Plus, a small naming detail, shouldn't the naming be more IO-like:

  sys_process_vm_read()
  sys_process_vm_write()

Basically a regular read()/write() interface, but instead of fd's we'd 
have (PID,addr) identifiers for remote buffers, and instant execution 
(no buffering).

This makes these somewhat special syscalls a bit less special :-)

[ In theory we could also use this new ABI in a way to help the various 
  RDMA efforts as well - but it looks way too complex. RDMA is rather 
  difficult from an OS design POV - and this special case you have 
  implemented is much easier to do, as we are in a single trust domain. ]

Thanks,

	Ingo


* Re: [RFC][PATCH] Cross Memory Attach
  2010-09-15  8:02   ` Ingo Molnar
@ 2010-09-15  8:16     ` Ingo Molnar
  -1 siblings, 0 replies; 62+ messages in thread
From: Ingo Molnar @ 2010-09-15  8:16 UTC (permalink / raw)
  To: Christopher Yeoh
  Cc: linux-kernel, Andrew Morton, Linus Torvalds, Peter Zijlstra, linux-mm


> > NPB:
> > ====
> > BT - 12% improvement
> > FT - 15% improvement
> > IS - 30% improvement
> > SP - 34% improvement
> > 
> > IMB:
> > ===
> > 		
> > Ping Pong - ~30% improvement
> > Ping Ping - ~120% improvement
> > SendRecv - ~100% improvement
> > Exchange - ~150% improvement
> > Gather(v) - ~20% improvement
> > Scatter(v) - ~20% improvement
> > AlltoAll(v) - 30-50% improvement

btw., how does OpenMPI signal the target tasks that something happened 
to their address space - is there some pipe/socket side-channel, or 
perhaps purely based on flags in the modified memory areas, which are 
polled?

	Ingo


* Re: [RFC][PATCH] Cross Memory Attach
  2010-09-15  1:18 [RFC][PATCH] Cross Memory Attach Christopher Yeoh
@ 2010-09-15 10:58   ` Avi Kivity
  2010-09-15 10:58   ` Avi Kivity
                     ` (2 subsequent siblings)
  3 siblings, 0 replies; 62+ messages in thread
From: Avi Kivity @ 2010-09-15 10:58 UTC (permalink / raw)
  To: Christopher Yeoh; +Cc: linux-kernel, Linux Memory Management List, Ingo Molnar

  On 09/15/2010 03:18 AM, Christopher Yeoh wrote:
> The basic idea behind cross memory attach is to allow MPI programs doing
> intra-node communication to do a single copy of the message rather than
> a double copy of the message via shared memory.

If the host has a dma engine (many modern ones do) you can reduce this 
to zero copies (at least, zero processor copies).

> The following patch attempts to achieve this by allowing a
> destination process, given an address and size from a source process, to
> copy memory directly from the source process into its own address space
> via a system call. There is also a symmetrical ability to copy from
> the current process's address space into a destination process's
> address space.
>
>

Instead of those two syscalls, how about a vmfd(pid_t pid, ulong start, 
ulong len) system call which returns a file descriptor that represents 
a portion of the process address space.  You can then use preadv() and 
pwritev() to copy memory, and io_submit(IO_CMD_PREADV) and 
io_submit(IO_CMD_PWRITEV) for asynchronous variants (especially useful 
with a dma engine, since that adds latency).

With some care (and use of mmu_notifiers) you can even mmap() your vmfd 
and access remote process memory directly.

A nice property of file descriptors is that you can pass them around 
securely via SCM_RIGHTS.  So a process can create a window into its 
address space and pass it to other processes.

(or you could just use a shared memory object and pass it around)
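To make the idea concrete, usage of such a vmfd() might look like the
following (hypothetical: vmfd() does not exist, this only sketches the
proposed interface):

	int fd = vmfd(peer_pid, window_start, window_len);  /* proposed syscall */

	/* Synchronous copies via ordinary pread()/pwrite(), with the
	 * offset interpreted relative to the start of the window: */
	ssize_t got = pread(fd, local_buf, msg_len, msg_off);
	ssize_t put = pwrite(fd, local_buf, msg_len, msg_off);

	/* Asynchronous variants would go through io_submit(IO_CMD_PREADV /
	 * IO_CMD_PWRITEV), and the fd itself could be handed to another
	 * process over a unix socket with SCM_RIGHTS. */
	close(fd);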

-- 
I have a truly marvellous patch that fixes the bug which this
signature is too narrow to contain.



* Re: [RFC][PATCH] Cross Memory Attach
  2010-09-15  8:02   ` Ingo Molnar
@ 2010-09-15 13:20     ` Christopher Yeoh
  -1 siblings, 0 replies; 62+ messages in thread
From: Christopher Yeoh @ 2010-09-15 13:20 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: linux-kernel, Andrew Morton, Linus Torvalds, Peter Zijlstra, linux-mm

On Wed, 15 Sep 2010 10:02:35 +0200
Ingo Molnar <mingo@elte.hu> wrote:
> 
> What did those OpenMPI facilities use before your patch - shared
> memory or sockets?

This comparison is against OpenMPI using the shared memory btl.

> I have an observation about the interface:
> 
> A small detail: 'int flags' should probably be 'unsigned long flags'
> - it leaves more space.

ok.

> Also, note that there is a further performance optimization possible 
> here: if the other task's ->mm is the same as this task's (they share 
> the MM), then the copy can be done straight in this process context, 
> without GUP. User-space might not necessarily be aware of this so it 
> might make sense to express this special case in the kernel too.

ok.

> More fundamentally, wouldnt it make sense to create an iovec
> interface here? If the Gather(v) / Scatter(v) / AlltoAll(v) workloads
> have any fragmentation on the user-space buffer side then the copy of
> multiple areas could be done in a single syscall. (the MM lock has to
> be touched only once, target task only be looked up only once, etc.)

yes, I think so. Currently where I'm using the interface in OpenMPI I
can't take advantage of this, but it could be changed in the future - and
it's likely other MPIs could take advantage of it already.

> Plus, a small naming detail, shouldnt the naming be more IO like:
> 
>   sys_process_vm_read()
>   sys_process_vm_write()

Yes, that looks better to me. I really wasn't sure how to name them.

Regards,

Chris
-- 
cyeoh@au.ibm.com


* Re: [RFC][PATCH] Cross Memory Attach
  2010-09-15  8:16     ` Ingo Molnar
@ 2010-09-15 13:23       ` Christopher Yeoh
  -1 siblings, 0 replies; 62+ messages in thread
From: Christopher Yeoh @ 2010-09-15 13:23 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: linux-kernel, Andrew Morton, Linus Torvalds, Peter Zijlstra, linux-mm

On Wed, 15 Sep 2010 10:16:53 +0200
Ingo Molnar <mingo@elte.hu> wrote:
> 
> btw., how does OpenMPI signal the target tasks that something
> happened to their address space - is there some pipe/socket
> side-channel, or perhaps purely based on flags in the modified memory
> areas, which are polled?

The shared memory btl signals through shared memory, though when
threading is enabled (I think it's mostly used with threading support
disabled) in OpenMPI there is also signalling done through a pipe.

Regards,

Chris
-- 
cyeoh@au.ibm.com


* Re: [RFC][PATCH] Cross Memory Attach
  2010-09-15 10:58   ` Avi Kivity
@ 2010-09-15 13:51     ` Ingo Molnar
  -1 siblings, 0 replies; 62+ messages in thread
From: Ingo Molnar @ 2010-09-15 13:51 UTC (permalink / raw)
  To: Avi Kivity
  Cc: Christopher Yeoh, linux-kernel, Linux Memory Management List,
	Andrew Morton, Linus Torvalds, Peter Zijlstra


* Avi Kivity <avi@redhat.com> wrote:

>  On 09/15/2010 03:18 AM, Christopher Yeoh wrote:
>
> > The basic idea behind cross memory attach is to allow MPI programs 
> > doing intra-node communication to do a single copy of the message 
> > rather than a double copy of the message via shared memory.
> 
> If the host has a dma engine (many modern ones do) you can reduce this 
> to zero copies (at least, zero processor copies).
> 
> > The following patch attempts to achieve this by allowing a 
> > destination process, given an address and size from a source 
> > process, to copy memory directly from the source process into its 
> > own address space via a system call. There is also a symmetrical 
> > ability to copy from the current process's address space into a 
> > destination process's address space.
> 
> Instead of those two syscalls, how about a vmfd(pid_t pid, ulong 
> start, ulong len) system call which returns an file descriptor that 
> represents a portion of the process address space.  You can then use 
> preadv() and pwritev() to copy memory, and io_submit(IO_CMD_PREADV) 
> and io_submit(IO_CMD_PWRITEV) for asynchronous variants (especially 
> useful with a dma engine, since that adds latency).
> 
> With some care (and use of mmu_notifiers) you can even mmap() your 
> vmfd and access remote process memory directly.
> 
> A nice property of file descriptors is that you can pass them around 
> securely via SCM_RIGHTS.  So a process can create a window into its 
> address space and pass it to other processes.
> 
> (or you could just use a shared memory object and pass it around)

Interesting, but how will that work in a scalable way with lots of 
non-thread tasks?

Say we have 100 processes. We'd have to have 100 fd's - each has to be 
passed to a new worker process.

In that sense a PID is just as good of a reference as an fd - it can be 
looked up lockless, etc. - but has the added advantage that it can be 
passed along just by number.

Thanks,

	Ingo


* Re: [RFC][PATCH] Cross Memory Attach
  2010-09-15 10:58   ` Avi Kivity
@ 2010-09-15 14:42     ` Christopher Yeoh
  -1 siblings, 0 replies; 62+ messages in thread
From: Christopher Yeoh @ 2010-09-15 14:42 UTC (permalink / raw)
  To: Avi Kivity; +Cc: linux-kernel, Linux Memory Management List, Ingo Molnar

On Wed, 15 Sep 2010 12:58:15 +0200
Avi Kivity <avi@redhat.com> wrote:

>   On 09/15/2010 03:18 AM, Christopher Yeoh wrote:
> > The basic idea behind cross memory attach is to allow MPI programs
> > doing intra-node communication to do a single copy of the message
> > rather than a double copy of the message via shared memory.
> 
> If the host has a dma engine (many modern ones do) you can reduce
> this to zero copies (at least, zero processor copies).

Yes, this interface doesn't really support that. I've tried to keep
things really simple here, but I see potential for increasing
level/complexity of support with diminishing returns:

1. single copy (basically what the current implementation does)
2. support for async dma offload (rather arch specific)
3. ability to map part of another process's address space directly into
   the current one. Would have setup/tear down overhead, but this would
   be useful specifically for reduction operations where we don't even
   need to really copy the data once at all, but use it directly in
   arithmetic/logical operations on the receiver.

For reference, there is also knem (http://runtime.bordeaux.inria.fr/knem/),
which does implement (2) for I/OAT, though it looks to me like the
interface and implementation are, relatively speaking, quite a bit more
complex.

> Instead of those two syscalls, how about a vmfd(pid_t pid, ulong
> start, ulong len) system call which returns an file descriptor that
> represents a portion of the process address space.  You can then use
> preadv() and pwritev() to copy memory, and io_submit(IO_CMD_PREADV)
> and io_submit(IO_CMD_PWRITEV) for asynchronous variants (especially
> useful with a dma engine, since that adds latency).
> 
> With some care (and use of mmu_notifiers) you can even mmap() your
> vmfd and access remote process memory directly.

That interface sounds interesting (I'm not sure I understand how
this would be implemented), though this would mean that a file
descriptor would need to be created for every message that
each process sent, wouldn't it?

Regards,

Chris
-- 
cyeoh@au.ibm.com


* Re: [RFC][PATCH] Cross Memory Attach
  2010-09-15 10:58   ` Avi Kivity
@ 2010-09-15 14:46     ` Bryan Donlan
  -1 siblings, 0 replies; 62+ messages in thread
From: Bryan Donlan @ 2010-09-15 14:46 UTC (permalink / raw)
  To: Avi Kivity
  Cc: Christopher Yeoh, linux-kernel, Linux Memory Management List,
	Ingo Molnar

On Wed, Sep 15, 2010 at 19:58, Avi Kivity <avi@redhat.com> wrote:

> Instead of those two syscalls, how about a vmfd(pid_t pid, ulong start,
> ulong len) system call which returns an file descriptor that represents a
> portion of the process address space.  You can then use preadv() and
> pwritev() to copy memory, and io_submit(IO_CMD_PREADV) and
> io_submit(IO_CMD_PWRITEV) for asynchronous variants (especially useful with
> a dma engine, since that adds latency).
>
> With some care (and use of mmu_notifiers) you can even mmap() your vmfd and
> access remote process memory directly.

Rather than introducing a new vmfd() API for this, why not just add
implementations for these more efficient operations to the existing
/proc/$pid/mem interface?
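For comparison, a sketch of what going through the existing interface
involves at present: the reader has to ptrace-attach to (and thereby stop)
the target before /proc/$pid/mem can be read, which is part of what makes
it awkward for MPI-style copies. Assumes a 64-bit off_t for the address
offset.

#include <fcntl.h>
#include <stdio.h>
#include <sys/ptrace.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <unistd.h>

static long read_remote(pid_t pid, unsigned long addr, void *buf, size_t len)
{
	char path[64];
	int fd;
	ssize_t n;

	if (ptrace(PTRACE_ATTACH, pid, NULL, NULL) < 0)
		return -1;
	waitpid(pid, NULL, 0);		/* wait for the target to stop */

	snprintf(path, sizeof(path), "/proc/%d/mem", pid);
	fd = open(path, O_RDONLY);
	/* The file offset is the target's virtual address. */
	n = (fd < 0) ? -1 : pread(fd, buf, len, (off_t)addr);

	if (fd >= 0)
		close(fd);
	ptrace(PTRACE_DETACH, pid, NULL, NULL);	/* resume the target */
	return n;
}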


* Re: [RFC][PATCH] Cross Memory Attach
  2010-09-15 14:42     ` Christopher Yeoh
@ 2010-09-15 14:52       ` Linus Torvalds
  -1 siblings, 0 replies; 62+ messages in thread
From: Linus Torvalds @ 2010-09-15 14:52 UTC (permalink / raw)
  To: Christopher Yeoh
  Cc: Avi Kivity, linux-kernel, Linux Memory Management List, Ingo Molnar

On Wed, Sep 15, 2010 at 7:42 AM, Christopher Yeoh <cyeoh@au1.ibm.com> wrote:
> On Wed, 15 Sep 2010 12:58:15 +0200
> Avi Kivity <avi@redhat.com> wrote:
>
>>   On 09/15/2010 03:18 AM, Christopher Yeoh wrote:
>> > The basic idea behind cross memory attach is to allow MPI programs
>> > doing intra-node communication to do a single copy of the message
>> > rather than a double copy of the message via shared memory.
>>
>> If the host has a dma engine (many modern ones do) you can reduce
>> this to zero copies (at least, zero processor copies).
>
> Yes, this interface doesn't really support that. I've tried to keep
> things really simple here, but I see potential for increasing
> level/complexity of support with diminishing returns:

I think keeping things simple is a good goal. The vmfd() approach
might be worth looking into, but your patch certainly is pretty simple
as-is.

That said, it's also buggy. You can't just get a task and then do

  down_read(task->mm->mmap_sem)

on it. Not even if you have a refcount. The mm may well go away. You
need to do the same thing "get_task_mm()" does, ie look up the mm
under task_lock, and get a reference to it. You already get the
task-lock for permission testing, so it looks like doing it there
would likely work out.
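Concretely, that would mean something along these lines in
copy_to_from_process(), instead of dereferencing task->mm directly (a
rough sketch; the error code for a missing mm is illustrative):

	struct mm_struct *mm;

	/* get_task_mm() takes task_lock() and a reference on the mm,
	 * so it cannot go away while we use it. */
	mm = get_task_mm(task);
	if (!mm) {
		rc = -EINVAL;	/* kernel thread or exiting task */
		goto end;
	}

	down_read(&mm->mmap_sem);
	/* ... get_user_pages(task, mm, ...) and the copy loop ... */
	up_read(&mm->mmap_sem);

	mmput(mm);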

> 3. ability to map part of another process's address space directly into
>   the current one. Would have setup/tear down overhead, but this would
>   be useful specifically for reduction operations where we don't even
>   need to really copy the data once at all, but use it directly in
>   arithmetic/logical operations on the receiver.

Don't even think about this. If you want to map another task's memory,
use shared memory. The shared memory code knows about that. The races
for anything else are crazy.

                   Linus


* Re: [RFC][PATCH] Cross Memory Attach
  2010-09-15  1:18 [RFC][PATCH] Cross Memory Attach Christopher Yeoh
  2010-09-15  8:02   ` Ingo Molnar
  2010-09-15 10:58   ` Avi Kivity
@ 2010-09-15 15:11 ` Linus Torvalds
  2010-09-15 15:14   ` Linus Torvalds
  2010-09-16 16:27   ` Peter Zijlstra
  2010-09-15 16:07 ` Valdis.Kletnieks
  3 siblings, 2 replies; 62+ messages in thread
From: Linus Torvalds @ 2010-09-15 15:11 UTC (permalink / raw)
  To: Christopher Yeoh; +Cc: linux-kernel

On Tue, Sep 14, 2010 at 6:18 PM, Christopher Yeoh <cyeoh@au1.ibm.com> wrote:
> +
> +               if (copy_to) {
> +                       ret = copy_from_user(target_kaddr,
> +                                            user_buf + *bytes_copied,
> +                                            bytes_to_copy);
> +                       if (ret) {
> +                               kunmap(process_pages[i]);
> +                               goto end;
> +                       }
> +               } else {
> +                       ret = copy_to_user(user_buf + *bytes_copied,
> +                                          target_kaddr, bytes_to_copy);
> +                       if (ret) {
> +                               kunmap(process_pages[i]);
> +                               goto end;
> +                       }
> +               }
> +               kunmap(process_pages[i]);

Btw, please just do this as

   if (copy_to)
      ret = copy_from_user(..);
   else
      ret = copy_to_user(..);
   kunmap(process_pages[i]);
   if (ret)
      goto out;

(and also, I think you should probably use a real variable for
*bytes_copied, rather than update things behind the pointer. Update
the pointer just once at the end instead, or preferably just change
the calling convention to just return the number of bytes copied, and
let the caller do the page counting).

                      Linus


* Re: [RFC][PATCH] Cross Memory Attach
  2010-09-15 15:11 ` Linus Torvalds
@ 2010-09-15 15:14   ` Linus Torvalds
  2010-09-16  2:25     ` Christopher Yeoh
  2010-09-16 16:27   ` Peter Zijlstra
  1 sibling, 1 reply; 62+ messages in thread
From: Linus Torvalds @ 2010-09-15 15:14 UTC (permalink / raw)
  To: Christopher Yeoh; +Cc: linux-kernel

On Wed, Sep 15, 2010 at 8:11 AM, Linus Torvalds
<torvalds@linux-foundation.org> wrote:
>
> Btw, please just do this as
>
>   if (copy_to)
>      ret = copy_from_user(..);
>   else
>      ret = copy_to_user(..);
>   kunmap(process_pages[i]);
>   if (ret)
>      goto out;

In fact, you might consider passing in a "copy_out" function pointer
rather than that "copy_to" boolean, and rather than that conditional,
just do a

  ret = copy_out(..);

thing. On sane/good architectures, branch target prediction will make
it all work out to the same work in the end, and it certainly looks
simpler and cleaner.
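Putting the two suggestions together, the per-page loop might end up
looking roughly like this (untested sketch; 'copy_fn' is the passed-in
wrapper and 'copied' is a local byte count returned to the caller):

	/* 'copy_fn' is chosen once by the caller instead of a 'copy_to'
	 * flag; it is a thin wrapper giving copy_to_user()/copy_from_user()
	 * a common (kaddr, ubuf, len) signature. */
	for (i = 0; i < nr_pages_to_copy; i++) {
		void *target_kaddr = kmap(process_pages[i]) + start_offset;
		unsigned long n = min(PAGE_SIZE - start_offset, len - copied);
		unsigned long left;

		left = copy_fn(target_kaddr, user_buf + copied, n);
		kunmap(process_pages[i]);	/* one kunmap on every path */
		if (left)
			goto end;		/* partial copy: report -EFAULT */

		copied += n;
		start_offset = 0;
	}
	/* return 'copied' and let the caller do the page accounting */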

                                  Linus


* Re: [RFC][PATCH] Cross Memory Attach
  2010-09-15 14:52       ` Linus Torvalds
@ 2010-09-15 15:44         ` Robin Holt
  -1 siblings, 0 replies; 62+ messages in thread
From: Robin Holt @ 2010-09-15 15:44 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Christopher Yeoh, Avi Kivity, linux-kernel,
	Linux Memory Management List, Ingo Molnar

> > 3. ability to map part of another process's address space directly into
> >   the current one. Would have setup/tear down overhead, but this would
> >   be useful specifically for reduction operations where we don't even
> >   need to really copy the data once at all, but use it directly in
> >   arithmetic/logical operations on the receiver.
> 
> Don't even think about this. If you want to map another tasks memory,
> use shared memory. The shared memory code knows about that. The races
> for anything else are crazy.

SGI has a similar, but significantly more difficult, problem to solve and
has written a fairly complex driver to handle exactly the scenario IBM
is proposing.  In our case, not only are we trying to directly access one
process's memory, we are doing it from a completely different operating
system instance operating on the same NUMA fabric.

In our case (I have not looked at IBMs patch), we are actually using
get_user_pages() to get extra references on struct pages.  We are
judicious about reference counting the mm and we use get_task_mm in all
places with the exception of process teardown (ignorable detail for now).
We have a fault handler inserting PFNs as appropriate.  You can guess
at the complexity.  Even with all its complexity, we still need to
caveat certain functionality as not being supported.

If we were to try and get that driver included in the kernel, how would
you suggest we expand the shared memory code to include support for the
coordination needed between those separate operating system instances?
I am genuinely interested and not trying to be argumentative.  This has
been on my "Get done before Aug-1" list for months and I have not had
any time to pursue it.

Thanks,
Robin


* Re: [RFC][PATCH] Cross Memory Attach
  2010-09-15  1:18 [RFC][PATCH] Cross Memory Attach Christopher Yeoh
                   ` (2 preceding siblings ...)
  2010-09-15 15:11 ` Linus Torvalds
@ 2010-09-15 16:07 ` Valdis.Kletnieks
  2010-09-16  2:17   ` Christopher Yeoh
  3 siblings, 1 reply; 62+ messages in thread
From: Valdis.Kletnieks @ 2010-09-15 16:07 UTC (permalink / raw)
  To: Christopher Yeoh; +Cc: linux-kernel


On Wed, 15 Sep 2010 10:48:55 +0930, Christopher Yeoh said:

> The basic idea behind cross memory attach is to allow MPI programs doing
> intra-node communication to do a single copy of the message rather than
> a double copy of the message via shared memory.

Interesting, and nice benchmark results.  I have a question though:

> +	/* Get the pages we're interested in */
> +	pages_pinned = get_user_pages(task, task->mm, pa,
> +				      nr_pages_to_copy,
> +				      copy_to, 0, process_pages, NULL);
> +
> +	if (pages_pinned != nr_pages_to_copy)
> +		goto end;

...

> +end:
> +	for (i = 0; i < pages_pinned; i++) {
> +		if (copy_to)
> +			set_page_dirty_lock(process_pages[i]);
> +		put_page(process_pages[i]);
> +	}

It looks to me like if get_user_pages() fails to pin *all* the pages, we treat
the target pages as dirty even though we never actually touched them?

Maybe it should be 'if (copy_to && *bytes_copied)'?

^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [RFC][PATCH] Cross Memory Attach
  2010-09-15 13:51     ` Ingo Molnar
@ 2010-09-15 16:10       ` Avi Kivity
  -1 siblings, 0 replies; 62+ messages in thread
From: Avi Kivity @ 2010-09-15 16:10 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Christopher Yeoh, linux-kernel, Linux Memory Management List,
	Andrew Morton, Linus Torvalds, Peter Zijlstra

  On 09/15/2010 03:51 PM, Ingo Molnar wrote:
> * Avi Kivity<avi@redhat.com>  wrote:
>
>>   On 09/15/2010 03:18 AM, Christopher Yeoh wrote:
>>
>>> The basic idea behind cross memory attach is to allow MPI programs
>>> doing intra-node communication to do a single copy of the message
>>> rather than a double copy of the message via shared memory.
>> If the host has a dma engine (many modern ones do) you can reduce this
>> to zero copies (at least, zero processor copies).
>>
>>> The following patch attempts to achieve this by allowing a
>>> destination process, given an address and size from a source
>>> process, to copy memory directly from the source process into its
>>> own address space via a system call. There is also a symmetrical
>>> ability to copy from the current process's address space into a
>>> destination process's address space.
>> Instead of those two syscalls, how about a vmfd(pid_t pid, ulong
>> start, ulong len) system call which returns an file descriptor that
>> represents a portion of the process address space.  You can then use
>> preadv() and pwritev() to copy memory, and io_submit(IO_CMD_PREADV)
>> and io_submit(IO_CMD_PWRITEV) for asynchronous variants (especially
>> useful with a dma engine, since that adds latency).
>>
>> With some care (and use of mmu_notifiers) you can even mmap() your
>> vmfd and access remote process memory directly.
>>
>> A nice property of file descriptors is that you can pass them around
>> securely via SCM_RIGHTS.  So a process can create a window into its
>> address space and pass it to other processes.
>>
>> (or you could just use a shared memory object and pass it around)
> Interesting, but how will that work in a scalable way with lots of
> non-thread tasks?
>
> Say we have 100 processes. We'd have to have 100 fd's - each has to be
> passed to a new worker process.
>
> In that sense a PID is just as good of a reference as an fd - it can be
> looked up lockless, etc. - but has the added advantage that it can be
> passed along just by number.
>
>

It also has better life-cycle control (with just a pid, you never know 
what it refers to unless you're its parent).  Would have been better if 
clone() returned an fd from which you could derive the pid if you wanted 
to present it to the user.
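
To make the proposed interface concrete: vmfd() does not exist anywhere,
so the prototype below is purely hypothetical, but the receive side in
user space would look roughly like this:

#include <sys/types.h>
#include <unistd.h>

/* hypothetical syscall from the proposal above -- not in any kernel */
int vmfd(pid_t pid, unsigned long start, unsigned long len);

static ssize_t recv_from(pid_t sender, unsigned long remote_addr,
			 void *local_buf, size_t len)
{
	int fd = vmfd(sender, remote_addr, len);
	ssize_t n;

	if (fd < 0)
		return -1;
	n = pread(fd, local_buf, len, 0);	/* single copy: remote -> local */
	close(fd);
	return n;
}

The fd could equally be kept open across messages, or handed to other
processes with SCM_RIGHTS as described above.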

-- 
I have a truly marvellous patch that fixes the bug which this
signature is too narrow to contain.


^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [RFC][PATCH] Cross Memory Attach
  2010-09-15 14:46     ` Bryan Donlan
@ 2010-09-15 16:13       ` Avi Kivity
  -1 siblings, 0 replies; 62+ messages in thread
From: Avi Kivity @ 2010-09-15 16:13 UTC (permalink / raw)
  To: Bryan Donlan
  Cc: Christopher Yeoh, linux-kernel, Linux Memory Management List,
	Ingo Molnar

  On 09/15/2010 04:46 PM, Bryan Donlan wrote:
> On Wed, Sep 15, 2010 at 19:58, Avi Kivity<avi@redhat.com>  wrote:
>
>> Instead of those two syscalls, how about a vmfd(pid_t pid, ulong start,
>> ulong len) system call which returns an file descriptor that represents a
>> portion of the process address space.  You can then use preadv() and
>> pwritev() to copy memory, and io_submit(IO_CMD_PREADV) and
>> io_submit(IO_CMD_PWRITEV) for asynchronous variants (especially useful with
>> a dma engine, since that adds latency).
>>
>> With some care (and use of mmu_notifiers) you can even mmap() your vmfd and
>> access remote process memory directly.
> Rather than introducing a new vmfd() API for this, why not just add
> implementations for these more efficient operations to the existing
> /proc/$pid/mem interface?

Yes, opening that file should be equivalent (and you could certainly 
implement aio via dma for it).
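
Concretely, the read side with the existing file is just a pread() at the
remote virtual address - modulo the ptrace permission requirement that
comes up later in this thread (sketch only, error handling trimmed):

#include <fcntl.h>
#include <stdio.h>
#include <sys/types.h>
#include <unistd.h>

/* read 'len' bytes at virtual address 'addr' in process 'pid' via
 * /proc/$pid/mem; with today's kernel the caller must be allowed to
 * (and in practice actually be) ptracing the target */
static ssize_t read_remote(pid_t pid, unsigned long addr,
			   void *buf, size_t len)
{
	char path[64];
	int fd;
	ssize_t n;

	snprintf(path, sizeof(path), "/proc/%d/mem", (int)pid);
	fd = open(path, O_RDONLY);
	if (fd < 0)
		return -1;
	n = pread(fd, buf, len, (off_t)addr);
	close(fd);
	return n;
}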

-- 
I have a truly marvellous patch that fixes the bug which this
signature is too narrow to contain.


^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [RFC][PATCH] Cross Memory Attach
  2010-09-15 16:13       ` Avi Kivity
@ 2010-09-15 19:35         ` Eric W. Biederman
  -1 siblings, 0 replies; 62+ messages in thread
From: Eric W. Biederman @ 2010-09-15 19:35 UTC (permalink / raw)
  To: Avi Kivity
  Cc: Bryan Donlan, Christopher Yeoh, linux-kernel,
	Linux Memory Management List, Ingo Molnar, Linus Torvalds,
	Valdis.Kletnieks, Alan Cox, Robin Holt

Avi Kivity <avi@redhat.com> writes:

>  On 09/15/2010 04:46 PM, Bryan Donlan wrote:
>> On Wed, Sep 15, 2010 at 19:58, Avi Kivity<avi@redhat.com>  wrote:
>>
>>> Instead of those two syscalls, how about a vmfd(pid_t pid, ulong start,
>>> ulong len) system call which returns an file descriptor that represents a
>>> portion of the process address space.  You can then use preadv() and
>>> pwritev() to copy memory, and io_submit(IO_CMD_PREADV) and
>>> io_submit(IO_CMD_PWRITEV) for asynchronous variants (especially useful with
>>> a dma engine, since that adds latency).
>>>
>>> With some care (and use of mmu_notifiers) you can even mmap() your vmfd and
>>> access remote process memory directly.
>> Rather than introducing a new vmfd() API for this, why not just add
>> implementations for these more efficient operations to the existing
>> /proc/$pid/mem interface?
>
> Yes, opening that file should be equivalent (and you could certainly implement
> aio via dma for it).

I will second this: /proc/$pid/mem is semantically the same, and it would
really be good if this patch became a patch optimizing that case.

Otherwise we have code duplication, and thus dilution of knowledge in
two different places for no discernible reason, hindering long-term
maintenance.

+int copy_to_from_process_allowed(struct task_struct *task)
+{
+	/* Allow copy_to_from_process to access another process using
+	   the same critera  as a process would be allowed to ptrace
+	   that same process */
+	const struct cred *cred = current_cred(), *tcred;
+
+	rcu_read_lock();
+	tcred = __task_cred(task);
+	if ((cred->uid != tcred->euid ||
+	     cred->uid != tcred->suid ||
+	     cred->uid != tcred->uid  ||
+	     cred->gid != tcred->egid ||
+	     cred->gid != tcred->sgid ||
+	     cred->gid != tcred->gid) &&
+	    !capable(CAP_SYS_PTRACE)) {
+		rcu_read_unlock();
+		return 0;
+	}
+	rcu_read_unlock();
+	return 1;
+}

This hunk of the patch is a copy of __ptrace_may_access with the security
hooks removed.  The code duplication, the removal of the dumpable check,
and the removal of the security hooks all look like a bad idea.

Removing the other checks in check_mem_permission seems reasonable, as
those appear to be overly paranoid.

Hmm.  This is weird:

+	/* Get the pages we're interested in */
+	pages_pinned = get_user_pages(task, task->mm, pa,
+				      nr_pages_to_copy,
+				      copy_to, 0, process_pages, NULL);
+
+	if (pages_pinned != nr_pages_to_copy)
+		goto end;
+
+	/* Do the copy for each page */
+	for (i = 0; i < nr_pages_to_copy; i++) {
+		target_kaddr = kmap(process_pages[i]) + start_offset;
+		bytes_to_copy = min(PAGE_SIZE - start_offset,
+				    len - *bytes_copied);
+		if (start_offset)
+			start_offset = 0;
+
+		if (copy_to) {
+			ret = copy_from_user(target_kaddr,
+					     user_buf + *bytes_copied,
+					     bytes_to_copy);
+			if (ret) {
+				kunmap(process_pages[i]);
+				goto end;
+			}
+		} else {
+			ret = copy_to_user(user_buf + *bytes_copied,
+					   target_kaddr, bytes_to_copy);
+			if (ret) {
+				kunmap(process_pages[i]);
+				goto end;
+			}
+		}
+		kunmap(process_pages[i]);
+		*bytes_copied += bytes_to_copy;
+	}
+

That hunk of code appears to be a copy of mm/memory.c:access_process_vm.
It is a little more optimized, taking the get_user_pages out of the inner
loop, but otherwise it is pretty much the same code.

So I would argue it makes sense to optimize access_process_vm.

So unless there are fundamental performance bottlenecks that I am not
seeing, please optimize the existing code paths in the kernel that do
exactly what you are trying to do.
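
For illustration only - this is not the posted patch, permission checks
are omitted, and the bounce buffer below obviously reintroduces the
second copy that an optimized access_process_vm() would then need to
avoid - the syscall layered on the existing helper would look roughly
like:

SYSCALL_DEFINE4(copy_from_process, pid_t, pid, unsigned long, addr,
		char __user *, buf, size_t, len)
{
	struct task_struct *task;
	char *bounce;
	size_t done = 0;

	rcu_read_lock();
	task = find_task_by_vpid(pid);
	if (task)
		get_task_struct(task);
	rcu_read_unlock();
	if (!task)
		return -ESRCH;

	bounce = kmalloc(PAGE_SIZE, GFP_KERNEL);
	if (!bounce) {
		put_task_struct(task);
		return -ENOMEM;
	}

	while (done < len) {
		size_t chunk = min_t(size_t, PAGE_SIZE, len - done);

		/* access_process_vm() already does the get_user_pages()
		 * + kmap() + copy dance that the patch duplicates */
		if (access_process_vm(task, addr + done, bounce, chunk, 0)
		    != chunk)
			break;
		if (copy_to_user(buf + done, bounce, chunk))
			break;
		done += chunk;
	}

	kfree(bounce);
	put_task_struct(task);
	return done;
}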

Thanks,
Eric





^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [RFC][PATCH] Cross Memory Attach
  2010-09-15 14:46     ` Bryan Donlan
@ 2010-09-16  1:18       ` Christopher Yeoh
  -1 siblings, 0 replies; 62+ messages in thread
From: Christopher Yeoh @ 2010-09-16  1:18 UTC (permalink / raw)
  To: Bryan Donlan
  Cc: Avi Kivity, linux-kernel, Linux Memory Management List, Ingo Molnar

On Wed, 15 Sep 2010 23:46:09 +0900
Bryan Donlan <bdonlan@gmail.com> wrote:

> On Wed, Sep 15, 2010 at 19:58, Avi Kivity <avi@redhat.com> wrote:
> 
> > Instead of those two syscalls, how about a vmfd(pid_t pid, ulong
> > start, ulong len) system call which returns an file descriptor that
> > represents a portion of the process address space.  You can then
> > use preadv() and pwritev() to copy memory, and
> > io_submit(IO_CMD_PREADV) and io_submit(IO_CMD_PWRITEV) for
> > asynchronous variants (especially useful with a dma engine, since
> > that adds latency).
> >
> > With some care (and use of mmu_notifiers) you can even mmap() your
> > vmfd and access remote process memory directly.
> 
> Rather than introducing a new vmfd() API for this, why not just add
> implementations for these more efficient operations to the existing
> /proc/$pid/mem interface?

Perhaps I'm misunderstanding something here, but
accessing /proc/$pid/mem requires ptracing the target process.
We can't really have all these MPI processes ptracing each other
just to send/receive a message....

Regards,

Chris
-- 
cyeoh@au.ibm.com

^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [RFC][PATCH] Cross Memory Attach
  2010-09-15 14:46     ` Bryan Donlan
@ 2010-09-16  1:58       ` KOSAKI Motohiro
  -1 siblings, 0 replies; 62+ messages in thread
From: KOSAKI Motohiro @ 2010-09-16  1:58 UTC (permalink / raw)
  To: Bryan Donlan
  Cc: kosaki.motohiro, Avi Kivity, Christopher Yeoh, linux-kernel,
	Linux Memory Management List, Ingo Molnar

> On Wed, Sep 15, 2010 at 19:58, Avi Kivity <avi@redhat.com> wrote:
> 
> > Instead of those two syscalls, how about a vmfd(pid_t pid, ulong start,
> > ulong len) system call which returns an file descriptor that represents a
> > portion of the process address space.  You can then use preadv() and
> > pwritev() to copy memory, and io_submit(IO_CMD_PREADV) and
> > io_submit(IO_CMD_PWRITEV) for asynchronous variants (especially useful with
> > a dma engine, since that adds latency).
> >
> > With some care (and use of mmu_notifiers) you can even mmap() your vmfd and
> > access remote process memory directly.
> 
> Rather than introducing a new vmfd() API for this, why not just add
> implementations for these more efficient operations to the existing
> /proc/$pid/mem interface?

As far as I heard from a friend, an old HP MPI implementation used /proc/$pid/mem
for this purpose (I don't know its current status). However, most implementations
don't do that, because /proc/$pid/mem requires that the process be ptraced.
As far as I understand, very old versions of /proc/$pid/mem didn't require it, but
that changed because of security concerns. Since then nobody has wanted to change
this interface for fear of breaking security.

But I don't know exactly what the "the process is ptraced" check protects. If
anyone can explain the reason and we can remove it, I'm not against that at all.




^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [RFC][PATCH] Cross Memory Attach
  2010-09-15 16:07 ` Valdis.Kletnieks
@ 2010-09-16  2:17   ` Christopher Yeoh
  0 siblings, 0 replies; 62+ messages in thread
From: Christopher Yeoh @ 2010-09-16  2:17 UTC (permalink / raw)
  To: Valdis.Kletnieks; +Cc: linux-kernel

On Wed, 15 Sep 2010 12:07:11 -0400
Valdis.Kletnieks@vt.edu wrote:
> 
> Interesting, and nice benchmark results.  I have a question though:
> 
> > +	/* Get the pages we're interested in */
> > +	pages_pinned = get_user_pages(task, task->mm, pa,
> > +				      nr_pages_to_copy,
> > +				      copy_to, 0, process_pages,
> > NULL); +
> > +	if (pages_pinned != nr_pages_to_copy)
> > +		goto end;
> 
> ...
> 
> > +end:
> > +	for (i = 0; i < pages_pinned; i++) {
> > +		if (copy_to)
> > +			set_page_dirty_lock(process_pages[i]);
> > +		put_page(process_pages[i]);
> > +	}
> 
> It looks to me like if get_user_pages() fails to pin *all* the pages,
> we treat the target pages as dirty even though we never actually
> touched them?
> 
> Maybe it should be 'if (copy_to && *bytes_copied)'?

Yes, that can happen, though the *bytes_copied check doesn't completely
fix it, as copy_from_user could fail, resulting in some pages being
touched but not all of them.  I'll add some code to only call
set_page_dirty_lock on pages that really have been touched...
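
(Sketch of what I mean, not the eventual patch: keep a pages_touched
count that the copy loop increments after each successful
copy_from_user(), and then only dirty those:)

end:
	for (i = 0; i < pages_pinned; i++) {
		/* only mark pages the copy loop actually wrote into */
		if (copy_to && i < pages_touched)
			set_page_dirty_lock(process_pages[i]);
		put_page(process_pages[i]);
	}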

Regards,

Chris
-- 
cyeoh@au.ibm.com

^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [RFC][PATCH] Cross Memory Attach
  2010-09-15 15:14   ` Linus Torvalds
@ 2010-09-16  2:25     ` Christopher Yeoh
  0 siblings, 0 replies; 62+ messages in thread
From: Christopher Yeoh @ 2010-09-16  2:25 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: linux-kernel

On Wed, 15 Sep 2010 08:14:49 -0700
Linus Torvalds <torvalds@linux-foundation.org> wrote:
> 
> In fact, you might consider passing in a "copy_out" function pointer
> rather than that "copy_to" boolean, and rather than that conditional,
> just do a
> 
>   ret = copy_out(..);
> 
> thing. On sane/good architectures, branch target prediction will make
> it all work out to the same work in the end, and it certainly looks
> simpler and cleaner.

Thanks for all the feedback - I'll rework the patch...

Regards,

Chris
-- 
cyeoh@au.ibm.com

^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [RFC][PATCH] Cross Memory Attach
  2010-09-15 14:42     ` Christopher Yeoh
@ 2010-09-16  6:32       ` Brice Goglin
  -1 siblings, 0 replies; 62+ messages in thread
From: Brice Goglin @ 2010-09-16  6:32 UTC (permalink / raw)
  To: Christopher Yeoh
  Cc: Avi Kivity, linux-kernel, Linux Memory Management List, Ingo Molnar

On 15/09/2010 16:42, Christopher Yeoh wrote:
> On Wed, 15 Sep 2010 12:58:15 +0200
> Avi Kivity <avi@redhat.com> wrote:
>
>   
>>   On 09/15/2010 03:18 AM, Christopher Yeoh wrote:
>>     
>>> The basic idea behind cross memory attach is to allow MPI programs
>>> doing intra-node communication to do a single copy of the message
>>> rather than a double copy of the message via shared memory.
>>>       
>> If the host has a dma engine (many modern ones do) you can reduce
>> this to zero copies (at least, zero processor copies).
>>     
> Yes, this interface doesn't really support that. I've tried to keep
> things really simple here, but I see potential for increasing
> level/complexity of support with diminishing returns:
>
> 1. single copy (basically what the current implementation does)
> 2. support for async dma offload (rather arch specific)
> 3. ability to map part of another process's address space directly into
>    the current one. Would have setup/tear down overhead, but this would
>    be useful specifically for reduction operations where we don't even
>    need to really copy the data once at all, but use it directly in
>    arithmetic/logical operations on the receiver.
>
> For reference, there is also knem http://runtime.bordeaux.inria.fr/knem/
> which does implement (2) for I/OAT, though it looks to me the interface
> and implementation are relatively speaking quite a bit more complex.
>   

I am the guy doing KNEM, so I can comment on this. The I/OAT part of KNEM
was mostly a research topic; it's mostly useless on current machines
since memcpy performance is much higher than that of the I/OAT DMA engine.
We also have an offload model with a kernel thread, but it hasn't been
used much so far. These features can be ignored for the current discussion.

We've been working on this for a while with MPICH and OpenMPI developers
(both already use KNEM), and here's what I think is missing in
Christopher's proposal:
* Vectorial buffer support: MPI likes things like datatypes, which make
buffers non-contiguous. You could add vectorial buffer support to your
interface, but the users would have to store the data representation of
each process in all processes. Not a good idea; it's easier to keep the
knowledge of the non-contiguousness of the remote buffer only in the
remote process.
* Collectives: You don't want to pin/unpin the same region over and
over; it's overkill when multiple processes are reading from the same
exact buffer (broadcast) or from contiguous parts of the same buffer
(scatter).

So what we do in KNEM is:
* declare a memory region (a set of non-contiguous segments + protection),
i.e. get_user_pages, and return an associated cookie id
* have syscalls to read/write from a region given a cookie, an offset in
the region and a length
This one-sided interface looks like an InfiniBand model, but only for
intra-node data transfers.

So OpenMPI and MPICH declare regions, pass their cookies through their
shared-memory buffer, and the remote process reads from there. Then,
they notify the first process that it may destroy the region (can be
automatic if the region creator passed a specific flag saying destroy
after first use).
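
Schematically, from the MPI library's point of view (pseudo-code only;
the names below are made up for illustration, the real interface is the
ioctl-based one in KNEM):

	/* sender: pin its (possibly non-contiguous) buffer once and
	 * publish a handle for it */
	cookie = declare_region(send_iov, iovcnt, PROT_READ,
				DESTROY_AFTER_FIRST_USE);
	post_to_shared_mem_ring(peer, cookie, tag);

	/* receiver: one-sided copy straight out of the sender's region */
	cookie = wait_on_shared_mem_ring(tag);
	copy_from_region(cookie, 0 /* offset */, recv_iov, iovcnt);
	/* with DESTROY_AFTER_FIRST_USE the region is torn down here;
	 * otherwise the receiver notifies the sender to destroy it */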

Brice


^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [RFC][PATCH] Cross Memory Attach
  2010-09-16  1:58       ` KOSAKI Motohiro
@ 2010-09-16  8:08         ` Ingo Molnar
  -1 siblings, 0 replies; 62+ messages in thread
From: Ingo Molnar @ 2010-09-16  8:08 UTC (permalink / raw)
  To: KOSAKI Motohiro, Alexander Viro, Chris Wright, Andrew Morton
  Cc: Bryan Donlan, Avi Kivity, Christopher Yeoh, linux-kernel,
	Linux Memory Management List, Linus Torvalds


* KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com> wrote:

> > On Wed, Sep 15, 2010 at 19:58, Avi Kivity <avi@redhat.com> wrote:
> > 
> > > Instead of those two syscalls, how about a vmfd(pid_t pid, ulong start,
> > > ulong len) system call which returns an file descriptor that represents a
> > > portion of the process address space.  You can then use preadv() and
> > > pwritev() to copy memory, and io_submit(IO_CMD_PREADV) and
> > > io_submit(IO_CMD_PWRITEV) for asynchronous variants (especially useful with
> > > a dma engine, since that adds latency).
> > >
> > > With some care (and use of mmu_notifiers) you can even mmap() your vmfd and
> > > access remote process memory directly.
> > 
> > Rather than introducing a new vmfd() API for this, why not just add
> > implementations for these more efficient operations to the existing
> > /proc/$pid/mem interface?
> 
> As far as I heared from my friend, old HP MPI implementation used 
> /proc/$pid/mem for this purpose. (I don't know current status). 
> However almost implementation doesn't do that because /proc/$pid/mem 
> required the process is ptraced. As far as I understand , very old 
> /proc/$pid/mem doesn't require it. but It changed for security 
> concern. Then, Anybody haven't want to change this interface because 
> they worry break security.
> 
> But, I don't know what exactly protected "the process is ptraced" 
> check. If anyone explain the reason and we can remove it. I'm not 
> againt at all.

I did some Git digging - that ptrace check for /proc/$pid/mem read/write 
goes all the way back to the beginning of written human history, aka 
Linux v2.6.12-rc2.

I researched the fragmented history of the stone ages as well, i checked 
out numerous cave paintings, and while much was lost, i was able to 
recover this old fragment of a clue in the cave called 'patch-2.3.27', 
carbon-dated back as far as the previous millennium (!):

  mem_read() in fs/proc/base.c:

+ *  1999, Al Viro. Rewritten. Now it covers the whole per-process part.
+ *  Instead of using magical inumbers to determine the kind of object
+ *  we allocate and fill in-core inodes upon lookup. They don't even
+ *  go into icache. We cache the reference to task_struct upon lookup too.
+ *  Eventually it should become a filesystem in its own. We don't use the
+ *  rest of procfs anymore.

In such a long timespan language has changed much, so not all of this 
scribbling can be interpreted - but one thing appears to be sure: this 
is where the MAY_PTRACE() restriction was introduced to /proc/$pid/mem - 
as part of a massive rewrite.

Alas, the reason for the restriction was not documented, and is feared 
to be lost forever.

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [RFC][PATCH] Cross Memory Attach
  2010-09-16  6:32       ` Brice Goglin
@ 2010-09-16  9:15         ` Brice Goglin
  -1 siblings, 0 replies; 62+ messages in thread
From: Brice Goglin @ 2010-09-16  9:15 UTC (permalink / raw)
  To: Christopher Yeoh
  Cc: Avi Kivity, linux-kernel, Linux Memory Management List, Ingo Molnar

On 16/09/2010 08:32, Brice Goglin wrote:
> I am the guy doing KNEM so I can comment on this. The I/OAT part of KNEM
> was mostly a research topic, it's mostly useless on current machines
> since the memcpy performance is much larger than I/OAT DMA Engine. We
> also have an offload model with a kernel thread, but it wasn't used a
> lot so far. These features can be ignored for the current discussion.

I've just created a knem branch where I removed all the above, and some
other stuff that is not necessary for normal users. So it just contains
the region management code and two commands to copy between regions or
between a region and some local iovecs.

Commands are visible at (still uses ioctls since it doesn't matter while
discussing the features):
https://gforge.inria.fr/scm/viewvc.php/*checkout*/branches/kernel/driver/linux/knem_main.c?root=knem&content-type=text%2Fplain

And the actual driver is at:
https://gforge.inria.fr/scm/viewvc.php/*checkout*/branches/kernel/common/knem_io.h?root=knem&content-type=text%2Fplain

Brice



^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [RFC][PATCH] Cross Memory Attach
  2010-09-16  1:18       ` Christopher Yeoh
@ 2010-09-16  9:26         ` Avi Kivity
  -1 siblings, 0 replies; 62+ messages in thread
From: Avi Kivity @ 2010-09-16  9:26 UTC (permalink / raw)
  To: Christopher Yeoh
  Cc: Bryan Donlan, linux-kernel, Linux Memory Management List, Ingo Molnar

  On 09/16/2010 03:18 AM, Christopher Yeoh wrote:
> On Wed, 15 Sep 2010 23:46:09 +0900
> Bryan Donlan<bdonlan@gmail.com>  wrote:
>
> >  On Wed, Sep 15, 2010 at 19:58, Avi Kivity<avi@redhat.com>  wrote:
> >
> >  >  Instead of those two syscalls, how about a vmfd(pid_t pid, ulong
> >  >  start, ulong len) system call which returns an file descriptor that
> >  >  represents a portion of the process address space.  You can then
> >  >  use preadv() and pwritev() to copy memory, and
> >  >  io_submit(IO_CMD_PREADV) and io_submit(IO_CMD_PWRITEV) for
> >  >  asynchronous variants (especially useful with a dma engine, since
> >  >  that adds latency).
> >  >
> >  >  With some care (and use of mmu_notifiers) you can even mmap() your
> >  >  vmfd and access remote process memory directly.
> >
> >  Rather than introducing a new vmfd() API for this, why not just add
> >  implementations for these more efficient operations to the existing
> >  /proc/$pid/mem interface?
>
> Perhaps I'm misunderstanding something here, but
> accessing /proc/$pid/mem requires ptracing the target process.
> We can't really have all these MPI processes ptraceing each other
> just to send/receive a message....
>

You could have each process open /proc/self/mem and pass the fd using 
SCM_RIGHTS.

That eliminates a race; with copy_to_process(), by the time the pid is 
looked up it might designate a different process.
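
A minimal sketch of that hand-off (standard SCM_RIGHTS ancillary data
over a connected AF_UNIX socket; error handling trimmed):

#include <string.h>
#include <sys/socket.h>
#include <sys/uio.h>

/* send an already-open fd (e.g. /proc/self/mem) to the peer on 'sock' */
static int send_fd(int sock, int fd)
{
	char dummy = 'm';
	struct iovec iov = { .iov_base = &dummy, .iov_len = 1 };
	char cbuf[CMSG_SPACE(sizeof(int))];
	struct msghdr msg;
	struct cmsghdr *cmsg;

	memset(&msg, 0, sizeof(msg));
	memset(cbuf, 0, sizeof(cbuf));
	msg.msg_iov = &iov;
	msg.msg_iovlen = 1;
	msg.msg_control = cbuf;
	msg.msg_controllen = sizeof(cbuf);

	cmsg = CMSG_FIRSTHDR(&msg);
	cmsg->cmsg_level = SOL_SOCKET;
	cmsg->cmsg_type = SCM_RIGHTS;
	cmsg->cmsg_len = CMSG_LEN(sizeof(int));
	memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));

	return sendmsg(sock, &msg, 0) == 1 ? 0 : -1;
}

The receiver recvmsg()s the fd and can then pread()/pwrite() the
sender's memory through it.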

-- 
error compiling committee.c: too many arguments to function


^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [RFC][PATCH] Cross Memory Attach
  2010-09-16  9:15         ` Brice Goglin
@ 2010-09-16 14:00           ` Christopher Yeoh
  -1 siblings, 0 replies; 62+ messages in thread
From: Christopher Yeoh @ 2010-09-16 14:00 UTC (permalink / raw)
  To: Brice Goglin; +Cc: linux-kernel, Linux Memory Management List

On Thu, 16 Sep 2010 11:15:10 +0200
Brice Goglin <Brice.Goglin@inria.fr> wrote:

> On 16/09/2010 08:32, Brice Goglin wrote:
> > I am the guy doing KNEM so I can comment on this. The I/OAT part of
> > KNEM was mostly a research topic, it's mostly useless on current
> > machines since the memcpy performance is much larger than I/OAT DMA
> > Engine. We also have an offload model with a kernel thread, but it
> > wasn't used a lot so far. These features can be ignored for the
> > current discussion.
> 
> I've just created a knem branch where I removed all the above, and
> some other stuff that are not necessary for normal users. So it just
> contains the region management code and two commands to copy between
> regions or between a region and some local iovecs.

When I did the original hpcc runs for CMA vs shared mem double copy, I
also did some KNEM runs as a bit of a sanity check. The CMA OpenMPI
implementation actually uses the infrastructure KNEM put into the
OpenMPI shared mem btl - thanks for that, btw; it made things much easier
for me to test CMA.

Interestingly, although KNEM and CMA are fundamentally doing very
similar things, at least with hpcc I didn't see as much of a gain with
KNEM as with CMA:

MB/s				
Naturally Ordered	4	8	16	32
Base	1235	935	622	419
CMA	4741	3769	1977	703
KNEM	3362	3091	1857	681
				
MB/s				
Randomly Ordered	4	8	16	32
Base	1227	947	638	412
CMA	4666	3682	1978	710
KNEM	3348	3050	1883	684
				
MB/s				
Max Ping Pong	4	8	16	32
Base	2028	1938	1928	1882
CMA	7424	7510	7598	7708
KNEM	5661	5476	6050	6290

I don't know the reason behind the difference - whether it's something
peculiar to hpcc, or there's extra overhead in the way that
knem does setup for copying, or knem wasn't configured
optimally. I haven't done any comparison IMB or NPB runs...

Syscall and setup overhead does have some measurable effect - although I
don't have the numbers for it here, neither KNEM nor CMA does quite as
well with hpcc when compared against a hacked version of hpcc where
everything is declared ahead of time as shared memory, so the receiver
can just do a single copy from userspace - which I think is
representative of the theoretical maximum gain from the single-copy
approach.

Chris
-- 
cyeoh@au.ibm.com

^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [RFC][PATCH] Cross Memory Attach
  2010-09-15 15:11 ` Linus Torvalds
  2010-09-15 15:14   ` Linus Torvalds
@ 2010-09-16 16:27   ` Peter Zijlstra
  2010-09-16 16:54     ` Linus Torvalds
  1 sibling, 1 reply; 62+ messages in thread
From: Peter Zijlstra @ 2010-09-16 16:27 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: Christopher Yeoh, linux-kernel

On Wed, 2010-09-15 at 08:11 -0700, Linus Torvalds wrote:

>    if (copy_to)
>       ret = copy_from_user(..);
>    else
>       ret = copy_to_user(..);
>    kunmap(process_pages[i]);
>    if (ret)
>       goto out; 

Shouldn't we be using kmap_atomic()?

^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [RFC][PATCH] Cross Memory Attach
  2010-09-16 16:27   ` Peter Zijlstra
@ 2010-09-16 16:54     ` Linus Torvalds
  2010-09-16 17:13       ` Peter Zijlstra
  0 siblings, 1 reply; 62+ messages in thread
From: Linus Torvalds @ 2010-09-16 16:54 UTC (permalink / raw)
  To: Peter Zijlstra; +Cc: Christopher Yeoh, linux-kernel

On Thu, Sep 16, 2010 at 9:27 AM, Peter Zijlstra <peterz@infradead.org> wrote:
> On Wed, 2010-09-15 at 08:11 -0700, Linus Torvalds wrote:
>
>>    if (copy_to)
>>       ret = copy_from_user(..);
>>    else
>>       ret = copy_to_user(..);
>>    kunmap(process_pages[i]);
>>    if (ret)
>>       goto out;
>
> Shouldn't we be using kmap_atomic()

Over a copy_to/from_user? Not bloody likely.

                     Linus

^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [RFC][PATCH] Cross Memory Attach
  2010-09-16 16:54     ` Linus Torvalds
@ 2010-09-16 17:13       ` Peter Zijlstra
  2010-09-16 17:34         ` Linus Torvalds
  0 siblings, 1 reply; 62+ messages in thread
From: Peter Zijlstra @ 2010-09-16 17:13 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: Christopher Yeoh, linux-kernel

On Thu, 2010-09-16 at 09:54 -0700, Linus Torvalds wrote:
> On Thu, Sep 16, 2010 at 9:27 AM, Peter Zijlstra <peterz@infradead.org> wrote:
> > On Wed, 2010-09-15 at 08:11 -0700, Linus Torvalds wrote:
> >
> >>    if (copy_to)
> >>       ret = copy_from_user(..);
> >>    else
> >>       ret = copy_to_user(..);
> >>    kunmap(process_pages[i]);
> >>    if (ret)
> >>       goto out;
> >
> > Shouldn't we be using kmap_atomic()
> 
> Over a copy_to/from_user? Not bloody likely.

Gah, indeed. OK, since it's not nested, kmap() should indeed work. The
alternative is using get_user_pages() on both address spaces, but I
guess that makes things unnecessarily complex.



^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [RFC][PATCH] Cross Memory Attach
  2010-09-16 17:13       ` Peter Zijlstra
@ 2010-09-16 17:34         ` Linus Torvalds
  2010-09-16 17:47           ` Peter Zijlstra
  2010-09-19  4:55           ` Yuhong Bao
  0 siblings, 2 replies; 62+ messages in thread
From: Linus Torvalds @ 2010-09-16 17:34 UTC (permalink / raw)
  To: Peter Zijlstra; +Cc: Christopher Yeoh, linux-kernel

On Thu, Sep 16, 2010 at 10:13 AM, Peter Zijlstra <peterz@infradead.org> wrote:
>>
>> Over a copy_to/from_user? Not bloody likely.
>
> Gah, indeed. OK, since its not nested kmap() should indeed work. The
> alternative is using get_user_pages() on both address spaces, but I
> guess that makes things unnecessarily complex.

.. and perform horribly badly. And since the whole point was to do
this really efficiently, that's not good.

What *would* work would be to have a fast case that does kmap_atomic()
together with a copy_to/from_user_atomic(). And when that fast-case
fails, do the full kmap. Slightly more complex than the suggested
patch, but not horribly so (just a few more lines, no fundamental
complexities).
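
Something like this, reusing the loop variables from the patch (sketch
only; kmap_atomic()/kunmap_atomic() take a km_type slot in current
kernels):

	/* fast path: atomic mapping plus a user copy that fails instead
	 * of faulting the user page in */
	target_kaddr = kmap_atomic(process_pages[i], KM_USER0);
	if (copy_to)
		ret = __copy_from_user_inatomic(target_kaddr + start_offset,
						user_buf + *bytes_copied,
						bytes_to_copy);
	else
		ret = __copy_to_user_inatomic(user_buf + *bytes_copied,
					      target_kaddr + start_offset,
					      bytes_to_copy);
	kunmap_atomic(target_kaddr, KM_USER0);

	if (ret) {
		/* slow path: sleeping kmap() and a copy that can fault */
		target_kaddr = kmap(process_pages[i]);
		if (copy_to)
			ret = copy_from_user(target_kaddr + start_offset,
					     user_buf + *bytes_copied,
					     bytes_to_copy);
		else
			ret = copy_to_user(user_buf + *bytes_copied,
					   target_kaddr + start_offset,
					   bytes_to_copy);
		kunmap(process_pages[i]);
	}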

Of course, these days I would seriously suggest against trying to
optimize the kmap() case. It only matters on crap hardware these days.
Anybody running HIGHMEM in 2010 and thinks that it makes sense
deserves the pain they get. We should not complicate the kernel further
for it, and sane architectures will have a no-op kmap().

So the real cost there is likely not the kmap as much as the
set_page_dirty_lock() for the copy_to case. But you'd need to profile
it to see how big of a hit it is compared to the copy itself.
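
[A minimal sketch of the fast-path-plus-fallback copy described above, not
part of the posted patch: it assumes the target page has already been pinned
with get_user_pages(), uses the modern single-argument kmap_atomic(), and the
function name and error handling are illustrative only.]

static int copy_one_page(struct page *page, unsigned long offset,
                         char __user *user_buf, size_t bytes, int copy_to)
{
        void *kaddr;
        size_t left;

        /*
         * Fast path: atomic kmap plus the non-faulting copy variants.
         * kmap_atomic() disables page faults, so a fault shows up as a
         * short copy instead of a sleep.
         */
        kaddr = kmap_atomic(page);
        if (copy_to)
                left = __copy_from_user_inatomic(kaddr + offset, user_buf, bytes);
        else
                left = __copy_to_user_inatomic(user_buf, kaddr + offset, bytes);
        kunmap_atomic(kaddr);

        if (left) {
                /* Slow path: the fast case faulted, retry under a sleeping kmap(). */
                kaddr = kmap(page);
                if (copy_to)
                        left = copy_from_user(kaddr + offset, user_buf, bytes);
                else
                        left = copy_to_user(user_buf, kaddr + offset, bytes);
                kunmap(page);
                if (left)
                        return -EFAULT;
        }

        if (copy_to)
                set_page_dirty_lock(page);      /* the cost noted above */
        return 0;
}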

                           Linus

^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [RFC][PATCH] Cross Memory Attach
  2010-09-16 17:34         ` Linus Torvalds
@ 2010-09-16 17:47           ` Peter Zijlstra
  2010-09-16 17:54             ` Linus Torvalds
  2010-09-19  4:55           ` Yuhong Bao
  1 sibling, 1 reply; 62+ messages in thread
From: Peter Zijlstra @ 2010-09-16 17:47 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: Christopher Yeoh, linux-kernel

On Thu, 2010-09-16 at 10:34 -0700, Linus Torvalds wrote:
> 
> Of course, these days I would seriously suggest against trying to
> optimize the kmap() case. It only matters on crap hardware these days.
> Anybody running HIGHMEM in 2010 and thinks that it makes sense
> deserves the pain they get. We should not complicate the kernel further
> for it, and sane architectures will have a no-op kmap(). 

OK, fully agreed. Someone ought to tell ARM though :-)


^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [RFC][PATCH] Cross Memory Attach
  2010-09-16 17:47           ` Peter Zijlstra
@ 2010-09-16 17:54             ` Linus Torvalds
  2010-09-16 18:00               ` Linus Torvalds
  2010-09-19 19:20               ` Yuhong Bao
  0 siblings, 2 replies; 62+ messages in thread
From: Linus Torvalds @ 2010-09-16 17:54 UTC (permalink / raw)
  To: Peter Zijlstra; +Cc: Christopher Yeoh, linux-kernel

On Thu, Sep 16, 2010 at 10:47 AM, Peter Zijlstra <peterz@infradead.org> wrote:
> On Thu, 2010-09-16 at 10:34 -0700, Linus Torvalds wrote:
>>
>> Of course, these days I would seriously suggest against trying to
>> optimize the kmap() case. It only matters on crap hardware these days.
>> Anybody running HIGHMEM in 2010 and thinks that it makes sense
>> deserves the pain they get. We should not complicate the kernel further
>> for it, and sane architectures will have a no-op kmap().
>
> OK, fully agreed. Someone ought to tell ARM though :-)

You know what? I don't care. If the fact that ARM is messing up means
that they will never be able to do well in the micro-server space,
that's _their_ problem.

I fought HIGHMEM tooth and nail when it appeared originally. I lost,
because we really didn't have any choice. But there is no way I'm
going to say "oh, HIGHMEM still makes sense in 2010 because the ARM
guys are now making all the same mistakes Intel did in 1992". Because
these days we _do_ have a choice.

And all the rumors are that there will be a 64-bit ARM too. So their
PAE mess will be out before, but nobody sane should really consider it
a primary issue. It will work, but it will work suboptimally.  That's
what you get when you have bad hardware design.

                       Linus

^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [RFC][PATCH] Cross Memory Attach
  2010-09-16 17:54             ` Linus Torvalds
@ 2010-09-16 18:00               ` Linus Torvalds
  2010-09-19  4:44                 ` Yuhong Bao
  2010-09-19 19:20               ` Yuhong Bao
  1 sibling, 1 reply; 62+ messages in thread
From: Linus Torvalds @ 2010-09-16 18:00 UTC (permalink / raw)
  To: Peter Zijlstra; +Cc: Christopher Yeoh, linux-kernel

On Thu, Sep 16, 2010 at 10:54 AM, Linus Torvalds
<torvalds@linux-foundation.org> wrote:
>   But there is no way I'm
> going to say "oh, HIGHMEM still makes sense in 2010 because the ARM
> guys are now making all the same mistakes Intel did in 1992".

Off by a couple of years. Intel did it with the PPro, in 1995,
actually. But it's still going to be 16 years later by the time ARM
LPAE actually ships, I guess.

                        Linus

^ permalink raw reply	[flat|nested] 62+ messages in thread

* RE: [RFC][PATCH] Cross Memory Attach
  2010-09-16 18:00               ` Linus Torvalds
@ 2010-09-19  4:44                 ` Yuhong Bao
  0 siblings, 0 replies; 62+ messages in thread
From: Yuhong Bao @ 2010-09-19  4:44 UTC (permalink / raw)
  To: torvalds, peterz; +Cc: cyeoh, linux-kernel



> On Thu, Sep 16, 2010 at 10:54 AM, Linus Torvalds
>  wrote:
> > But there is no way I'm
> > going to say "oh, HIGHMEM still makes sense in 2010 because the ARM
> > guys are now making all the same mistakes Intel did in 1992".
>
> Off by a couple of years. Intel did it with the PPro, in 1995,
> actually. But it's still going to be 16 years later by the time ARM
> LPAE actually ships, I guess.

Yep, back when the 386 was designed, having 4GB for both the physical and
virtual address space (which is what makes HIGHMEM necessary at all once
you have more than about 1GB of RAM) was not a mistake.

^ permalink raw reply	[flat|nested] 62+ messages in thread

* RE: [RFC][PATCH] Cross Memory Attach
  2010-09-16 17:34         ` Linus Torvalds
  2010-09-16 17:47           ` Peter Zijlstra
@ 2010-09-19  4:55           ` Yuhong Bao
  1 sibling, 0 replies; 62+ messages in thread
From: Yuhong Bao @ 2010-09-19  4:55 UTC (permalink / raw)
  To: torvalds, peterz; +Cc: cyeoh, linux-kernel


> Of course, these days I would seriously suggest against trying to
> optimize the kmap() case. It only matters on crap hardware these days.
> Anybody running HIGHMEM in 2010 and thinks that it makes sense
> deserves the pain they get. We should not complicate the kernel further
> for it, and sane architectures will have a no-op kmap().

Well, keep in mind that most even vaguely recent 32-bit x86 distro kernels
have HIGHMEM4G enabled by default, and some recent ones even have HIGHMEM64G
enabled in order to get NX support. (And yes, as I said before, there are
many processors that cannot run in long mode but do have NX.)

Yuhong Bao

^ permalink raw reply	[flat|nested] 62+ messages in thread

* RE: [RFC][PATCH] Cross Memory Attach
  2010-09-16 17:54             ` Linus Torvalds
  2010-09-16 18:00               ` Linus Torvalds
@ 2010-09-19 19:20               ` Yuhong Bao
  2010-09-19 21:48                 ` Russell King - ARM Linux
  1 sibling, 1 reply; 62+ messages in thread
From: Yuhong Bao @ 2010-09-19 19:20 UTC (permalink / raw)
  To: torvalds, peterz; +Cc: cyeoh, linux-kernel, linux


(Adding Russell King of ARM Linux to CC list)

----------------------------------------
> From: torvalds@linux-foundation.org
> Date: Thu, 16 Sep 2010 10:54:29 -0700
> Subject: Re: [RFC][PATCH] Cross Memory Attach
> To: peterz@infradead.org
> CC: cyeoh@au1.ibm.com; linux-kernel@vger.kernel.org
>
> On Thu, Sep 16, 2010 at 10:47 AM, Peter Zijlstra  wrote:
> > On Thu, 2010-09-16 at 10:34 -0700, Linus Torvalds wrote:
> >>
> >> Of course, these days I would seriously suggest against trying to
> >> optimize the kmap() case. It only matters on crap hardware these days.
> >> Anybody running HIGHMEM in 2010 and thinks that it makes sense
> >> deserves the pain they get. We should not complicate the kernel further
> >> for it, and sane architectures will have a no-op kmap().
> >
> > OK, fully agreed. Someone ought to tell ARM though :-)
>
> You know what? I don't care. If the fact that ARM is messing up means
> that they will never be able to do well in the micro-server space,
> that's _their_ problem.
>
> I fought HIGHMEM tooth and nail when it appeared originally. I lost,
> because we really didn't have any choice. But there is no way I'm
> going to say "oh, HIGHMEM still makes sense in 2010 because the ARM
> guys are now making all the same mistakes Intel did in 1992". Because
> these days we _do_ have a choice.
>
> And all the rumors are that there will be a 64-bit ARM too. So their
> PAE mess will be out before, but nobody sane should really consider it
> a primary issue. It will work, but it will work suboptimally. That's
> what you get when you have bad hardware design.
>
> Linus

^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [RFC][PATCH] Cross Memory Attach
  2010-09-19 19:20               ` Yuhong Bao
@ 2010-09-19 21:48                 ` Russell King - ARM Linux
  2010-09-19 22:47                   ` Yuhong Bao
  0 siblings, 1 reply; 62+ messages in thread
From: Russell King - ARM Linux @ 2010-09-19 21:48 UTC (permalink / raw)
  To: Yuhong Bao; +Cc: torvalds, peterz, cyeoh, linux-kernel

On Sun, Sep 19, 2010 at 12:20:59PM -0700, Yuhong Bao wrote:
> 
> (Adding Russell King of ARM Linux to CC list)

I'm not sure why you're repeatedly sending me this email (this is the
second in less than 24 hours.)

In any case, it's not like I can influence the direction ARM Ltd take
their architecture - I only hear about stuff after the decisions have
been taken and the direction set.  (Sometimes I hear about stuff just
before the public announcement.)

So, if you think I can do anything to stop the move towards LPAS
(large physical address space) and tell ARM to go to 64-bit directly,
you're sadly mistaken.

> ----------------------------------------
> > From: torvalds@linux-foundation.org
> > Date: Thu, 16 Sep 2010 10:54:29 -0700
> > Subject: Re: [RFC][PATCH] Cross Memory Attach
> > To: peterz@infradead.org
> > CC: cyeoh@au1.ibm.com; linux-kernel@vger.kernel.org
> >
> > On Thu, Sep 16, 2010 at 10:47 AM, Peter Zijlstra  wrote:
> > > On Thu, 2010-09-16 at 10:34 -0700, Linus Torvalds wrote:
> > >>
> > >> Of course, these days I would seriously suggest against trying to
> > >> optimize the kmap() case. It only matters on crap hardware these days.
> > >> Anybody running HIGHMEM in 2010 and thinks that it makes sense
> > >> deserves the pain they get. We should not complicate the kernel further
> > >> for it, and sane architectures will have a no-op kmap().
> > >
> > > OK, fully agreed. Someone ought to tell ARM though :-)
> >
> > You know what? I don't care. If the fact that ARM is messing up means
> > that they will never be able to do well in the micro-server space,
> > that's _their_ problem.
> >
> > I fought HIGHMEM tooth and nail when it appeared originally. I lost,
> > because we really didn't have any choice. But there is no way I'm
> > going to say "oh, HIGHMEM still makes sense in 2010 because the ARM
> > guys are now making all the same mistakes Intel did in 1992". Because
> > these days we _do_ have a choice.
> >
> > And all the rumors are that there will be a 64-bit ARM too. So their
> > PAE mess will be out before, but nobody sane should really consider it
> > a primary issue. It will work, but it will work suboptimally. That's
> > what you get when you have bad hardware design.
> >
> > Linus

^ permalink raw reply	[flat|nested] 62+ messages in thread

* RE: [RFC][PATCH] Cross Memory Attach
  2010-09-19 21:48                 ` Russell King - ARM Linux
@ 2010-09-19 22:47                   ` Yuhong Bao
  0 siblings, 0 replies; 62+ messages in thread
From: Yuhong Bao @ 2010-09-19 22:47 UTC (permalink / raw)
  To: linux; +Cc: torvalds, peterz, cyeoh, linux-kernel


> > (Adding Russell King of ARM Linux to CC list)
>
> I'm not sure why you're repeatedly sending me this email (this is the
> second in less than 24 hours.)

Sorry, the first one was sent as multipart/alternative with both HTML
and plain text, which caused it to be rejected by LKML.

> In any case, it's not like I can influence the direction ARM Ltd take
> their architecture - I only hear about stuff after the decisions have
> been taken and the direction set. (Sometimes I hear about stuff just
> before the public announcement.)
>
> So, if you think I can do anything to stop the move towards LPAS
> (large physical address space) and tell ARM to go to 64-bit directly,
> you're sadly mistaken.
If I knew the email addresses of any ARM engineers, I'd add them to the CC list too.

Yuhong Bao

^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [RFC][PATCH] Cross Memory Attach
  2010-09-16  9:26         ` Avi Kivity
@ 2010-11-02  3:37           ` Christopher Yeoh
  -1 siblings, 0 replies; 62+ messages in thread
From: Christopher Yeoh @ 2010-11-02  3:37 UTC (permalink / raw)
  To: Avi Kivity
  Cc: Bryan Donlan, linux-kernel, Linux Memory Management List, Ingo Molnar

On Thu, 16 Sep 2010 11:26:36 +0200
Avi Kivity <avi@redhat.com> wrote:
>   On 09/16/2010 03:18 AM, Christopher Yeoh wrote:
> > On Wed, 15 Sep 2010 23:46:09 +0900
> > Bryan Donlan<bdonlan@gmail.com>  wrote:
> >
> > >  On Wed, Sep 15, 2010 at 19:58, Avi Kivity<avi@redhat.com>  wrote:
> > >
> > >  >  Instead of those two syscalls, how about a vmfd(pid_t pid,
> > >  > ulong start, ulong len) system call which returns an file
> > >  > descriptor that represents a portion of the process address
> > >  > space.  You can then use preadv() and pwritev() to copy
> > >  > memory, and io_submit(IO_CMD_PREADV) and
> > >  > io_submit(IO_CMD_PWRITEV) for asynchronous variants
> > >  > (especially useful with a dma engine, since that adds latency).
> > >  >
> > >  >  With some care (and use of mmu_notifiers) you can even mmap()
> > >  > your vmfd and access remote process memory directly.
> > >
> > >  Rather than introducing a new vmfd() API for this, why not just
> > > add implementations for these more efficient operations to the
> > > existing /proc/$pid/mem interface?
> >
> > Perhaps I'm misunderstanding something here, but
> > accessing /proc/$pid/mem requires ptracing the target process.
> > We can't really have all these MPI processes ptraceing each other
> > just to send/receive a message....
> >
> 
> You could have each process open /proc/self/mem and pass the fd using 
> SCM_RIGHTS.
> 
> That eliminates a race; with copy_to_process(), by the time the pid
> is looked up it might designate a different process.

Just to revive an old thread (I've been on holidays): this doesn't
work either. The ptrace check is done by mem_read() (i.e. on each read),
so even if you do pass the fd using SCM_RIGHTS, reads on the fd still
fail.
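
[For reference, handing an open descriptor such as /proc/self/mem to another
process over an AF_UNIX socket looks roughly like the sketch below; the helper
name is illustrative. The point above is that even a descriptor received this
way still fails the per-read ptrace check in mem_read().]

#include <string.h>
#include <sys/socket.h>
#include <sys/uio.h>

/* Send one already-open file descriptor over a connected AF_UNIX socket
 * using SCM_RIGHTS ancillary data. */
static int send_fd(int sock, int fd_to_pass)
{
        char dummy = 'x';
        struct iovec iov = { .iov_base = &dummy, .iov_len = 1 };
        union {
                struct cmsghdr align;           /* ensures alignment */
                char buf[CMSG_SPACE(sizeof(int))];
        } u;
        struct msghdr msg = {
                .msg_iov = &iov,
                .msg_iovlen = 1,
                .msg_control = u.buf,
                .msg_controllen = sizeof(u.buf),
        };
        struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg);

        cmsg->cmsg_level = SOL_SOCKET;
        cmsg->cmsg_type = SCM_RIGHTS;
        cmsg->cmsg_len = CMSG_LEN(sizeof(int));
        memcpy(CMSG_DATA(cmsg), &fd_to_pass, sizeof(int));

        return sendmsg(sock, &msg, 0) == 1 ? 0 : -1;
}

The receiving side recovers the descriptor with recvmsg() and CMSG_FIRSTHDR();
it is the subsequent pread() on that descriptor that the permission check
rejects.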

So unless there's good reason to believe that the ptrace permission
check is no longer needed, the /proc/pid/mem interface doesn't seem to
be an option for what we want to do.

Oh, and interestingly, reading from /proc/pid/mem involves a double copy:
first into a temporary kernel page and then out to userspace. But that is
fixable.

Regards,

Chris
-- 
cyeoh@ozlabs.org

^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [RFC][PATCH] Cross Memory Attach
  2010-11-02  3:37           ` Christopher Yeoh
@ 2010-11-02 11:10             ` Avi Kivity
  -1 siblings, 0 replies; 62+ messages in thread
From: Avi Kivity @ 2010-11-02 11:10 UTC (permalink / raw)
  To: Christopher Yeoh
  Cc: Bryan Donlan, linux-kernel, Linux Memory Management List, Ingo Molnar

  On 11/01/2010 11:37 PM, Christopher Yeoh wrote:
> >
> >  You could have each process open /proc/self/mem and pass the fd using
> >  SCM_RIGHTS.
> >
> >  That eliminates a race; with copy_to_process(), by the time the pid
> >  is looked up it might designate a different process.
>
> Just to revive an old thread (I've been on holidays), but this doesn't
> work either. the ptrace check is done by mem_read (eg on each read) so
> even if you do pass the fd using SCM_RIGHTS, reads on the fd still
> fail.
>
> So unless there's good reason to believe that the ptrace permission
> check is no longer needed, the /proc/pid/mem interface doesn't seem to
> be an option for what we want to do.
>

Perhaps move the check to open().  I can understand the desire to avoid 
letting random processes peek at each other's memory, but once a process 
has opened its own /proc/self/mem and explicitly passed it to another, 
we should allow it.
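
[A rough sketch of that idea, under the assumption that both the permission
check and the mm lookup move to open time; get_proc_task(), ptrace_may_access()
and get_task_mm() are real kernel interfaces, but the wiring shown here is
illustrative rather than the actual fs/proc/base.c code.]

static int mem_open(struct inode *inode, struct file *file)
{
        struct task_struct *task = get_proc_task(inode);
        struct mm_struct *mm = NULL;

        if (!task)
                return -ESRCH;

        /* Check once, against whoever is opening the file... */
        if (ptrace_may_access(task, PTRACE_MODE_ATTACH))
                mm = get_task_mm(task);
        put_task_struct(task);

        if (!mm)
                return -EACCES;

        /*
         * ...and let read()/write() use the mm grabbed here, with no
         * per-read ptrace check.  ->release() would drop it with mmput().
         */
        file->private_data = mm;
        return 0;
}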

-- 
I have a truly marvellous patch that fixes the bug which this
signature is too narrow to contain.


^ permalink raw reply	[flat|nested] 62+ messages in thread

end of thread, other threads:[~2010-11-02 11:34 UTC | newest]

Thread overview: 62+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2010-09-15  1:18 [RFC][PATCH] Cross Memory Attach Christopher Yeoh
2010-09-15  8:02 ` Ingo Molnar
2010-09-15  8:02   ` Ingo Molnar
2010-09-15  8:16   ` Ingo Molnar
2010-09-15  8:16     ` Ingo Molnar
2010-09-15 13:23     ` Christopher Yeoh
2010-09-15 13:23       ` Christopher Yeoh
2010-09-15 13:20   ` Christopher Yeoh
2010-09-15 13:20     ` Christopher Yeoh
2010-09-15 10:58 ` Avi Kivity
2010-09-15 10:58   ` Avi Kivity
2010-09-15 13:51   ` Ingo Molnar
2010-09-15 13:51     ` Ingo Molnar
2010-09-15 16:10     ` Avi Kivity
2010-09-15 16:10       ` Avi Kivity
2010-09-15 14:42   ` Christopher Yeoh
2010-09-15 14:42     ` Christopher Yeoh
2010-09-15 14:52     ` Linus Torvalds
2010-09-15 14:52       ` Linus Torvalds
2010-09-15 15:44       ` Robin Holt
2010-09-15 15:44         ` Robin Holt
2010-09-16  6:32     ` Brice Goglin
2010-09-16  6:32       ` Brice Goglin
2010-09-16  9:15       ` Brice Goglin
2010-09-16  9:15         ` Brice Goglin
2010-09-16 14:00         ` Christopher Yeoh
2010-09-16 14:00           ` Christopher Yeoh
2010-09-15 14:46   ` Bryan Donlan
2010-09-15 14:46     ` Bryan Donlan
2010-09-15 16:13     ` Avi Kivity
2010-09-15 16:13       ` Avi Kivity
2010-09-15 19:35       ` Eric W. Biederman
2010-09-15 19:35         ` Eric W. Biederman
2010-09-16  1:18     ` Christopher Yeoh
2010-09-16  1:18       ` Christopher Yeoh
2010-09-16  9:26       ` Avi Kivity
2010-09-16  9:26         ` Avi Kivity
2010-11-02  3:37         ` Christopher Yeoh
2010-11-02  3:37           ` Christopher Yeoh
2010-11-02 11:10           ` Avi Kivity
2010-11-02 11:10             ` Avi Kivity
2010-09-16  1:58     ` KOSAKI Motohiro
2010-09-16  1:58       ` KOSAKI Motohiro
2010-09-16  8:08       ` Ingo Molnar
2010-09-16  8:08         ` Ingo Molnar
2010-09-15 15:11 ` Linus Torvalds
2010-09-15 15:14   ` Linus Torvalds
2010-09-16  2:25     ` Christopher Yeoh
2010-09-16 16:27   ` Peter Zijlstra
2010-09-16 16:54     ` Linus Torvalds
2010-09-16 17:13       ` Peter Zijlstra
2010-09-16 17:34         ` Linus Torvalds
2010-09-16 17:47           ` Peter Zijlstra
2010-09-16 17:54             ` Linus Torvalds
2010-09-16 18:00               ` Linus Torvalds
2010-09-19  4:44                 ` Yuhong Bao
2010-09-19 19:20               ` Yuhong Bao
2010-09-19 21:48                 ` Russell King - ARM Linux
2010-09-19 22:47                   ` Yuhong Bao
2010-09-19  4:55           ` Yuhong Bao
2010-09-15 16:07 ` Valdis.Kletnieks
2010-09-16  2:17   ` Christopher Yeoh
