linux-mm.kvack.org archive mirror
 help / color / mirror / Atom feed
* Re: [RFC PATCH] fs/exec: Add the support for ELF program's NUMA replication
  2021-09-06 16:16 [RFC PATCH] fs/exec: Add the support for ELF program's NUMA replication Huang Shijie
@ 2021-09-06  9:35 ` David Hildenbrand
  2021-09-09  9:46   ` Huang Shijie
  2021-09-06 12:09 ` Matthew Wilcox
  1 sibling, 1 reply; 6+ messages in thread
From: David Hildenbrand @ 2021-09-06  9:35 UTC (permalink / raw)
  To: Huang Shijie, viro
  Cc: akpm, jlayton, bfields, torvalds, linux-fsdevel, linux-kernel,
	linux-mm, song.bao.hua, patches, zwang

On 06.09.21 18:16, Huang Shijie wrote:
> This patch adds AT_NUMA_REPLICATION for execveat().
> 
> If this flag is set, the kernel will trigger COW(copy on write)
> on the mmapped ELF binary. So the program will have a copied-page
> on its NUMA node, even if the original page in page cache is
> on other NUMA nodes.

Am I missing something important or is this just absolutely not what we 
want?

This means that for each and every invocation of the binary, we'll COW 
the complete binary -- an awful lot of wasted main memory and swap.

This is not a NUMA replication, this is en executable ELF code replication.

> 
> Signed-off-by: Huang Shijie <shijie@os.amperecomputing.com>
> ---
>   fs/binfmt_elf.c            | 27 ++++++++++++++++++++++-----
>   fs/exec.c                  |  5 ++++-
>   include/linux/binfmts.h    |  1 +
>   include/linux/mm.h         |  2 ++
>   include/uapi/linux/fcntl.h |  2 ++
>   mm/mprotect.c              |  2 +-
>   6 files changed, 32 insertions(+), 7 deletions(-)
> 
> diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
> index 439ed81e755a..fac8f4a4555a 100644
> --- a/fs/binfmt_elf.c
> +++ b/fs/binfmt_elf.c
> @@ -362,13 +362,14 @@ create_elf_tables(struct linux_binprm *bprm, const struct elfhdr *exec,
>   
>   static unsigned long elf_map(struct file *filep, unsigned long addr,
>   		const struct elf_phdr *eppnt, int prot, int type,
> -		unsigned long total_size)
> +		unsigned long total_size, int numa_replication)
>   {
>   	unsigned long map_addr;
>   	unsigned long size = eppnt->p_filesz + ELF_PAGEOFFSET(eppnt->p_vaddr);
>   	unsigned long off = eppnt->p_offset - ELF_PAGEOFFSET(eppnt->p_vaddr);
>   	addr = ELF_PAGESTART(addr);
>   	size = ELF_PAGEALIGN(size);
> +	int ret;
>   
>   	/* mmap() will return -EINVAL if given a zero size, but a
>   	 * segment with zero filesize is perfectly valid */
> @@ -385,11 +386,26 @@ static unsigned long elf_map(struct file *filep, unsigned long addr,
>   	*/
>   	if (total_size) {
>   		total_size = ELF_PAGEALIGN(total_size);
> -		map_addr = vm_mmap(filep, addr, total_size, prot, type, off);
> +
> +		if (numa_replication) {
> +			/* Trigger the COW for this ELF code section */
> +			map_addr = vm_mmap(filep, addr, total_size, prot | PROT_WRITE,
> +						type | MAP_POPULATE, off);
> +			if (!IS_ERR_VALUE(map_addr) && !(prot & PROT_WRITE)) {
> +				/* Change back */
> +				ret = do_mprotect_pkey(map_addr, total_size, prot, -1);
> +				if (ret)
> +					return ret;
> +			}
> +		} else {
> +			map_addr = vm_mmap(filep, addr, total_size, prot, type, off);
> +		}
> +
>   		if (!BAD_ADDR(map_addr))
>   			vm_munmap(map_addr+size, total_size-size);
> -	} else
> +	} else {
>   		map_addr = vm_mmap(filep, addr, size, prot, type, off);
> +	}
>   
>   	if ((type & MAP_FIXED_NOREPLACE) &&
>   	    PTR_ERR((void *)map_addr) == -EEXIST)
> @@ -635,7 +651,7 @@ static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex,
>   				load_addr = -vaddr;
>   
>   			map_addr = elf_map(interpreter, load_addr + vaddr,
> -					eppnt, elf_prot, elf_type, total_size);
> +					eppnt, elf_prot, elf_type, total_size, 0);
>   			total_size = 0;
>   			error = map_addr;
>   			if (BAD_ADDR(map_addr))
> @@ -1139,7 +1155,8 @@ static int load_elf_binary(struct linux_binprm *bprm)
>   		}
>   
>   		error = elf_map(bprm->file, load_bias + vaddr, elf_ppnt,
> -				elf_prot, elf_flags, total_size);
> +				elf_prot, elf_flags, total_size,
> +				bprm->support_numa_replication);
>   		if (BAD_ADDR(error)) {
>   			retval = IS_ERR((void *)error) ?
>   				PTR_ERR((void*)error) : -EINVAL;
> diff --git a/fs/exec.c b/fs/exec.c
> index 38f63451b928..d27efa540641 100644
> --- a/fs/exec.c
> +++ b/fs/exec.c
> @@ -900,7 +900,7 @@ static struct file *do_open_execat(int fd, struct filename *name, int flags)
>   		.lookup_flags = LOOKUP_FOLLOW,
>   	};
>   
> -	if ((flags & ~(AT_SYMLINK_NOFOLLOW | AT_EMPTY_PATH)) != 0)
> +	if ((flags & ~(AT_SYMLINK_NOFOLLOW | AT_EMPTY_PATH | AT_NUMA_REPLICATION)) != 0)
>   		return ERR_PTR(-EINVAL);
>   	if (flags & AT_SYMLINK_NOFOLLOW)
>   		open_exec_flags.lookup_flags &= ~LOOKUP_FOLLOW;
> @@ -1828,6 +1828,9 @@ static int bprm_execve(struct linux_binprm *bprm,
>   	if (retval)
>   		goto out;
>   
> +	/* Do we support NUMA replication for this program? */
> +	bprm->support_numa_replication = flags & AT_NUMA_REPLICATION;
> +
>   	retval = exec_binprm(bprm);
>   	if (retval < 0)
>   		goto out;
> diff --git a/include/linux/binfmts.h b/include/linux/binfmts.h
> index 049cf9421d83..1874e1732f20 100644
> --- a/include/linux/binfmts.h
> +++ b/include/linux/binfmts.h
> @@ -64,6 +64,7 @@ struct linux_binprm {
>   	struct rlimit rlim_stack; /* Saved RLIMIT_STACK used during exec. */
>   
>   	char buf[BINPRM_BUF_SIZE];
> +	int support_numa_replication;
>   } __randomize_layout;
>   
>   #define BINPRM_FLAGS_ENFORCE_NONDUMP_BIT 0
> diff --git a/include/linux/mm.h b/include/linux/mm.h
> index 7ca22e6e694a..76611381be2a 100644
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -3244,6 +3244,8 @@ unsigned long wp_shared_mapping_range(struct address_space *mapping,
>   #endif
>   
>   extern int sysctl_nr_trim_pages;
> +int do_mprotect_pkey(unsigned long start, size_t len,
> +			unsigned long prot, int pkey);
>   
>   #ifdef CONFIG_PRINTK
>   void mem_dump_obj(void *object);
> diff --git a/include/uapi/linux/fcntl.h b/include/uapi/linux/fcntl.h
> index 2f86b2ad6d7e..de99c5ae8eca 100644
> --- a/include/uapi/linux/fcntl.h
> +++ b/include/uapi/linux/fcntl.h
> @@ -111,4 +111,6 @@
>   
>   #define AT_RECURSIVE		0x8000	/* Apply to the entire subtree */
>   
> +#define AT_NUMA_REPLICATION	0x10000	/* Support NUMA replication for the ELF program */
> +
>   #endif /* _UAPI_LINUX_FCNTL_H */
> diff --git a/mm/mprotect.c b/mm/mprotect.c
> index 883e2cc85cad..d1f8cececfed 100644
> --- a/mm/mprotect.c
> +++ b/mm/mprotect.c
> @@ -519,7 +519,7 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev,
>   /*
>    * pkey==-1 when doing a legacy mprotect()
>    */
> -static int do_mprotect_pkey(unsigned long start, size_t len,
> +int do_mprotect_pkey(unsigned long start, size_t len,
>   		unsigned long prot, int pkey)
>   {
>   	unsigned long nstart, end, tmp, reqprot;
> 


-- 
Thanks,

David / dhildenb



^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [RFC PATCH] fs/exec: Add the support for ELF program's NUMA replication
  2021-09-06 16:16 [RFC PATCH] fs/exec: Add the support for ELF program's NUMA replication Huang Shijie
  2021-09-06  9:35 ` David Hildenbrand
@ 2021-09-06 12:09 ` Matthew Wilcox
  2021-09-09  9:48   ` Huang Shijie
  1 sibling, 1 reply; 6+ messages in thread
From: Matthew Wilcox @ 2021-09-06 12:09 UTC (permalink / raw)
  To: Huang Shijie
  Cc: viro, akpm, jlayton, bfields, torvalds, linux-fsdevel,
	linux-kernel, linux-mm, song.bao.hua, patches, zwang

On Mon, Sep 06, 2021 at 04:16:13PM +0000, Huang Shijie wrote:
> This patch adds AT_NUMA_REPLICATION for execveat().
> 
> If this flag is set, the kernel will trigger COW(copy on write)
> on the mmapped ELF binary. So the program will have a copied-page
> on its NUMA node, even if the original page in page cache is
> on other NUMA nodes.

This does absolutely nothing for programs which run on multiple NUMA
nodes at the same time.

Also, I dislike the abuse of the term "COW" for this.  It's COF --
Copy On Fault.


^ permalink raw reply	[flat|nested] 6+ messages in thread

* [RFC PATCH] fs/exec: Add the support for ELF program's NUMA replication
@ 2021-09-06 16:16 Huang Shijie
  2021-09-06  9:35 ` David Hildenbrand
  2021-09-06 12:09 ` Matthew Wilcox
  0 siblings, 2 replies; 6+ messages in thread
From: Huang Shijie @ 2021-09-06 16:16 UTC (permalink / raw)
  To: viro
  Cc: akpm, jlayton, bfields, torvalds, linux-fsdevel, linux-kernel,
	linux-mm, song.bao.hua, patches, zwang, Huang Shijie

This patch adds AT_NUMA_REPLICATION for execveat().

If this flag is set, the kernel will trigger COW(copy on write)
on the mmapped ELF binary. So the program will have a copied-page
on its NUMA node, even if the original page in page cache is
on other NUMA nodes.

Signed-off-by: Huang Shijie <shijie@os.amperecomputing.com>
---
 fs/binfmt_elf.c            | 27 ++++++++++++++++++++++-----
 fs/exec.c                  |  5 ++++-
 include/linux/binfmts.h    |  1 +
 include/linux/mm.h         |  2 ++
 include/uapi/linux/fcntl.h |  2 ++
 mm/mprotect.c              |  2 +-
 6 files changed, 32 insertions(+), 7 deletions(-)

diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 439ed81e755a..fac8f4a4555a 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -362,13 +362,14 @@ create_elf_tables(struct linux_binprm *bprm, const struct elfhdr *exec,
 
 static unsigned long elf_map(struct file *filep, unsigned long addr,
 		const struct elf_phdr *eppnt, int prot, int type,
-		unsigned long total_size)
+		unsigned long total_size, int numa_replication)
 {
 	unsigned long map_addr;
 	unsigned long size = eppnt->p_filesz + ELF_PAGEOFFSET(eppnt->p_vaddr);
 	unsigned long off = eppnt->p_offset - ELF_PAGEOFFSET(eppnt->p_vaddr);
 	addr = ELF_PAGESTART(addr);
 	size = ELF_PAGEALIGN(size);
+	int ret;
 
 	/* mmap() will return -EINVAL if given a zero size, but a
 	 * segment with zero filesize is perfectly valid */
@@ -385,11 +386,26 @@ static unsigned long elf_map(struct file *filep, unsigned long addr,
 	*/
 	if (total_size) {
 		total_size = ELF_PAGEALIGN(total_size);
-		map_addr = vm_mmap(filep, addr, total_size, prot, type, off);
+
+		if (numa_replication) {
+			/* Trigger the COW for this ELF code section */
+			map_addr = vm_mmap(filep, addr, total_size, prot | PROT_WRITE,
+						type | MAP_POPULATE, off);
+			if (!IS_ERR_VALUE(map_addr) && !(prot & PROT_WRITE)) {
+				/* Change back */
+				ret = do_mprotect_pkey(map_addr, total_size, prot, -1);
+				if (ret)
+					return ret;
+			}
+		} else {
+			map_addr = vm_mmap(filep, addr, total_size, prot, type, off);
+		}
+
 		if (!BAD_ADDR(map_addr))
 			vm_munmap(map_addr+size, total_size-size);
-	} else
+	} else {
 		map_addr = vm_mmap(filep, addr, size, prot, type, off);
+	}
 
 	if ((type & MAP_FIXED_NOREPLACE) &&
 	    PTR_ERR((void *)map_addr) == -EEXIST)
@@ -635,7 +651,7 @@ static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex,
 				load_addr = -vaddr;
 
 			map_addr = elf_map(interpreter, load_addr + vaddr,
-					eppnt, elf_prot, elf_type, total_size);
+					eppnt, elf_prot, elf_type, total_size, 0);
 			total_size = 0;
 			error = map_addr;
 			if (BAD_ADDR(map_addr))
@@ -1139,7 +1155,8 @@ static int load_elf_binary(struct linux_binprm *bprm)
 		}
 
 		error = elf_map(bprm->file, load_bias + vaddr, elf_ppnt,
-				elf_prot, elf_flags, total_size);
+				elf_prot, elf_flags, total_size,
+				bprm->support_numa_replication);
 		if (BAD_ADDR(error)) {
 			retval = IS_ERR((void *)error) ?
 				PTR_ERR((void*)error) : -EINVAL;
diff --git a/fs/exec.c b/fs/exec.c
index 38f63451b928..d27efa540641 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -900,7 +900,7 @@ static struct file *do_open_execat(int fd, struct filename *name, int flags)
 		.lookup_flags = LOOKUP_FOLLOW,
 	};
 
-	if ((flags & ~(AT_SYMLINK_NOFOLLOW | AT_EMPTY_PATH)) != 0)
+	if ((flags & ~(AT_SYMLINK_NOFOLLOW | AT_EMPTY_PATH | AT_NUMA_REPLICATION)) != 0)
 		return ERR_PTR(-EINVAL);
 	if (flags & AT_SYMLINK_NOFOLLOW)
 		open_exec_flags.lookup_flags &= ~LOOKUP_FOLLOW;
@@ -1828,6 +1828,9 @@ static int bprm_execve(struct linux_binprm *bprm,
 	if (retval)
 		goto out;
 
+	/* Do we support NUMA replication for this program? */
+	bprm->support_numa_replication = flags & AT_NUMA_REPLICATION;
+
 	retval = exec_binprm(bprm);
 	if (retval < 0)
 		goto out;
diff --git a/include/linux/binfmts.h b/include/linux/binfmts.h
index 049cf9421d83..1874e1732f20 100644
--- a/include/linux/binfmts.h
+++ b/include/linux/binfmts.h
@@ -64,6 +64,7 @@ struct linux_binprm {
 	struct rlimit rlim_stack; /* Saved RLIMIT_STACK used during exec. */
 
 	char buf[BINPRM_BUF_SIZE];
+	int support_numa_replication;
 } __randomize_layout;
 
 #define BINPRM_FLAGS_ENFORCE_NONDUMP_BIT 0
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 7ca22e6e694a..76611381be2a 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -3244,6 +3244,8 @@ unsigned long wp_shared_mapping_range(struct address_space *mapping,
 #endif
 
 extern int sysctl_nr_trim_pages;
+int do_mprotect_pkey(unsigned long start, size_t len,
+			unsigned long prot, int pkey);
 
 #ifdef CONFIG_PRINTK
 void mem_dump_obj(void *object);
diff --git a/include/uapi/linux/fcntl.h b/include/uapi/linux/fcntl.h
index 2f86b2ad6d7e..de99c5ae8eca 100644
--- a/include/uapi/linux/fcntl.h
+++ b/include/uapi/linux/fcntl.h
@@ -111,4 +111,6 @@
 
 #define AT_RECURSIVE		0x8000	/* Apply to the entire subtree */
 
+#define AT_NUMA_REPLICATION	0x10000	/* Support NUMA replication for the ELF program */
+
 #endif /* _UAPI_LINUX_FCNTL_H */
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 883e2cc85cad..d1f8cececfed 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -519,7 +519,7 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev,
 /*
  * pkey==-1 when doing a legacy mprotect()
  */
-static int do_mprotect_pkey(unsigned long start, size_t len,
+int do_mprotect_pkey(unsigned long start, size_t len,
 		unsigned long prot, int pkey)
 {
 	unsigned long nstart, end, tmp, reqprot;
-- 
2.30.2



^ permalink raw reply related	[flat|nested] 6+ messages in thread

* Re: [RFC PATCH] fs/exec: Add the support for ELF program's NUMA replication
  2021-09-09  9:46   ` Huang Shijie
@ 2021-09-09  8:58     ` David Hildenbrand
  0 siblings, 0 replies; 6+ messages in thread
From: David Hildenbrand @ 2021-09-09  8:58 UTC (permalink / raw)
  To: Huang Shijie
  Cc: viro, akpm, jlayton, bfields, torvalds, linux-fsdevel,
	linux-kernel, linux-mm, song.bao.hua, patches, zwang

On 09.09.21 11:46, Huang Shijie wrote:
> On Mon, Sep 06, 2021 at 11:35:01AM +0200, David Hildenbrand wrote:
>> On 06.09.21 18:16, Huang Shijie wrote:
>>> This patch adds AT_NUMA_REPLICATION for execveat().
>>>
>>> If this flag is set, the kernel will trigger COW(copy on write)
>>> on the mmapped ELF binary. So the program will have a copied-page
>>> on its NUMA node, even if the original page in page cache is
>>> on other NUMA nodes.
>>
>> Am I missing something important or is this just absolutely not what we
>> want?
> 
> Please see the thread:
> https://marc.info/?l=linux-kernel&m=163070220429222&w=2
> 
> Linus did not think it is a good choice to implement the "per-numa node page cache"

That doesn't make this approach any better.

I don't think we want this in the kernel. If user space wants to waste 
memory, it can happily mmap() however it wants. The advisory is to not 
do it.

-- 
Thanks,

David / dhildenb



^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [RFC PATCH] fs/exec: Add the support for ELF program's NUMA replication
  2021-09-06  9:35 ` David Hildenbrand
@ 2021-09-09  9:46   ` Huang Shijie
  2021-09-09  8:58     ` David Hildenbrand
  0 siblings, 1 reply; 6+ messages in thread
From: Huang Shijie @ 2021-09-09  9:46 UTC (permalink / raw)
  To: David Hildenbrand
  Cc: viro, akpm, jlayton, bfields, torvalds, linux-fsdevel,
	linux-kernel, linux-mm, song.bao.hua, patches, zwang

On Mon, Sep 06, 2021 at 11:35:01AM +0200, David Hildenbrand wrote:
> On 06.09.21 18:16, Huang Shijie wrote:
> > This patch adds AT_NUMA_REPLICATION for execveat().
> > 
> > If this flag is set, the kernel will trigger COW(copy on write)
> > on the mmapped ELF binary. So the program will have a copied-page
> > on its NUMA node, even if the original page in page cache is
> > on other NUMA nodes.
> 
> Am I missing something important or is this just absolutely not what we
> want?

Please see the thread:
https://marc.info/?l=linux-kernel&m=163070220429222&w=2

Linus did not think it is a good choice to implement the "per-numa node page cache"

> 
> This means that for each and every invocation of the binary, we'll COW the
> complete binary -- an awful lot of wasted main memory and swap.
Only the one who cares about the performance in NUMA will use this interface.
Most of the people never use it..


Thanks
Huang Shijie


^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [RFC PATCH] fs/exec: Add the support for ELF program's NUMA replication
  2021-09-06 12:09 ` Matthew Wilcox
@ 2021-09-09  9:48   ` Huang Shijie
  0 siblings, 0 replies; 6+ messages in thread
From: Huang Shijie @ 2021-09-09  9:48 UTC (permalink / raw)
  To: Matthew Wilcox
  Cc: viro, akpm, jlayton, bfields, torvalds, linux-fsdevel,
	linux-kernel, linux-mm, song.bao.hua, patches, zwang

On Mon, Sep 06, 2021 at 01:09:22PM +0100, Matthew Wilcox wrote:
> On Mon, Sep 06, 2021 at 04:16:13PM +0000, Huang Shijie wrote:
> > This patch adds AT_NUMA_REPLICATION for execveat().
> > 
> > If this flag is set, the kernel will trigger COW(copy on write)
> > on the mmapped ELF binary. So the program will have a copied-page
> > on its NUMA node, even if the original page in page cache is
> > on other NUMA nodes.
> 
> This does absolutely nothing for programs which run on multiple NUMA
> nodes at the same time.
Yes. Not suit for that case (Mysql database..).

> 
> Also, I dislike the abuse of the term "COW" for this.  It's COF --
> Copy On Fault.
Maybe it is better to add new flags "NUMA_REPLICATION" in mmap(). :)

Thanks
Huang Shijie


^ permalink raw reply	[flat|nested] 6+ messages in thread

end of thread, other threads:[~2021-09-09  8:58 UTC | newest]

Thread overview: 6+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-09-06 16:16 [RFC PATCH] fs/exec: Add the support for ELF program's NUMA replication Huang Shijie
2021-09-06  9:35 ` David Hildenbrand
2021-09-09  9:46   ` Huang Shijie
2021-09-09  8:58     ` David Hildenbrand
2021-09-06 12:09 ` Matthew Wilcox
2021-09-09  9:48   ` Huang Shijie

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).