All of lore.kernel.org
 help / color / mirror / Atom feed
* Re: [RFC PATCH] fs/exec: Add the support for ELF program's NUMA replication
  2021-09-06 16:16 [RFC PATCH] fs/exec: Add the support for ELF program's NUMA replication Huang Shijie
@ 2021-09-06  9:35 ` David Hildenbrand
  2021-09-09  9:46   ` Huang Shijie
  2021-09-06 12:09 ` Matthew Wilcox
  2021-09-09  1:43 ` kernel test robot
  2 siblings, 1 reply; 7+ messages in thread
From: David Hildenbrand @ 2021-09-06  9:35 UTC (permalink / raw)
  To: Huang Shijie, viro
  Cc: akpm, jlayton, bfields, torvalds, linux-fsdevel, linux-kernel,
	linux-mm, song.bao.hua, patches, zwang

On 06.09.21 18:16, Huang Shijie wrote:
> This patch adds AT_NUMA_REPLICATION for execveat().
> 
> If this flag is set, the kernel will trigger COW(copy on write)
> on the mmapped ELF binary. So the program will have a copied-page
> on its NUMA node, even if the original page in page cache is
> on other NUMA nodes.

Am I missing something important or is this just absolutely not what we 
want?

This means that for each and every invocation of the binary, we'll COW 
the complete binary -- an awful lot of wasted main memory and swap.

This is not a NUMA replication, this is en executable ELF code replication.

> 
> Signed-off-by: Huang Shijie <shijie@os.amperecomputing.com>
> ---
>   fs/binfmt_elf.c            | 27 ++++++++++++++++++++++-----
>   fs/exec.c                  |  5 ++++-
>   include/linux/binfmts.h    |  1 +
>   include/linux/mm.h         |  2 ++
>   include/uapi/linux/fcntl.h |  2 ++
>   mm/mprotect.c              |  2 +-
>   6 files changed, 32 insertions(+), 7 deletions(-)
> 
> diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
> index 439ed81e755a..fac8f4a4555a 100644
> --- a/fs/binfmt_elf.c
> +++ b/fs/binfmt_elf.c
> @@ -362,13 +362,14 @@ create_elf_tables(struct linux_binprm *bprm, const struct elfhdr *exec,
>   
>   static unsigned long elf_map(struct file *filep, unsigned long addr,
>   		const struct elf_phdr *eppnt, int prot, int type,
> -		unsigned long total_size)
> +		unsigned long total_size, int numa_replication)
>   {
>   	unsigned long map_addr;
>   	unsigned long size = eppnt->p_filesz + ELF_PAGEOFFSET(eppnt->p_vaddr);
>   	unsigned long off = eppnt->p_offset - ELF_PAGEOFFSET(eppnt->p_vaddr);
>   	addr = ELF_PAGESTART(addr);
>   	size = ELF_PAGEALIGN(size);
> +	int ret;
>   
>   	/* mmap() will return -EINVAL if given a zero size, but a
>   	 * segment with zero filesize is perfectly valid */
> @@ -385,11 +386,26 @@ static unsigned long elf_map(struct file *filep, unsigned long addr,
>   	*/
>   	if (total_size) {
>   		total_size = ELF_PAGEALIGN(total_size);
> -		map_addr = vm_mmap(filep, addr, total_size, prot, type, off);
> +
> +		if (numa_replication) {
> +			/* Trigger the COW for this ELF code section */
> +			map_addr = vm_mmap(filep, addr, total_size, prot | PROT_WRITE,
> +						type | MAP_POPULATE, off);
> +			if (!IS_ERR_VALUE(map_addr) && !(prot & PROT_WRITE)) {
> +				/* Change back */
> +				ret = do_mprotect_pkey(map_addr, total_size, prot, -1);
> +				if (ret)
> +					return ret;
> +			}
> +		} else {
> +			map_addr = vm_mmap(filep, addr, total_size, prot, type, off);
> +		}
> +
>   		if (!BAD_ADDR(map_addr))
>   			vm_munmap(map_addr+size, total_size-size);
> -	} else
> +	} else {
>   		map_addr = vm_mmap(filep, addr, size, prot, type, off);
> +	}
>   
>   	if ((type & MAP_FIXED_NOREPLACE) &&
>   	    PTR_ERR((void *)map_addr) == -EEXIST)
> @@ -635,7 +651,7 @@ static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex,
>   				load_addr = -vaddr;
>   
>   			map_addr = elf_map(interpreter, load_addr + vaddr,
> -					eppnt, elf_prot, elf_type, total_size);
> +					eppnt, elf_prot, elf_type, total_size, 0);
>   			total_size = 0;
>   			error = map_addr;
>   			if (BAD_ADDR(map_addr))
> @@ -1139,7 +1155,8 @@ static int load_elf_binary(struct linux_binprm *bprm)
>   		}
>   
>   		error = elf_map(bprm->file, load_bias + vaddr, elf_ppnt,
> -				elf_prot, elf_flags, total_size);
> +				elf_prot, elf_flags, total_size,
> +				bprm->support_numa_replication);
>   		if (BAD_ADDR(error)) {
>   			retval = IS_ERR((void *)error) ?
>   				PTR_ERR((void*)error) : -EINVAL;
> diff --git a/fs/exec.c b/fs/exec.c
> index 38f63451b928..d27efa540641 100644
> --- a/fs/exec.c
> +++ b/fs/exec.c
> @@ -900,7 +900,7 @@ static struct file *do_open_execat(int fd, struct filename *name, int flags)
>   		.lookup_flags = LOOKUP_FOLLOW,
>   	};
>   
> -	if ((flags & ~(AT_SYMLINK_NOFOLLOW | AT_EMPTY_PATH)) != 0)
> +	if ((flags & ~(AT_SYMLINK_NOFOLLOW | AT_EMPTY_PATH | AT_NUMA_REPLICATION)) != 0)
>   		return ERR_PTR(-EINVAL);
>   	if (flags & AT_SYMLINK_NOFOLLOW)
>   		open_exec_flags.lookup_flags &= ~LOOKUP_FOLLOW;
> @@ -1828,6 +1828,9 @@ static int bprm_execve(struct linux_binprm *bprm,
>   	if (retval)
>   		goto out;
>   
> +	/* Do we support NUMA replication for this program? */
> +	bprm->support_numa_replication = flags & AT_NUMA_REPLICATION;
> +
>   	retval = exec_binprm(bprm);
>   	if (retval < 0)
>   		goto out;
> diff --git a/include/linux/binfmts.h b/include/linux/binfmts.h
> index 049cf9421d83..1874e1732f20 100644
> --- a/include/linux/binfmts.h
> +++ b/include/linux/binfmts.h
> @@ -64,6 +64,7 @@ struct linux_binprm {
>   	struct rlimit rlim_stack; /* Saved RLIMIT_STACK used during exec. */
>   
>   	char buf[BINPRM_BUF_SIZE];
> +	int support_numa_replication;
>   } __randomize_layout;
>   
>   #define BINPRM_FLAGS_ENFORCE_NONDUMP_BIT 0
> diff --git a/include/linux/mm.h b/include/linux/mm.h
> index 7ca22e6e694a..76611381be2a 100644
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -3244,6 +3244,8 @@ unsigned long wp_shared_mapping_range(struct address_space *mapping,
>   #endif
>   
>   extern int sysctl_nr_trim_pages;
> +int do_mprotect_pkey(unsigned long start, size_t len,
> +			unsigned long prot, int pkey);
>   
>   #ifdef CONFIG_PRINTK
>   void mem_dump_obj(void *object);
> diff --git a/include/uapi/linux/fcntl.h b/include/uapi/linux/fcntl.h
> index 2f86b2ad6d7e..de99c5ae8eca 100644
> --- a/include/uapi/linux/fcntl.h
> +++ b/include/uapi/linux/fcntl.h
> @@ -111,4 +111,6 @@
>   
>   #define AT_RECURSIVE		0x8000	/* Apply to the entire subtree */
>   
> +#define AT_NUMA_REPLICATION	0x10000	/* Support NUMA replication for the ELF program */
> +
>   #endif /* _UAPI_LINUX_FCNTL_H */
> diff --git a/mm/mprotect.c b/mm/mprotect.c
> index 883e2cc85cad..d1f8cececfed 100644
> --- a/mm/mprotect.c
> +++ b/mm/mprotect.c
> @@ -519,7 +519,7 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev,
>   /*
>    * pkey==-1 when doing a legacy mprotect()
>    */
> -static int do_mprotect_pkey(unsigned long start, size_t len,
> +int do_mprotect_pkey(unsigned long start, size_t len,
>   		unsigned long prot, int pkey)
>   {
>   	unsigned long nstart, end, tmp, reqprot;
> 


-- 
Thanks,

David / dhildenb


^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [RFC PATCH] fs/exec: Add the support for ELF program's NUMA replication
  2021-09-06 16:16 [RFC PATCH] fs/exec: Add the support for ELF program's NUMA replication Huang Shijie
  2021-09-06  9:35 ` David Hildenbrand
@ 2021-09-06 12:09 ` Matthew Wilcox
  2021-09-09  9:48   ` Huang Shijie
  2021-09-09  1:43 ` kernel test robot
  2 siblings, 1 reply; 7+ messages in thread
From: Matthew Wilcox @ 2021-09-06 12:09 UTC (permalink / raw)
  To: Huang Shijie
  Cc: viro, akpm, jlayton, bfields, torvalds, linux-fsdevel,
	linux-kernel, linux-mm, song.bao.hua, patches, zwang

On Mon, Sep 06, 2021 at 04:16:13PM +0000, Huang Shijie wrote:
> This patch adds AT_NUMA_REPLICATION for execveat().
> 
> If this flag is set, the kernel will trigger COW(copy on write)
> on the mmapped ELF binary. So the program will have a copied-page
> on its NUMA node, even if the original page in page cache is
> on other NUMA nodes.

This does absolutely nothing for programs which run on multiple NUMA
nodes at the same time.

Also, I dislike the abuse of the term "COW" for this.  It's COF --
Copy On Fault.

^ permalink raw reply	[flat|nested] 7+ messages in thread

* [RFC PATCH] fs/exec: Add the support for ELF program's NUMA replication
@ 2021-09-06 16:16 Huang Shijie
  2021-09-06  9:35 ` David Hildenbrand
                   ` (2 more replies)
  0 siblings, 3 replies; 7+ messages in thread
From: Huang Shijie @ 2021-09-06 16:16 UTC (permalink / raw)
  To: viro
  Cc: akpm, jlayton, bfields, torvalds, linux-fsdevel, linux-kernel,
	linux-mm, song.bao.hua, patches, zwang, Huang Shijie

This patch adds AT_NUMA_REPLICATION for execveat().

If this flag is set, the kernel will trigger COW(copy on write)
on the mmapped ELF binary. So the program will have a copied-page
on its NUMA node, even if the original page in page cache is
on other NUMA nodes.

Signed-off-by: Huang Shijie <shijie@os.amperecomputing.com>
---
 fs/binfmt_elf.c            | 27 ++++++++++++++++++++++-----
 fs/exec.c                  |  5 ++++-
 include/linux/binfmts.h    |  1 +
 include/linux/mm.h         |  2 ++
 include/uapi/linux/fcntl.h |  2 ++
 mm/mprotect.c              |  2 +-
 6 files changed, 32 insertions(+), 7 deletions(-)

diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 439ed81e755a..fac8f4a4555a 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -362,13 +362,14 @@ create_elf_tables(struct linux_binprm *bprm, const struct elfhdr *exec,
 
 static unsigned long elf_map(struct file *filep, unsigned long addr,
 		const struct elf_phdr *eppnt, int prot, int type,
-		unsigned long total_size)
+		unsigned long total_size, int numa_replication)
 {
 	unsigned long map_addr;
 	unsigned long size = eppnt->p_filesz + ELF_PAGEOFFSET(eppnt->p_vaddr);
 	unsigned long off = eppnt->p_offset - ELF_PAGEOFFSET(eppnt->p_vaddr);
 	addr = ELF_PAGESTART(addr);
 	size = ELF_PAGEALIGN(size);
+	int ret;
 
 	/* mmap() will return -EINVAL if given a zero size, but a
 	 * segment with zero filesize is perfectly valid */
@@ -385,11 +386,26 @@ static unsigned long elf_map(struct file *filep, unsigned long addr,
 	*/
 	if (total_size) {
 		total_size = ELF_PAGEALIGN(total_size);
-		map_addr = vm_mmap(filep, addr, total_size, prot, type, off);
+
+		if (numa_replication) {
+			/* Trigger the COW for this ELF code section */
+			map_addr = vm_mmap(filep, addr, total_size, prot | PROT_WRITE,
+						type | MAP_POPULATE, off);
+			if (!IS_ERR_VALUE(map_addr) && !(prot & PROT_WRITE)) {
+				/* Change back */
+				ret = do_mprotect_pkey(map_addr, total_size, prot, -1);
+				if (ret)
+					return ret;
+			}
+		} else {
+			map_addr = vm_mmap(filep, addr, total_size, prot, type, off);
+		}
+
 		if (!BAD_ADDR(map_addr))
 			vm_munmap(map_addr+size, total_size-size);
-	} else
+	} else {
 		map_addr = vm_mmap(filep, addr, size, prot, type, off);
+	}
 
 	if ((type & MAP_FIXED_NOREPLACE) &&
 	    PTR_ERR((void *)map_addr) == -EEXIST)
@@ -635,7 +651,7 @@ static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex,
 				load_addr = -vaddr;
 
 			map_addr = elf_map(interpreter, load_addr + vaddr,
-					eppnt, elf_prot, elf_type, total_size);
+					eppnt, elf_prot, elf_type, total_size, 0);
 			total_size = 0;
 			error = map_addr;
 			if (BAD_ADDR(map_addr))
@@ -1139,7 +1155,8 @@ static int load_elf_binary(struct linux_binprm *bprm)
 		}
 
 		error = elf_map(bprm->file, load_bias + vaddr, elf_ppnt,
-				elf_prot, elf_flags, total_size);
+				elf_prot, elf_flags, total_size,
+				bprm->support_numa_replication);
 		if (BAD_ADDR(error)) {
 			retval = IS_ERR((void *)error) ?
 				PTR_ERR((void*)error) : -EINVAL;
diff --git a/fs/exec.c b/fs/exec.c
index 38f63451b928..d27efa540641 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -900,7 +900,7 @@ static struct file *do_open_execat(int fd, struct filename *name, int flags)
 		.lookup_flags = LOOKUP_FOLLOW,
 	};
 
-	if ((flags & ~(AT_SYMLINK_NOFOLLOW | AT_EMPTY_PATH)) != 0)
+	if ((flags & ~(AT_SYMLINK_NOFOLLOW | AT_EMPTY_PATH | AT_NUMA_REPLICATION)) != 0)
 		return ERR_PTR(-EINVAL);
 	if (flags & AT_SYMLINK_NOFOLLOW)
 		open_exec_flags.lookup_flags &= ~LOOKUP_FOLLOW;
@@ -1828,6 +1828,9 @@ static int bprm_execve(struct linux_binprm *bprm,
 	if (retval)
 		goto out;
 
+	/* Do we support NUMA replication for this program? */
+	bprm->support_numa_replication = flags & AT_NUMA_REPLICATION;
+
 	retval = exec_binprm(bprm);
 	if (retval < 0)
 		goto out;
diff --git a/include/linux/binfmts.h b/include/linux/binfmts.h
index 049cf9421d83..1874e1732f20 100644
--- a/include/linux/binfmts.h
+++ b/include/linux/binfmts.h
@@ -64,6 +64,7 @@ struct linux_binprm {
 	struct rlimit rlim_stack; /* Saved RLIMIT_STACK used during exec. */
 
 	char buf[BINPRM_BUF_SIZE];
+	int support_numa_replication;
 } __randomize_layout;
 
 #define BINPRM_FLAGS_ENFORCE_NONDUMP_BIT 0
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 7ca22e6e694a..76611381be2a 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -3244,6 +3244,8 @@ unsigned long wp_shared_mapping_range(struct address_space *mapping,
 #endif
 
 extern int sysctl_nr_trim_pages;
+int do_mprotect_pkey(unsigned long start, size_t len,
+			unsigned long prot, int pkey);
 
 #ifdef CONFIG_PRINTK
 void mem_dump_obj(void *object);
diff --git a/include/uapi/linux/fcntl.h b/include/uapi/linux/fcntl.h
index 2f86b2ad6d7e..de99c5ae8eca 100644
--- a/include/uapi/linux/fcntl.h
+++ b/include/uapi/linux/fcntl.h
@@ -111,4 +111,6 @@
 
 #define AT_RECURSIVE		0x8000	/* Apply to the entire subtree */
 
+#define AT_NUMA_REPLICATION	0x10000	/* Support NUMA replication for the ELF program */
+
 #endif /* _UAPI_LINUX_FCNTL_H */
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 883e2cc85cad..d1f8cececfed 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -519,7 +519,7 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev,
 /*
  * pkey==-1 when doing a legacy mprotect()
  */
-static int do_mprotect_pkey(unsigned long start, size_t len,
+int do_mprotect_pkey(unsigned long start, size_t len,
 		unsigned long prot, int pkey)
 {
 	unsigned long nstart, end, tmp, reqprot;
-- 
2.30.2


^ permalink raw reply related	[flat|nested] 7+ messages in thread

* Re: [RFC PATCH] fs/exec: Add the support for ELF program's NUMA replication
  2021-09-06 16:16 [RFC PATCH] fs/exec: Add the support for ELF program's NUMA replication Huang Shijie
  2021-09-06  9:35 ` David Hildenbrand
  2021-09-06 12:09 ` Matthew Wilcox
@ 2021-09-09  1:43 ` kernel test robot
  2 siblings, 0 replies; 7+ messages in thread
From: kernel test robot @ 2021-09-09  1:43 UTC (permalink / raw)
  To: kbuild-all

[-- Attachment #1: Type: text/plain, Size: 4202 bytes --]

Hi Huang,

[FYI, it's a private test report for your RFC patch.]
[auto build test WARNING on hnaz-linux-mm/master]
[also build test WARNING on linux/master v5.14]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch]

url:    https://github.com/0day-ci/linux/commits/Huang-Shijie/fs-exec-Add-the-support-for-ELF-program-s-NUMA-replication/20210906-162119
base:   https://github.com/hnaz/linux-mm master
config: parisc-randconfig-s031-20210908 (attached as .config)
compiler: hppa-linux-gcc (GCC) 11.2.0
reproduce:
        wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # apt-get install sparse
        # sparse version: v0.6.4-dirty
        # https://github.com/0day-ci/linux/commit/b3c6ac3aa420536e234f841ab5196349b5624dcb
        git remote add linux-review https://github.com/0day-ci/linux
        git fetch --no-tags linux-review Huang-Shijie/fs-exec-Add-the-support-for-ELF-program-s-NUMA-replication/20210906-162119
        git checkout b3c6ac3aa420536e234f841ab5196349b5624dcb
        # save the attached .config to linux build tree
        COMPILER_INSTALL_PATH=$HOME/0day COMPILER=gcc-11.2.0 make.cross C=1 CF='-fdiagnostic-prefix -D__CHECK_ENDIAN__' ARCH=parisc 

If you fix the issue, kindly add following tag as appropriate
Reported-by: kernel test robot <lkp@intel.com>

All warnings (new ones prefixed by >>):

   fs/binfmt_elf.c: In function 'elf_map':
>> fs/binfmt_elf.c:372:9: warning: ISO C90 forbids mixed declarations and code [-Wdeclaration-after-statement]
     372 |         int ret;
         |         ^~~


vim +372 fs/binfmt_elf.c

   362	
   363	static unsigned long elf_map(struct file *filep, unsigned long addr,
   364			const struct elf_phdr *eppnt, int prot, int type,
   365			unsigned long total_size, int numa_replication)
   366	{
   367		unsigned long map_addr;
   368		unsigned long size = eppnt->p_filesz + ELF_PAGEOFFSET(eppnt->p_vaddr);
   369		unsigned long off = eppnt->p_offset - ELF_PAGEOFFSET(eppnt->p_vaddr);
   370		addr = ELF_PAGESTART(addr);
   371		size = ELF_PAGEALIGN(size);
 > 372		int ret;
   373	
   374		/* mmap() will return -EINVAL if given a zero size, but a
   375		 * segment with zero filesize is perfectly valid */
   376		if (!size)
   377			return addr;
   378	
   379		/*
   380		* total_size is the size of the ELF (interpreter) image.
   381		* The _first_ mmap needs to know the full size, otherwise
   382		* randomization might put this image into an overlapping
   383		* position with the ELF binary image. (since size < total_size)
   384		* So we first map the 'big' image - and unmap the remainder at
   385		* the end. (which unmap is needed for ELF images with holes.)
   386		*/
   387		if (total_size) {
   388			total_size = ELF_PAGEALIGN(total_size);
   389	
   390			if (numa_replication) {
   391				/* Trigger the COW for this ELF code section */
   392				map_addr = vm_mmap(filep, addr, total_size, prot | PROT_WRITE,
   393							type | MAP_POPULATE, off);
   394				if (!IS_ERR_VALUE(map_addr) && !(prot & PROT_WRITE)) {
   395					/* Change back */
   396					ret = do_mprotect_pkey(map_addr, total_size, prot, -1);
   397					if (ret)
   398						return ret;
   399				}
   400			} else {
   401				map_addr = vm_mmap(filep, addr, total_size, prot, type, off);
   402			}
   403	
   404			if (!BAD_ADDR(map_addr))
   405				vm_munmap(map_addr+size, total_size-size);
   406		} else {
   407			map_addr = vm_mmap(filep, addr, size, prot, type, off);
   408		}
   409	
   410		if ((type & MAP_FIXED_NOREPLACE) &&
   411		    PTR_ERR((void *)map_addr) == -EEXIST)
   412			pr_info("%d (%s): Uhuuh, elf segment@%px requested but the memory is mapped already\n",
   413				task_pid_nr(current), current->comm, (void *)addr);
   414	
   415		return(map_addr);
   416	}
   417	

---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/kbuild-all(a)lists.01.org

[-- Attachment #2: config.gz --]
[-- Type: application/gzip, Size: 28622 bytes --]

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [RFC PATCH] fs/exec: Add the support for ELF program's NUMA replication
  2021-09-09  9:46   ` Huang Shijie
@ 2021-09-09  8:58     ` David Hildenbrand
  0 siblings, 0 replies; 7+ messages in thread
From: David Hildenbrand @ 2021-09-09  8:58 UTC (permalink / raw)
  To: Huang Shijie
  Cc: viro, akpm, jlayton, bfields, torvalds, linux-fsdevel,
	linux-kernel, linux-mm, song.bao.hua, patches, zwang

On 09.09.21 11:46, Huang Shijie wrote:
> On Mon, Sep 06, 2021 at 11:35:01AM +0200, David Hildenbrand wrote:
>> On 06.09.21 18:16, Huang Shijie wrote:
>>> This patch adds AT_NUMA_REPLICATION for execveat().
>>>
>>> If this flag is set, the kernel will trigger COW(copy on write)
>>> on the mmapped ELF binary. So the program will have a copied-page
>>> on its NUMA node, even if the original page in page cache is
>>> on other NUMA nodes.
>>
>> Am I missing something important or is this just absolutely not what we
>> want?
> 
> Please see the thread:
> https://marc.info/?l=linux-kernel&m=163070220429222&w=2
> 
> Linus did not think it is a good choice to implement the "per-numa node page cache"

That doesn't make this approach any better.

I don't think we want this in the kernel. If user space wants to waste 
memory, it can happily mmap() however it wants. The advisory is to not 
do it.

-- 
Thanks,

David / dhildenb


^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [RFC PATCH] fs/exec: Add the support for ELF program's NUMA replication
  2021-09-06  9:35 ` David Hildenbrand
@ 2021-09-09  9:46   ` Huang Shijie
  2021-09-09  8:58     ` David Hildenbrand
  0 siblings, 1 reply; 7+ messages in thread
From: Huang Shijie @ 2021-09-09  9:46 UTC (permalink / raw)
  To: David Hildenbrand
  Cc: viro, akpm, jlayton, bfields, torvalds, linux-fsdevel,
	linux-kernel, linux-mm, song.bao.hua, patches, zwang

On Mon, Sep 06, 2021 at 11:35:01AM +0200, David Hildenbrand wrote:
> On 06.09.21 18:16, Huang Shijie wrote:
> > This patch adds AT_NUMA_REPLICATION for execveat().
> > 
> > If this flag is set, the kernel will trigger COW(copy on write)
> > on the mmapped ELF binary. So the program will have a copied-page
> > on its NUMA node, even if the original page in page cache is
> > on other NUMA nodes.
> 
> Am I missing something important or is this just absolutely not what we
> want?

Please see the thread:
https://marc.info/?l=linux-kernel&m=163070220429222&w=2

Linus did not think it is a good choice to implement the "per-numa node page cache"

> 
> This means that for each and every invocation of the binary, we'll COW the
> complete binary -- an awful lot of wasted main memory and swap.
Only the one who cares about the performance in NUMA will use this interface.
Most of the people never use it..


Thanks
Huang Shijie

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [RFC PATCH] fs/exec: Add the support for ELF program's NUMA replication
  2021-09-06 12:09 ` Matthew Wilcox
@ 2021-09-09  9:48   ` Huang Shijie
  0 siblings, 0 replies; 7+ messages in thread
From: Huang Shijie @ 2021-09-09  9:48 UTC (permalink / raw)
  To: Matthew Wilcox
  Cc: viro, akpm, jlayton, bfields, torvalds, linux-fsdevel,
	linux-kernel, linux-mm, song.bao.hua, patches, zwang

On Mon, Sep 06, 2021 at 01:09:22PM +0100, Matthew Wilcox wrote:
> On Mon, Sep 06, 2021 at 04:16:13PM +0000, Huang Shijie wrote:
> > This patch adds AT_NUMA_REPLICATION for execveat().
> > 
> > If this flag is set, the kernel will trigger COW(copy on write)
> > on the mmapped ELF binary. So the program will have a copied-page
> > on its NUMA node, even if the original page in page cache is
> > on other NUMA nodes.
> 
> This does absolutely nothing for programs which run on multiple NUMA
> nodes at the same time.
Yes. Not suit for that case (Mysql database..).

> 
> Also, I dislike the abuse of the term "COW" for this.  It's COF --
> Copy On Fault.
Maybe it is better to add new flags "NUMA_REPLICATION" in mmap(). :)

Thanks
Huang Shijie

^ permalink raw reply	[flat|nested] 7+ messages in thread

end of thread, other threads:[~2021-09-09  8:58 UTC | newest]

Thread overview: 7+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-09-06 16:16 [RFC PATCH] fs/exec: Add the support for ELF program's NUMA replication Huang Shijie
2021-09-06  9:35 ` David Hildenbrand
2021-09-09  9:46   ` Huang Shijie
2021-09-09  8:58     ` David Hildenbrand
2021-09-06 12:09 ` Matthew Wilcox
2021-09-09  9:48   ` Huang Shijie
2021-09-09  1:43 ` kernel test robot

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.