On x86, 5-level paging enables 56-bit userspace virtual address space. Not all user space is ready to handle wide addresses. It's known that at least some JIT compilers use higher bits in pointers to encode their information. It collides with valid pointers with 5-level paging and leads to crashes. To mitigate this, we are not going to allocate virtual address space above 47-bit by default. But userspace can ask for allocation from full address space by specifying hint address (with or without MAP_FIXED) above 47-bits. If hint address set above 47-bit, but MAP_FIXED is not specified, we try to look for unmapped area by specified address. If it's already occupied, we look for unmapped area in *full* address space, rather than from 47-bit window. A high hint address would only affect the allocation in question, but not any future mmap()s. Specifying high hint address on older kernel or on machine without 5-level paging support is safe. The hint will be ignored and kernel will fall back to allocation from 47-bit address space. This approach helps to easily make application's memory allocator aware about large address space without manually tracking allocated virtual address space. One important case we need to handle here is interaction with MPX. MPX (without MAWA( extension cannot handle addresses above 47-bit, so we need to make sure that MPX cannot be enabled we already have VMA above the boundary and forbid creating such VMAs once MPX is enabled. Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com> Reviewed-by: Dmitry Safonov <dsafonov@virtuozzo.com> Cc: linux-api@vger.kernel.org --- arch/x86/include/asm/elf.h | 4 ++-- arch/x86/include/asm/mpx.h | 9 +++++++++ arch/x86/include/asm/processor.h | 11 ++++++++--- arch/x86/kernel/sys_x86_64.c | 30 ++++++++++++++++++++++++++---- arch/x86/mm/hugetlbpage.c | 27 +++++++++++++++++++++++---- arch/x86/mm/mmap.c | 6 +++--- arch/x86/mm/mpx.c | 33 ++++++++++++++++++++++++++++++++- 7 files changed, 103 insertions(+), 17 deletions(-) diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h index e8ab9a46bc68..7a30513a4046 100644 --- a/arch/x86/include/asm/elf.h +++ b/arch/x86/include/asm/elf.h @@ -250,7 +250,7 @@ extern int force_personality32; the loader. We need to make sure that it is out of the way of the program that it will "exec", and that there is sufficient room for the brk. */ -#define ELF_ET_DYN_BASE (TASK_SIZE / 3 * 2) +#define ELF_ET_DYN_BASE (TASK_SIZE_LOW / 3 * 2) /* This yields a mask that user programs can use to figure out what instruction set this CPU supports. This could be done in user space, @@ -304,7 +304,7 @@ static inline int mmap_is_ia32(void) } extern unsigned long tasksize_32bit(void); -extern unsigned long tasksize_64bit(void); +extern unsigned long tasksize_64bit(int full_addr_space); extern unsigned long get_mmap_base(int is_legacy); #ifdef CONFIG_X86_32 diff --git a/arch/x86/include/asm/mpx.h b/arch/x86/include/asm/mpx.h index a0d662be4c5b..7d7404756bb4 100644 --- a/arch/x86/include/asm/mpx.h +++ b/arch/x86/include/asm/mpx.h @@ -73,6 +73,9 @@ static inline void mpx_mm_init(struct mm_struct *mm) } void mpx_notify_unmap(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long start, unsigned long end); + +unsigned long mpx_unmapped_area_check(unsigned long addr, unsigned long len, + unsigned long flags); #else static inline siginfo_t *mpx_generate_siginfo(struct pt_regs *regs) { @@ -94,6 +97,12 @@ static inline void mpx_notify_unmap(struct mm_struct *mm, unsigned long start, unsigned long end) { } + +static inline unsigned long mpx_unmapped_area_check(unsigned long addr, + unsigned long len, unsigned long flags) +{ + return addr; +} #endif /* CONFIG_X86_INTEL_MPX */ #endif /* _ASM_X86_MPX_H */ diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 3cada998a402..aaed58b03ddb 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -795,6 +795,7 @@ static inline void spin_lock_prefetch(const void *x) #define IA32_PAGE_OFFSET PAGE_OFFSET #define TASK_SIZE PAGE_OFFSET #define TASK_SIZE_MAX TASK_SIZE +#define DEFAULT_MAP_WINDOW TASK_SIZE #define STACK_TOP TASK_SIZE #define STACK_TOP_MAX STACK_TOP @@ -834,7 +835,9 @@ static inline void spin_lock_prefetch(const void *x) * particular problem by preventing anything from being mapped * at the maximum canonical address. */ -#define TASK_SIZE_MAX ((1UL << 47) - PAGE_SIZE) +#define TASK_SIZE_MAX ((1UL << __VIRTUAL_MASK_SHIFT) - PAGE_SIZE) + +#define DEFAULT_MAP_WINDOW ((1UL << 47) - PAGE_SIZE) /* This decides where the kernel will search for a free chunk of vm * space during mmap's. @@ -842,12 +845,14 @@ static inline void spin_lock_prefetch(const void *x) #define IA32_PAGE_OFFSET ((current->personality & ADDR_LIMIT_3GB) ? \ 0xc0000000 : 0xFFFFe000) +#define TASK_SIZE_LOW (test_thread_flag(TIF_ADDR32) ? \ + IA32_PAGE_OFFSET : DEFAULT_MAP_WINDOW) #define TASK_SIZE (test_thread_flag(TIF_ADDR32) ? \ IA32_PAGE_OFFSET : TASK_SIZE_MAX) #define TASK_SIZE_OF(child) ((test_tsk_thread_flag(child, TIF_ADDR32)) ? \ IA32_PAGE_OFFSET : TASK_SIZE_MAX) -#define STACK_TOP TASK_SIZE +#define STACK_TOP TASK_SIZE_LOW #define STACK_TOP_MAX TASK_SIZE_MAX #define INIT_THREAD { \ @@ -870,7 +875,7 @@ extern void start_thread(struct pt_regs *regs, unsigned long new_ip, * space during mmap's. */ #define __TASK_UNMAPPED_BASE(task_size) (PAGE_ALIGN(task_size / 3)) -#define TASK_UNMAPPED_BASE __TASK_UNMAPPED_BASE(TASK_SIZE) +#define TASK_UNMAPPED_BASE __TASK_UNMAPPED_BASE(TASK_SIZE_LOW) #define KSTK_EIP(task) (task_pt_regs(task)->ip) diff --git a/arch/x86/kernel/sys_x86_64.c b/arch/x86/kernel/sys_x86_64.c index 207b8f2582c7..74d1587b181d 100644 --- a/arch/x86/kernel/sys_x86_64.c +++ b/arch/x86/kernel/sys_x86_64.c @@ -21,6 +21,7 @@ #include <asm/compat.h> #include <asm/ia32.h> #include <asm/syscalls.h> +#include <asm/mpx.h> /* * Align a virtual address to avoid aliasing in the I$ on AMD F15h. @@ -100,8 +101,8 @@ SYSCALL_DEFINE6(mmap, unsigned long, addr, unsigned long, len, return error; } -static void find_start_end(unsigned long flags, unsigned long *begin, - unsigned long *end) +static void find_start_end(unsigned long addr, unsigned long flags, + unsigned long *begin, unsigned long *end) { if (!in_compat_syscall() && (flags & MAP_32BIT)) { /* This is usually used needed to map code in small @@ -120,7 +121,10 @@ static void find_start_end(unsigned long flags, unsigned long *begin, } *begin = get_mmap_base(1); - *end = in_compat_syscall() ? tasksize_32bit() : tasksize_64bit(); + if (in_compat_syscall()) + *end = tasksize_32bit(); + else + *end = tasksize_64bit(addr > DEFAULT_MAP_WINDOW); } unsigned long @@ -132,10 +136,14 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr, struct vm_unmapped_area_info info; unsigned long begin, end; + addr = mpx_unmapped_area_check(addr, len, flags); + if (IS_ERR_VALUE(addr)) + return addr; + if (flags & MAP_FIXED) return addr; - find_start_end(flags, &begin, &end); + find_start_end(addr, flags, &begin, &end); if (len > end) return -ENOMEM; @@ -171,6 +179,10 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, unsigned long addr = addr0; struct vm_unmapped_area_info info; + addr = mpx_unmapped_area_check(addr, len, flags); + if (IS_ERR_VALUE(addr)) + return addr; + /* requested length too big for entire address space */ if (len > TASK_SIZE) return -ENOMEM; @@ -195,6 +207,16 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, info.length = len; info.low_limit = PAGE_SIZE; info.high_limit = get_mmap_base(0); + + /* + * If hint address is above DEFAULT_MAP_WINDOW, look for unmapped area + * in the full address space. + * + * !in_compat_syscall() check to avoid high addresses for x32. + */ + if (addr > DEFAULT_MAP_WINDOW && !in_compat_syscall()) + info.high_limit += TASK_SIZE_MAX - DEFAULT_MAP_WINDOW; + info.align_mask = 0; info.align_offset = pgoff << PAGE_SHIFT; if (filp) { diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c index 302f43fd9c28..730f00250acb 100644 --- a/arch/x86/mm/hugetlbpage.c +++ b/arch/x86/mm/hugetlbpage.c @@ -18,6 +18,7 @@ #include <asm/tlbflush.h> #include <asm/pgalloc.h> #include <asm/elf.h> +#include <asm/mpx.h> #if 0 /* This is just for testing */ struct page * @@ -85,25 +86,38 @@ static unsigned long hugetlb_get_unmapped_area_bottomup(struct file *file, info.flags = 0; info.length = len; info.low_limit = get_mmap_base(1); + + /* + * If hint address is above DEFAULT_MAP_WINDOW, look for unmapped area + * in the full address space. + */ info.high_limit = in_compat_syscall() ? - tasksize_32bit() : tasksize_64bit(); + tasksize_32bit() : tasksize_64bit(addr > DEFAULT_MAP_WINDOW); + info.align_mask = PAGE_MASK & ~huge_page_mask(h); info.align_offset = 0; return vm_unmapped_area(&info); } static unsigned long hugetlb_get_unmapped_area_topdown(struct file *file, - unsigned long addr0, unsigned long len, + unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags) { struct hstate *h = hstate_file(file); struct vm_unmapped_area_info info; - unsigned long addr; info.flags = VM_UNMAPPED_AREA_TOPDOWN; info.length = len; info.low_limit = PAGE_SIZE; info.high_limit = get_mmap_base(0); + + /* + * If hint address is above DEFAULT_MAP_WINDOW, look for unmapped area + * in the full address space. + */ + if (addr > DEFAULT_MAP_WINDOW && !in_compat_syscall()) + info.high_limit += TASK_SIZE_MAX - DEFAULT_MAP_WINDOW; + info.align_mask = PAGE_MASK & ~huge_page_mask(h); info.align_offset = 0; addr = vm_unmapped_area(&info); @@ -118,7 +132,7 @@ static unsigned long hugetlb_get_unmapped_area_topdown(struct file *file, VM_BUG_ON(addr != -ENOMEM); info.flags = 0; info.low_limit = TASK_UNMAPPED_BASE; - info.high_limit = TASK_SIZE; + info.high_limit = TASK_SIZE_LOW; addr = vm_unmapped_area(&info); } @@ -135,6 +149,11 @@ hugetlb_get_unmapped_area(struct file *file, unsigned long addr, if (len & ~huge_page_mask(h)) return -EINVAL; + + addr = mpx_unmapped_area_check(addr, len, flags); + if (IS_ERR_VALUE(addr)) + return addr; + if (len > TASK_SIZE) return -ENOMEM; diff --git a/arch/x86/mm/mmap.c b/arch/x86/mm/mmap.c index 19ad095b41df..199050249d60 100644 --- a/arch/x86/mm/mmap.c +++ b/arch/x86/mm/mmap.c @@ -42,9 +42,9 @@ unsigned long tasksize_32bit(void) return IA32_PAGE_OFFSET; } -unsigned long tasksize_64bit(void) +unsigned long tasksize_64bit(int full_addr_space) { - return TASK_SIZE_MAX; + return full_addr_space ? TASK_SIZE_MAX : DEFAULT_MAP_WINDOW; } static unsigned long stack_maxrandom_size(unsigned long task_size) @@ -140,7 +140,7 @@ void arch_pick_mmap_layout(struct mm_struct *mm) mm->get_unmapped_area = arch_get_unmapped_area_topdown; arch_pick_mmap_base(&mm->mmap_base, &mm->mmap_legacy_base, - arch_rnd(mmap64_rnd_bits), tasksize_64bit()); + arch_rnd(mmap64_rnd_bits), tasksize_64bit(0)); #ifdef CONFIG_HAVE_ARCH_COMPAT_MMAP_BASES /* diff --git a/arch/x86/mm/mpx.c b/arch/x86/mm/mpx.c index 1c34b767c84c..8c8da27e8549 100644 --- a/arch/x86/mm/mpx.c +++ b/arch/x86/mm/mpx.c @@ -355,10 +355,19 @@ int mpx_enable_management(void) */ bd_base = mpx_get_bounds_dir(); down_write(&mm->mmap_sem); + + /* MPX doesn't support addresses above 47-bits yet. */ + if (find_vma(mm, DEFAULT_MAP_WINDOW)) { + pr_warn_once("%s (%d): MPX cannot handle addresses " + "above 47-bits. Disabling.", + current->comm, current->pid); + ret = -ENXIO; + goto out; + } mm->context.bd_addr = bd_base; if (mm->context.bd_addr == MPX_INVALID_BOUNDS_DIR) ret = -ENXIO; - +out: up_write(&mm->mmap_sem); return ret; } @@ -1030,3 +1039,25 @@ void mpx_notify_unmap(struct mm_struct *mm, struct vm_area_struct *vma, if (ret) force_sig(SIGSEGV, current); } + +/* MPX cannot handle addresses above 47-bits yet. */ +unsigned long mpx_unmapped_area_check(unsigned long addr, unsigned long len, + unsigned long flags) +{ + if (!kernel_managing_mpx_tables(current->mm)) + return addr; + if (addr + len <= DEFAULT_MAP_WINDOW) + return addr; + if (flags & MAP_FIXED) + return -ENOMEM; + + /* + * Requested len is larger than whole area we're allowed to map in. + * Resetting hinting address wouldn't do much good -- fail early. + */ + if (len > DEFAULT_MAP_WINDOW) + return -ENOMEM; + + /* Look for unmap area within DEFAULT_MAP_WINDOW */ + return 0; +} -- 2.11.0
[-- Attachment #1: Type: text/plain, Size: 5255 bytes --] Hi Kirill, [auto build test ERROR on linus/master] [also build test ERROR on v4.12-rc1 next-20170515] [cannot apply to tip/x86/core xen-tip/linux-next] [if your patch is applied to the wrong git tree, please drop us a note to help improve the system] url: https://github.com/0day-ci/linux/commits/Kirill-A-Shutemov/x86-5-level-paging-enabling-for-v4-12-Part-4/20170515-202736 config: i386-defconfig (attached as .config) compiler: gcc-6 (Debian 6.2.0-3) 6.2.0 20160901 reproduce: # save the attached .config to linux build tree make ARCH=i386 All error/warnings (new ones prefixed by >>): In file included from include/linux/cache.h:4:0, from include/linux/printk.h:8, from include/linux/kernel.h:13, from mm/mmap.c:11: mm/mmap.c: In function 'arch_get_unmapped_area_topdown': arch/x86/include/asm/processor.h:878:50: error: 'TASK_SIZE_LOW' undeclared (first use in this function) #define TASK_UNMAPPED_BASE __TASK_UNMAPPED_BASE(TASK_SIZE_LOW) ^ include/uapi/linux/kernel.h:10:41: note: in definition of macro '__ALIGN_KERNEL_MASK' #define __ALIGN_KERNEL_MASK(x, mask) (((x) + (mask)) & ~(mask)) ^ include/linux/kernel.h:49:22: note: in expansion of macro '__ALIGN_KERNEL' #define ALIGN(x, a) __ALIGN_KERNEL((x), (a)) ^~~~~~~~~~~~~~ include/linux/mm.h:132:26: note: in expansion of macro 'ALIGN' #define PAGE_ALIGN(addr) ALIGN(addr, PAGE_SIZE) ^~~~~ arch/x86/include/asm/processor.h:877:42: note: in expansion of macro 'PAGE_ALIGN' #define __TASK_UNMAPPED_BASE(task_size) (PAGE_ALIGN(task_size / 3)) ^~~~~~~~~~ arch/x86/include/asm/processor.h:878:29: note: in expansion of macro '__TASK_UNMAPPED_BASE' #define TASK_UNMAPPED_BASE __TASK_UNMAPPED_BASE(TASK_SIZE_LOW) ^~~~~~~~~~~~~~~~~~~~ >> mm/mmap.c:2043:20: note: in expansion of macro 'TASK_UNMAPPED_BASE' info.low_limit = TASK_UNMAPPED_BASE; ^~~~~~~~~~~~~~~~~~ arch/x86/include/asm/processor.h:878:50: note: each undeclared identifier is reported only once for each function it appears in #define TASK_UNMAPPED_BASE __TASK_UNMAPPED_BASE(TASK_SIZE_LOW) ^ include/uapi/linux/kernel.h:10:41: note: in definition of macro '__ALIGN_KERNEL_MASK' #define __ALIGN_KERNEL_MASK(x, mask) (((x) + (mask)) & ~(mask)) ^ include/linux/kernel.h:49:22: note: in expansion of macro '__ALIGN_KERNEL' #define ALIGN(x, a) __ALIGN_KERNEL((x), (a)) ^~~~~~~~~~~~~~ include/linux/mm.h:132:26: note: in expansion of macro 'ALIGN' #define PAGE_ALIGN(addr) ALIGN(addr, PAGE_SIZE) ^~~~~ arch/x86/include/asm/processor.h:877:42: note: in expansion of macro 'PAGE_ALIGN' #define __TASK_UNMAPPED_BASE(task_size) (PAGE_ALIGN(task_size / 3)) ^~~~~~~~~~ arch/x86/include/asm/processor.h:878:29: note: in expansion of macro '__TASK_UNMAPPED_BASE' #define TASK_UNMAPPED_BASE __TASK_UNMAPPED_BASE(TASK_SIZE_LOW) ^~~~~~~~~~~~~~~~~~~~ >> mm/mmap.c:2043:20: note: in expansion of macro 'TASK_UNMAPPED_BASE' info.low_limit = TASK_UNMAPPED_BASE; ^~~~~~~~~~~~~~~~~~ -- In file included from include/linux/elf.h:4:0, from include/linux/module.h:15, from fs/binfmt_elf.c:12: fs/binfmt_elf.c: In function 'load_elf_binary': >> arch/x86/include/asm/elf.h:253:27: error: 'TASK_SIZE_LOW' undeclared (first use in this function) #define ELF_ET_DYN_BASE (TASK_SIZE_LOW / 3 * 2) ^ >> fs/binfmt_elf.c:937:16: note: in expansion of macro 'ELF_ET_DYN_BASE' load_bias = ELF_ET_DYN_BASE - vaddr; ^~~~~~~~~~~~~~~ arch/x86/include/asm/elf.h:253:27: note: each undeclared identifier is reported only once for each function it appears in #define ELF_ET_DYN_BASE (TASK_SIZE_LOW / 3 * 2) ^ >> fs/binfmt_elf.c:937:16: note: in expansion of macro 'ELF_ET_DYN_BASE' load_bias = ELF_ET_DYN_BASE - vaddr; ^~~~~~~~~~~~~~~ vim +/TASK_SIZE_LOW +253 arch/x86/include/asm/elf.h 247 248 /* This is the location that an ET_DYN program is loaded if exec'ed. Typical 249 use of this is to invoke "./ld.so someprog" to test out a new version of 250 the loader. We need to make sure that it is out of the way of the program 251 that it will "exec", and that there is sufficient room for the brk. */ 252 > 253 #define ELF_ET_DYN_BASE (TASK_SIZE_LOW / 3 * 2) 254 255 /* This yields a mask that user programs can use to figure out what 256 instruction set this CPU supports. This could be done in user space, --- 0-DAY kernel test infrastructure Open Source Technology Center https://lists.01.org/pipermail/kbuild-all Intel Corporation [-- Attachment #2: .config.gz --] [-- Type: application/gzip, Size: 26192 bytes --]
On Mon, May 15, 2017 at 10:49:43PM +0800, kbuild test robot wrote: > Hi Kirill, > > [auto build test ERROR on linus/master] > [also build test ERROR on v4.12-rc1 next-20170515] > [cannot apply to tip/x86/core xen-tip/linux-next] > [if your patch is applied to the wrong git tree, please drop us a note to help improve the system] > > url: https://github.com/0day-ci/linux/commits/Kirill-A-Shutemov/x86-5-level-paging-enabling-for-v4-12-Part-4/20170515-202736 > config: i386-defconfig (attached as .config) > compiler: gcc-6 (Debian 6.2.0-3) 6.2.0 20160901 > reproduce: > # save the attached .config to linux build tree > make ARCH=i386 > > All error/warnings (new ones prefixed by >>): > > In file included from include/linux/cache.h:4:0, > from include/linux/printk.h:8, > from include/linux/kernel.h:13, > from mm/mmap.c:11: > mm/mmap.c: In function 'arch_get_unmapped_area_topdown': > arch/x86/include/asm/processor.h:878:50: error: 'TASK_SIZE_LOW' undeclared (first use in this function) > #define TASK_UNMAPPED_BASE __TASK_UNMAPPED_BASE(TASK_SIZE_LOW) Thanks. Fixup is below. Let me know if I need to send the full patch: diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index aaed58b03ddb..65663de9287b 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -794,6 +794,7 @@ static inline void spin_lock_prefetch(const void *x) */ #define IA32_PAGE_OFFSET PAGE_OFFSET #define TASK_SIZE PAGE_OFFSET +#define TASK_SIZE_LOW TASK_SIZE #define TASK_SIZE_MAX TASK_SIZE #define DEFAULT_MAP_WINDOW TASK_SIZE #define STACK_TOP TASK_SIZE -- Kirill A. Shutemov
On Mon 15-05-17 15:12:18, Kirill A. Shutemov wrote: [...] > @@ -195,6 +207,16 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, > info.length = len; > info.low_limit = PAGE_SIZE; > info.high_limit = get_mmap_base(0); > + > + /* > + * If hint address is above DEFAULT_MAP_WINDOW, look for unmapped area > + * in the full address space. > + * > + * !in_compat_syscall() check to avoid high addresses for x32. > + */ > + if (addr > DEFAULT_MAP_WINDOW && !in_compat_syscall()) > + info.high_limit += TASK_SIZE_MAX - DEFAULT_MAP_WINDOW; > + > info.align_mask = 0; > info.align_offset = pgoff << PAGE_SHIFT; > if (filp) { I have two questions/concerns here. The above assumes that any address above 1<<47 will use the _whole_ address space. Is this what we want? What if somebody does mmap(1<<52, ...) because he wants to (ab)use 53+ bits for some other purpose? Shouldn't we cap the high_limit by the given address? Another thing would be that /* requesting a specific address */ if (addr) { addr = PAGE_ALIGN(addr); vma = find_vma(mm, addr); if (TASK_SIZE - len >= addr && (!vma || addr + len <= vma->vm_start)) return addr; } would fail for mmap(-1UL, ...) which is good because we do want to fallback to vm_unmapped_area and have randomized address which is ensured by your info.high_limit += ... but that wouldn't work for mmap(1<<N, ...) where N>47. So the first such mapping won't be randomized while others will be. This is quite unexpected I would say. So it should be documented at least or maybe we want to skip the above shortcut for addr > DEFAULT_MAP_WINDOW altogether. The patch looks sensible other than that. -- Michal Hocko SUSE Labs -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
On Thu, May 18, 2017 at 01:43:59PM +0200, Michal Hocko wrote: > On Mon 15-05-17 15:12:18, Kirill A. Shutemov wrote: > [...] > > @@ -195,6 +207,16 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, > > info.length = len; > > info.low_limit = PAGE_SIZE; > > info.high_limit = get_mmap_base(0); > > + > > + /* > > + * If hint address is above DEFAULT_MAP_WINDOW, look for unmapped area > > + * in the full address space. > > + * > > + * !in_compat_syscall() check to avoid high addresses for x32. > > + */ > > + if (addr > DEFAULT_MAP_WINDOW && !in_compat_syscall()) > > + info.high_limit += TASK_SIZE_MAX - DEFAULT_MAP_WINDOW; > > + > > info.align_mask = 0; > > info.align_offset = pgoff << PAGE_SHIFT; > > if (filp) { > > I have two questions/concerns here. The above assumes that any address above > 1<<47 will use the _whole_ address space. Is this what we want? Yes, I believe so. > What if somebody does mmap(1<<52, ...) because he wants to (ab)use 53+ > bits for some other purpose? Shouldn't we cap the high_limit by the > given address? This would screw existing semantics of hint address -- "map here if free, please". > Another thing would be that > /* requesting a specific address */ > if (addr) { > addr = PAGE_ALIGN(addr); > vma = find_vma(mm, addr); > if (TASK_SIZE - len >= addr && > (!vma || addr + len <= vma->vm_start)) > return addr; > } > > would fail for mmap(-1UL, ...) which is good because we do want to > fallback to vm_unmapped_area and have randomized address which is > ensured by your info.high_limit += ... but that wouldn't work for > mmap(1<<N, ...) where N>47. So the first such mapping won't be > randomized while others will be. This is quite unexpected I would say. > So it should be documented at least or maybe we want to skip the above > shortcut for addr > DEFAULT_MAP_WINDOW altogether. Again, you're missing existing semantics of hint address. You may have a reason to set hint address above 47-bit, besides getting access to full address space. -- Kirill A. Shutemov
On Thu 18-05-17 18:19:52, Kirill A. Shutemov wrote: > On Thu, May 18, 2017 at 01:43:59PM +0200, Michal Hocko wrote: > > On Mon 15-05-17 15:12:18, Kirill A. Shutemov wrote: > > [...] > > > @@ -195,6 +207,16 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, > > > info.length = len; > > > info.low_limit = PAGE_SIZE; > > > info.high_limit = get_mmap_base(0); > > > + > > > + /* > > > + * If hint address is above DEFAULT_MAP_WINDOW, look for unmapped area > > > + * in the full address space. > > > + * > > > + * !in_compat_syscall() check to avoid high addresses for x32. > > > + */ > > > + if (addr > DEFAULT_MAP_WINDOW && !in_compat_syscall()) > > > + info.high_limit += TASK_SIZE_MAX - DEFAULT_MAP_WINDOW; > > > + > > > info.align_mask = 0; > > > info.align_offset = pgoff << PAGE_SHIFT; > > > if (filp) { > > > > I have two questions/concerns here. The above assumes that any address above > > 1<<47 will use the _whole_ address space. Is this what we want? > > Yes, I believe so. > > > What if somebody does mmap(1<<52, ...) because he wants to (ab)use 53+ > > bits for some other purpose? Shouldn't we cap the high_limit by the > > given address? > > This would screw existing semantics of hint address -- "map here if > free, please". Well, the given address is just _hint_. We are still allowed to map to a different place. And it is not specified whether the resulting mapping is above or below that address. So I do not think it would screw the existing semantic. Or do I miss something? -- Michal Hocko SUSE Labs -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
On Thu, May 18, 2017 at 05:27:36PM +0200, Michal Hocko wrote: > On Thu 18-05-17 18:19:52, Kirill A. Shutemov wrote: > > On Thu, May 18, 2017 at 01:43:59PM +0200, Michal Hocko wrote: > > > On Mon 15-05-17 15:12:18, Kirill A. Shutemov wrote: > > > [...] > > > > @@ -195,6 +207,16 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, > > > > info.length = len; > > > > info.low_limit = PAGE_SIZE; > > > > info.high_limit = get_mmap_base(0); > > > > + > > > > + /* > > > > + * If hint address is above DEFAULT_MAP_WINDOW, look for unmapped area > > > > + * in the full address space. > > > > + * > > > > + * !in_compat_syscall() check to avoid high addresses for x32. > > > > + */ > > > > + if (addr > DEFAULT_MAP_WINDOW && !in_compat_syscall()) > > > > + info.high_limit += TASK_SIZE_MAX - DEFAULT_MAP_WINDOW; > > > > + > > > > info.align_mask = 0; > > > > info.align_offset = pgoff << PAGE_SHIFT; > > > > if (filp) { > > > > > > I have two questions/concerns here. The above assumes that any address above > > > 1<<47 will use the _whole_ address space. Is this what we want? > > > > Yes, I believe so. > > > > > What if somebody does mmap(1<<52, ...) because he wants to (ab)use 53+ > > > bits for some other purpose? Shouldn't we cap the high_limit by the > > > given address? > > > > This would screw existing semantics of hint address -- "map here if > > free, please". > > Well, the given address is just _hint_. We are still allowed to map to a > different place. And it is not specified whether the resulting mapping > is above or below that address. So I do not think it would screw the > existing semantic. Or do I miss something? You are right, that this behaviour is not fixed by any standard or written down in documentation, but it's de-facto policy of Linux mmap(2) the beginning. And we need to be very careful when messing with this. I believe that qemu linux-user to some extend relies on this behaviour to do 32-bit allocations on 64-bit machine. https://github.com/qemu/qemu/blob/master/linux-user/mmap.c#L256 -- Kirill A. Shutemov -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
On Thu 18-05-17 18:41:35, Kirill A. Shutemov wrote: > On Thu, May 18, 2017 at 05:27:36PM +0200, Michal Hocko wrote: > > On Thu 18-05-17 18:19:52, Kirill A. Shutemov wrote: > > > On Thu, May 18, 2017 at 01:43:59PM +0200, Michal Hocko wrote: > > > > On Mon 15-05-17 15:12:18, Kirill A. Shutemov wrote: > > > > [...] > > > > > @@ -195,6 +207,16 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, > > > > > info.length = len; > > > > > info.low_limit = PAGE_SIZE; > > > > > info.high_limit = get_mmap_base(0); > > > > > + > > > > > + /* > > > > > + * If hint address is above DEFAULT_MAP_WINDOW, look for unmapped area > > > > > + * in the full address space. > > > > > + * > > > > > + * !in_compat_syscall() check to avoid high addresses for x32. > > > > > + */ > > > > > + if (addr > DEFAULT_MAP_WINDOW && !in_compat_syscall()) > > > > > + info.high_limit += TASK_SIZE_MAX - DEFAULT_MAP_WINDOW; > > > > > + > > > > > info.align_mask = 0; > > > > > info.align_offset = pgoff << PAGE_SHIFT; > > > > > if (filp) { > > > > > > > > I have two questions/concerns here. The above assumes that any address above > > > > 1<<47 will use the _whole_ address space. Is this what we want? > > > > > > Yes, I believe so. > > > > > > > What if somebody does mmap(1<<52, ...) because he wants to (ab)use 53+ > > > > bits for some other purpose? Shouldn't we cap the high_limit by the > > > > given address? > > > > > > This would screw existing semantics of hint address -- "map here if > > > free, please". > > > > Well, the given address is just _hint_. We are still allowed to map to a > > different place. And it is not specified whether the resulting mapping > > is above or below that address. So I do not think it would screw the > > existing semantic. Or do I miss something? > > You are right, that this behaviour is not fixed by any standard or written > down in documentation, but it's de-facto policy of Linux mmap(2) the > beginning. > > And we need to be very careful when messing with this. I am sorry but I still do not understand. You already touch this semantic. mmap(-1UL,...) will already returns basically arbitrary address. All I am asking for is that mmap doesn't return higher address than the given one whent address > 1<<47. We do not have any such users currently so it won't be a change in behavior while it would allow different sized address spaces naturally. -- Michal Hocko SUSE Labs -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
On Thu 18-05-17 17:50:03, Michal Hocko wrote: > On Thu 18-05-17 18:41:35, Kirill A. Shutemov wrote: > > On Thu, May 18, 2017 at 05:27:36PM +0200, Michal Hocko wrote: > > > On Thu 18-05-17 18:19:52, Kirill A. Shutemov wrote: > > > > On Thu, May 18, 2017 at 01:43:59PM +0200, Michal Hocko wrote: > > > > > On Mon 15-05-17 15:12:18, Kirill A. Shutemov wrote: > > > > > [...] > > > > > > @@ -195,6 +207,16 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, > > > > > > info.length = len; > > > > > > info.low_limit = PAGE_SIZE; > > > > > > info.high_limit = get_mmap_base(0); > > > > > > + > > > > > > + /* > > > > > > + * If hint address is above DEFAULT_MAP_WINDOW, look for unmapped area > > > > > > + * in the full address space. > > > > > > + * > > > > > > + * !in_compat_syscall() check to avoid high addresses for x32. > > > > > > + */ > > > > > > + if (addr > DEFAULT_MAP_WINDOW && !in_compat_syscall()) > > > > > > + info.high_limit += TASK_SIZE_MAX - DEFAULT_MAP_WINDOW; > > > > > > + > > > > > > info.align_mask = 0; > > > > > > info.align_offset = pgoff << PAGE_SHIFT; > > > > > > if (filp) { > > > > > > > > > > I have two questions/concerns here. The above assumes that any address above > > > > > 1<<47 will use the _whole_ address space. Is this what we want? > > > > > > > > Yes, I believe so. > > > > > > > > > What if somebody does mmap(1<<52, ...) because he wants to (ab)use 53+ > > > > > bits for some other purpose? Shouldn't we cap the high_limit by the > > > > > given address? > > > > > > > > This would screw existing semantics of hint address -- "map here if > > > > free, please". > > > > > > Well, the given address is just _hint_. We are still allowed to map to a > > > different place. And it is not specified whether the resulting mapping > > > is above or below that address. So I do not think it would screw the > > > existing semantic. Or do I miss something? > > > > You are right, that this behaviour is not fixed by any standard or written > > down in documentation, but it's de-facto policy of Linux mmap(2) the > > beginning. > > > > And we need to be very careful when messing with this. > > I am sorry but I still do not understand. You already touch this > semantic. mmap(-1UL,...) will already returns basically arbitrary > address. All I am asking for is that mmap doesn't return higher address > than the given one whent address > 1<<47. We do not have any such users > currently so it won't be a change in behavior while it would allow > different sized address spaces naturally. I basically mean something like the following --- diff --git a/arch/x86/kernel/sys_x86_64.c b/arch/x86/kernel/sys_x86_64.c index 74d1587b181d..d6f66ff02d0a 100644 --- a/arch/x86/kernel/sys_x86_64.c +++ b/arch/x86/kernel/sys_x86_64.c @@ -195,7 +195,7 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, goto bottomup; /* requesting a specific address */ - if (addr) { + if (addr && addr <= DEFAULT_MAP_WINDOW) { addr = PAGE_ALIGN(addr); vma = find_vma(mm, addr); if (TASK_SIZE - len >= addr && @@ -215,7 +215,7 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, * !in_compat_syscall() check to avoid high addresses for x32. */ if (addr > DEFAULT_MAP_WINDOW && !in_compat_syscall()) - info.high_limit += TASK_SIZE_MAX - DEFAULT_MAP_WINDOW; + info.high_limit += min(TASK_SIZE_MAX, address) - DEFAULT_MAP_WINDOW; info.align_mask = 0; info.align_offset = pgoff << PAGE_SHIFT; -- Michal Hocko SUSE Labs -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
On Thu, May 18, 2017 at 05:59:14PM +0200, Michal Hocko wrote: > On Thu 18-05-17 17:50:03, Michal Hocko wrote: > > On Thu 18-05-17 18:41:35, Kirill A. Shutemov wrote: > > > On Thu, May 18, 2017 at 05:27:36PM +0200, Michal Hocko wrote: > > > > On Thu 18-05-17 18:19:52, Kirill A. Shutemov wrote: > > > > > On Thu, May 18, 2017 at 01:43:59PM +0200, Michal Hocko wrote: > > > > > > On Mon 15-05-17 15:12:18, Kirill A. Shutemov wrote: > > > > > > [...] > > > > > > > @@ -195,6 +207,16 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, > > > > > > > info.length = len; > > > > > > > info.low_limit = PAGE_SIZE; > > > > > > > info.high_limit = get_mmap_base(0); > > > > > > > + > > > > > > > + /* > > > > > > > + * If hint address is above DEFAULT_MAP_WINDOW, look for unmapped area > > > > > > > + * in the full address space. > > > > > > > + * > > > > > > > + * !in_compat_syscall() check to avoid high addresses for x32. > > > > > > > + */ > > > > > > > + if (addr > DEFAULT_MAP_WINDOW && !in_compat_syscall()) > > > > > > > + info.high_limit += TASK_SIZE_MAX - DEFAULT_MAP_WINDOW; > > > > > > > + > > > > > > > info.align_mask = 0; > > > > > > > info.align_offset = pgoff << PAGE_SHIFT; > > > > > > > if (filp) { > > > > > > > > > > > > I have two questions/concerns here. The above assumes that any address above > > > > > > 1<<47 will use the _whole_ address space. Is this what we want? > > > > > > > > > > Yes, I believe so. > > > > > > > > > > > What if somebody does mmap(1<<52, ...) because he wants to (ab)use 53+ > > > > > > bits for some other purpose? Shouldn't we cap the high_limit by the > > > > > > given address? > > > > > > > > > > This would screw existing semantics of hint address -- "map here if > > > > > free, please". > > > > > > > > Well, the given address is just _hint_. We are still allowed to map to a > > > > different place. And it is not specified whether the resulting mapping > > > > is above or below that address. So I do not think it would screw the > > > > existing semantic. Or do I miss something? > > > > > > You are right, that this behaviour is not fixed by any standard or written > > > down in documentation, but it's de-facto policy of Linux mmap(2) the > > > beginning. > > > > > > And we need to be very careful when messing with this. > > > > I am sorry but I still do not understand. You already touch this > > semantic. mmap(-1UL,...) will already returns basically arbitrary > > address. All I am asking for is that mmap doesn't return higher address > > than the given one whent address > 1<<47. We do not have any such users > > currently so it won't be a change in behavior while it would allow > > different sized address spaces naturally. > > I basically mean something like the following > --- > diff --git a/arch/x86/kernel/sys_x86_64.c b/arch/x86/kernel/sys_x86_64.c > index 74d1587b181d..d6f66ff02d0a 100644 > --- a/arch/x86/kernel/sys_x86_64.c > +++ b/arch/x86/kernel/sys_x86_64.c > @@ -195,7 +195,7 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, > goto bottomup; > > /* requesting a specific address */ > - if (addr) { > + if (addr && addr <= DEFAULT_MAP_WINDOW) { > addr = PAGE_ALIGN(addr); > vma = find_vma(mm, addr); > if (TASK_SIZE - len >= addr && > @@ -215,7 +215,7 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, > * !in_compat_syscall() check to avoid high addresses for x32. > */ > if (addr > DEFAULT_MAP_WINDOW && !in_compat_syscall()) > - info.high_limit += TASK_SIZE_MAX - DEFAULT_MAP_WINDOW; > + info.high_limit += min(TASK_SIZE_MAX, address) - DEFAULT_MAP_WINDOW; > > info.align_mask = 0; > info.align_offset = pgoff << PAGE_SHIFT; You try to stretch the interface too far. With the patch you propose we have totally different behaviour wrt hint address if it below and above 47-bits: * <= 47-bits: allocate VM [addr; addr + len - 1], if free; * > 47-bits: allocate VM anywhere under addr; Sorry, no. That's ugly. If you feel that we need to guarantee that bits above certain limit are unused, introduce new interface. We have enough logic encoded in hint address already. -- Kirill A. Shutemov -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
On Thu 18-05-17 19:22:55, Kirill A. Shutemov wrote: > On Thu, May 18, 2017 at 05:59:14PM +0200, Michal Hocko wrote: [...] > > I basically mean something like the following > > --- > > diff --git a/arch/x86/kernel/sys_x86_64.c b/arch/x86/kernel/sys_x86_64.c > > index 74d1587b181d..d6f66ff02d0a 100644 > > --- a/arch/x86/kernel/sys_x86_64.c > > +++ b/arch/x86/kernel/sys_x86_64.c > > @@ -195,7 +195,7 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, > > goto bottomup; > > > > /* requesting a specific address */ > > - if (addr) { > > + if (addr && addr <= DEFAULT_MAP_WINDOW) { > > addr = PAGE_ALIGN(addr); > > vma = find_vma(mm, addr); > > if (TASK_SIZE - len >= addr && > > @@ -215,7 +215,7 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, > > * !in_compat_syscall() check to avoid high addresses for x32. > > */ > > if (addr > DEFAULT_MAP_WINDOW && !in_compat_syscall()) > > - info.high_limit += TASK_SIZE_MAX - DEFAULT_MAP_WINDOW; > > + info.high_limit += min(TASK_SIZE_MAX, address) - DEFAULT_MAP_WINDOW; > > > > info.align_mask = 0; > > info.align_offset = pgoff << PAGE_SHIFT; > > You try to stretch the interface too far. With the patch you propose we > have totally different behaviour wrt hint address if it below and above > 47-bits: > > * <= 47-bits: allocate VM [addr; addr + len - 1], if free; unless I am missing something fundamental here this is not how it works. We just map a different range if the requested one is not free (in absence of MAP_FIXED). And we do that in top->down direction so this is already how it works. And you _do_ rely on the same thing when allowing larger than 47b except you start from the top of the supported address space. So how exactly is your new behavior any different and more clear? Say you would do mmap(1<<48, ...) # you will get 1<<48 mmap(1<<48, ...) # you will get something below TASK_SIZE_MAX > * > 47-bits: allocate VM anywhere under addr; > > Sorry, no. That's ugly. > > If you feel that we need to guarantee that bits above certain limit are > unused, introduce new interface. We have enough logic encoded in hint > address already. -- Michal Hocko SUSE Labs
On Thu 18-05-17 19:13:30, Michal Hocko wrote:
> On Thu 18-05-17 19:22:55, Kirill A. Shutemov wrote:
> > On Thu, May 18, 2017 at 05:59:14PM +0200, Michal Hocko wrote:
> [...]
> > > I basically mean something like the following
> > > ---
> > > diff --git a/arch/x86/kernel/sys_x86_64.c b/arch/x86/kernel/sys_x86_64.c
> > > index 74d1587b181d..d6f66ff02d0a 100644
> > > --- a/arch/x86/kernel/sys_x86_64.c
> > > +++ b/arch/x86/kernel/sys_x86_64.c
> > > @@ -195,7 +195,7 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
> > > goto bottomup;
> > >
> > > /* requesting a specific address */
> > > - if (addr) {
> > > + if (addr && addr <= DEFAULT_MAP_WINDOW) {
> > > addr = PAGE_ALIGN(addr);
> > > vma = find_vma(mm, addr);
> > > if (TASK_SIZE - len >= addr &&
> > > @@ -215,7 +215,7 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
> > > * !in_compat_syscall() check to avoid high addresses for x32.
> > > */
> > > if (addr > DEFAULT_MAP_WINDOW && !in_compat_syscall())
> > > - info.high_limit += TASK_SIZE_MAX - DEFAULT_MAP_WINDOW;
> > > + info.high_limit += min(TASK_SIZE_MAX, address) - DEFAULT_MAP_WINDOW;
> > >
> > > info.align_mask = 0;
> > > info.align_offset = pgoff << PAGE_SHIFT;
> >
> > You try to stretch the interface too far. With the patch you propose we
> > have totally different behaviour wrt hint address if it below and above
> > 47-bits:
> >
> > * <= 47-bits: allocate VM [addr; addr + len - 1], if free;
>
> unless I am missing something fundamental here this is not how it works.
> We just map a different range if the requested one is not free (in
> absence of MAP_FIXED). And we do that in top->down direction so this is
> already how it works. And you _do_ rely on the same thing when allowing
> larger than 47b except you start from the top of the supported address
> space. So how exactly is your new behavior any different and more clear?
OK, I take that back because I am clearly wrong. We simply always start
from top. Sorry about my confusion.
Feel free to add
Acked-by: Michal Hocko <mhocko-IBi9RG/b67k@public.gmane.org>
--
Michal Hocko
SUSE Labs