* [RFC] x86, mm: start mmap allocation for libs from low addresses
@ 2011-08-12 10:29 ` Vasiliy Kulikov
  0 siblings, 0 replies; 47+ messages in thread
From: Vasiliy Kulikov @ 2011-08-12 10:29 UTC (permalink / raw)
  To: Thomas Gleixner, Ingo Molnar
  Cc: kernel-hardening, H. Peter Anvin, Peter Zijlstra, Andrew Morton,
	x86, linux-kernel, linux-mm

This patch changes the mmap base address allocator logic so that it
prefers to allocate addresses from the first 16 MB of the address
space.  These addresses start with a zero byte (0x00AABBCC).  Using
such addresses breaks ret2libc exploits that abuse string buffer
overflows.  As library addresses contain a zero byte, addresses of
library functions may not be present in the string that is used to
overflow the buffer.  As a result, it makes it impossible to change the
return address on the stack to the address of some library function
(e.g. system(3)).
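
As an illustration (a minimal userspace sketch, not part of the patch;
the address value below is hypothetical), a C string copy cannot carry
a zero-byte library address followed by further attacker-controlled
data:

#include <stdio.h>
#include <string.h>

int main(void)
{
	/* hypothetical address of a libc function with the patch applied */
	const char addr[4] = { 0x30, 0x24, 0x16, 0x00 };  /* 0x00162430, little-endian */
	char payload[32] = { 0 };

	memcpy(payload, "AAAAAAAA", 8);    /* filler up to the saved return address */
	memcpy(payload + 8, addr, 4);      /* the forged return address */

	/* a strcpy()-style overflow copies only up to the first NUL, so
	 * everything after the address's zero top byte is cut off */
	printf("bytes a string copy would transfer: %zu\n", strlen(payload) + 1);
	return 0;
}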

The logic is applied to 32-bit tasks, both on 32-bit kernels and for
32-bit tasks running on 64-bit kernels.  64-bit tasks already have zero
bytes in the addresses of library functions.  Other architectures may
reuse the logic.

The first MB is excluded from the range for compatibility with programs
like Wine and DOSEMU.

If the sum of the library sizes plus the executable size doesn't exceed
15 MB, the only pages outside the ASCII-protected range are the VDSO
and vsyscall pages.  However, they don't provide enough material for
obtaining arbitrary code execution and are not dangerous without other
executable pages.

If the 16 MB range is exhausted, we fall back to the old allocation
algorithm.

Without the patch:

$ ldd /bin/ls
	linux-gate.so.1 =>  (0xf779c000)
        librt.so.1 => /lib/librt.so.1 (0xb7fcf000)
        libtermcap.so.2 => /lib/libtermcap.so.2 (0xb7fca000)
        libc.so.6 => /lib/libc.so.6 (0xb7eae000)
        libpthread.so.0 => /lib/libpthread.so.0 (0xb7e5b000)
        /lib/ld-linux.so.2 (0xb7fe6000)

With the patch:

$ ldd /bin/ls
	linux-gate.so.1 =>  (0xf772a000)
	librt.so.1 => /lib/librt.so.1 (0x0014a000)
	libtermcap.so.2 => /lib/libtermcap.so.2 (0x0015e000)
	libc.so.6 => /lib/libc.so.6 (0x00162000)
	libpthread.so.0 => /lib/libpthread.so.0 (0x00283000)
	/lib/ld-linux.so.2 (0x00131000)

The same logic was used in the -ow patch for 2.0-2.4 kernels and in
exec-shield for 2.6.x kernels.  Parts of the code were taken from the
RHEL6 version of exec-shield.

Signed-off-by: Vasiliy Kulikov <segoon@openwall.com>
--
 arch/x86/mm/mmap.c       |    5 +++
 include/linux/mm_types.h |    4 +++
 include/linux/sched.h    |    3 ++
 mm/mmap.c                |   66 ++++++++++++++++++++++++++++++++++++++++++---
 4 files changed, 73 insertions(+), 5 deletions(-)

diff --git a/arch/x86/mm/mmap.c b/arch/x86/mm/mmap.c
index 1dab519..0c82a94 100644
--- a/arch/x86/mm/mmap.c
+++ b/arch/x86/mm/mmap.c
@@ -131,6 +131,11 @@ void arch_pick_mmap_layout(struct mm_struct *mm)
 	} else {
 		mm->mmap_base = mmap_base();
 		mm->get_unmapped_area = arch_get_unmapped_area_topdown;
+		if (mmap_is_ia32()) {
+			mm->get_unmapped_exec_area =
+				arch_get_unmapped_exec_area;
+			mm->lib_mmap_base = 0x00110000 + mmap_rnd();
+		}
 		mm->unmap_area = arch_unmap_area_topdown;
 	}
 }
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 027935c..5f2dca9 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -225,9 +225,13 @@ struct mm_struct {
 	unsigned long (*get_unmapped_area) (struct file *filp,
 				unsigned long addr, unsigned long len,
 				unsigned long pgoff, unsigned long flags);
+	unsigned long (*get_unmapped_exec_area) (struct file *filp,
+				unsigned long addr, unsigned long len,
+				unsigned long pgoff, unsigned long flags);
 	void (*unmap_area) (struct mm_struct *mm, unsigned long addr);
 #endif
 	unsigned long mmap_base;		/* base of mmap area */
+	unsigned long lib_mmap_base;		/* base of mmap libraries area (includes zero symbol) */
 	unsigned long task_size;		/* size of task vm space */
 	unsigned long cached_hole_size; 	/* if non-zero, the largest hole below free_area_cache */
 	unsigned long free_area_cache;		/* first hole of size cached_hole_size or larger */
diff --git a/include/linux/sched.h b/include/linux/sched.h
index f024c63..8feaba9 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -394,6 +394,9 @@ arch_get_unmapped_area_topdown(struct file *filp, unsigned long addr,
 			  unsigned long flags);
 extern void arch_unmap_area(struct mm_struct *, unsigned long);
 extern void arch_unmap_area_topdown(struct mm_struct *, unsigned long);
+extern unsigned long
+arch_get_unmapped_exec_area(struct file *, unsigned long,
+		unsigned long, unsigned long, unsigned long);
 #else
 static inline void arch_pick_mmap_layout(struct mm_struct *mm) {}
 #endif
diff --git a/mm/mmap.c b/mm/mmap.c
index d49736f..489510c 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -50,6 +50,10 @@ static void unmap_region(struct mm_struct *mm,
 		struct vm_area_struct *vma, struct vm_area_struct *prev,
 		unsigned long start, unsigned long end);
 
+static unsigned long
+get_unmapped_area_prot(struct file *file, unsigned long addr, unsigned long len,
+		unsigned long pgoff, unsigned long flags, bool exec);
+
 /*
  * WARNING: the debugging will use recursive algorithms so never enable this
  * unless you know what you are doing.
@@ -989,7 +993,8 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
 	/* Obtain the address to map to. we verify (or select) it and ensure
 	 * that it represents a valid section of the address space.
 	 */
-	addr = get_unmapped_area(file, addr, len, pgoff, flags);
+	addr = get_unmapped_area_prot(file, addr, len, pgoff, flags,
+			prot & PROT_EXEC);
 	if (addr & ~PAGE_MASK)
 		return addr;
 
@@ -1528,6 +1533,46 @@ bottomup:
 }
 #endif
 
+unsigned long
+arch_get_unmapped_exec_area(struct file *filp, unsigned long addr0,
+		unsigned long len, unsigned long pgoff, unsigned long flags)
+{
+	unsigned long addr = addr0;
+	struct mm_struct *mm = current->mm;
+	struct vm_area_struct *vma;
+
+	if (len > TASK_SIZE)
+		return -ENOMEM;
+
+	if (flags & MAP_FIXED)
+		return addr;
+
+	/* We ALWAYS start from the beginning as base addresses
+	 * with zero high bits is a valued resource */
+	addr = max_t(unsigned long, mm->lib_mmap_base, mmap_min_addr);
+
+	for (vma = find_vma(mm, addr); ; vma = vma->vm_next) {
+		/* At this point:  (!vma || addr < vma->vm_end). */
+		if (TASK_SIZE - len < addr)
+			return -ENOMEM;
+
+		/* We don't want to touch brk of not DYNAMIC elf binaries */
+		if (mm->brk && addr > mm->brk)
+			goto failed;
+
+		if (!vma || addr + len <= vma->vm_start)
+			return addr;
+
+		addr = vma->vm_end;
+		/* If 0x01000000 is touched, the algo gives up */
+		if (addr >= 0x01000000)
+			goto failed;
+	}
+
+failed:
+	return current->mm->get_unmapped_area(filp, addr0, len, pgoff, flags);
+}
+
 void arch_unmap_area_topdown(struct mm_struct *mm, unsigned long addr)
 {
 	/*
@@ -1541,9 +1586,9 @@ void arch_unmap_area_topdown(struct mm_struct *mm, unsigned long addr)
 		mm->free_area_cache = mm->mmap_base;
 }
 
-unsigned long
-get_unmapped_area(struct file *file, unsigned long addr, unsigned long len,
-		unsigned long pgoff, unsigned long flags)
+static unsigned long
+get_unmapped_area_prot(struct file *file, unsigned long addr, unsigned long len,
+		unsigned long pgoff, unsigned long flags, bool exec)
 {
 	unsigned long (*get_area)(struct file *, unsigned long,
 				  unsigned long, unsigned long, unsigned long);
@@ -1556,7 +1601,11 @@ get_unmapped_area(struct file *file, unsigned long addr, unsigned long len,
 	if (len > TASK_SIZE)
 		return -ENOMEM;
 
-	get_area = current->mm->get_unmapped_area;
+	if (exec && current->mm->get_unmapped_exec_area)
+		get_area = current->mm->get_unmapped_exec_area;
+	else
+		get_area = current->mm->get_unmapped_area;
+
 	if (file && file->f_op && file->f_op->get_unmapped_area)
 		get_area = file->f_op->get_unmapped_area;
 	addr = get_area(file, addr, len, pgoff, flags);
@@ -1571,6 +1620,13 @@ get_unmapped_area(struct file *file, unsigned long addr, unsigned long len,
 	return arch_rebalance_pgtables(addr, len);
 }
 
+unsigned long
+get_unmapped_area(struct file *file, unsigned long addr, unsigned long len,
+		unsigned long pgoff, unsigned long flags)
+{
+	return get_unmapped_area_prot(file, addr, len, pgoff, flags, false);
+}
+
 EXPORT_SYMBOL(get_unmapped_area);
 
 /* Look up the first VMA which satisfies  addr < vm_end,  NULL if none. */
---

* Re: [kernel-hardening] [RFC] x86, mm: start mmap allocation for libs from low addresses
  2011-08-12 10:29 ` Vasiliy Kulikov
@ 2011-08-12 10:58 ` Solar Designer
  2011-08-12 11:05   ` Vasiliy Kulikov
  2011-08-25 17:19   ` Vasiliy Kulikov
  -1 siblings, 2 replies; 47+ messages in thread
From: Solar Designer @ 2011-08-12 10:58 UTC (permalink / raw)
  To: kernel-hardening

Vasiliy,

On Fri, Aug 12, 2011 at 02:29:54PM +0400, Vasiliy Kulikov wrote:
> This patch changes mmap base address allocator logic to incline to
> allocate addresses from the first 16 Mbs of address space.  These
> addresses start from zero byte (0x00AABBCC).  Using such addresses
> breaks ret2libc exploits abusing string buffer overflows.  As library
> addresses contain zero byte, addresses of library functions may not
> present in the string, which is used to overflow the buffer.  As a
> result, it makes it impossible to change the return address on the stack
> to the address of some library function (e.g. system(3)).

JFYI, this statement is too strong.  It is usually possible to pass such
addresses because the string does terminate with a NUL and this NUL gets
written.  C string based overflows where the terminating NUL is somehow
not written are uncommon (in fact, I don't recall any).  However,
requiring NUL as the most significant byte of a 32-bit address achieves
two things:

1. The overflow length has to be inferred/guessed exactly, because only
one NUL may be written.  Simply using a repeated pattern (with function
address and arguments) no longer works.

2. Passing function arguments in the straightforward manner no longer
works, because copying stops after the NUL.  The attacker's best bet may
be to find an entry point not at function boundary that sets registers
and then proceeds with or branches to the desired library code.  The
easiest way to set registers and branch would be a function epilogue -
pop/pop/.../ret - but then there's the difficulty in passing the address
to ret to (we have just one NUL and we've already used it to get to this
code).  Similarly, even via such pop's we can't pass an argument that
contains a NUL in it - e.g., the address of "/bin/sh" in libc (it
contains a NUL most significant byte too) or a zero value for root's
uid.  A possible bypass is via multiple overflows - if the overflow may
be triggered more than once before the vulnerable function returns, then
multiple NULs may be written, exactly one per overflow.  But this is
hopefully relatively rare.
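
For illustration (a minimal sketch with a made-up frame layout and
target address, assuming a 32-bit little-endian stack): the single NUL
written by the copy has to land exactly on the most significant byte of
the saved return address, which is why the overflow length must be
known to the byte:

#include <stdio.h>
#include <string.h>

int main(void)
{
	/* pretend frame: an 8-byte buffer followed by the 4-byte saved
	 * return address */
	unsigned char frame[12] = { 0 };

	/* 8 filler bytes, then the three low bytes of 0x00162430;
	 * strcpy() appends the single terminating NUL as the 12th byte,
	 * which becomes the address's most significant byte */
	const char payload[] = "AAAAAAAA" "\x30\x24\x16";

	strcpy((char *)frame, payload);
	printf("forged return address: 0x%02x%02x%02x%02x\n",
	       frame[11], frame[10], frame[9], frame[8]);
	return 0;
}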

Overall, this does make exploits harder and less reliable, just not in
the exact sense of making it impossible to point the return address at
libc.

Alexander

* Re: [kernel-hardening] [RFC] x86, mm: start mmap allocation for libs from low addresses
  2011-08-12 10:58 ` [kernel-hardening] " Solar Designer
@ 2011-08-12 11:05   ` Vasiliy Kulikov
  2011-08-25 17:19   ` Vasiliy Kulikov
  1 sibling, 0 replies; 47+ messages in thread
From: Vasiliy Kulikov @ 2011-08-12 11:05 UTC (permalink / raw)
  To: kernel-hardening

On Fri, Aug 12, 2011 at 14:58 +0400, Solar Designer wrote:
> On Fri, Aug 12, 2011 at 02:29:54PM +0400, Vasiliy Kulikov wrote:
> > As a
> > result, it makes it impossible to change the return address on the stack
> > to the address of some library function (e.g. system(3)).
> 
> JFYI, this statement is too strong.
[...]

You're right.  I was thinking about a single overflow for ret2lib with
1+ argument(s).  In general, my statement is wrong, sure.

Thank you!

-- 
Vasiliy

* Re: [RFC] x86, mm: start mmap allocation for libs from low addresses
  2011-08-12 10:29 ` Vasiliy Kulikov
@ 2011-08-12 23:19   ` H. Peter Anvin
  -1 siblings, 0 replies; 47+ messages in thread
From: H. Peter Anvin @ 2011-08-12 23:19 UTC (permalink / raw)
  To: Vasiliy Kulikov, Thomas Gleixner, Ingo Molnar
  Cc: kernel-hardening, Peter Zijlstra, Andrew Morton, x86,
	linux-kernel, linux-mm

Vasiliy Kulikov <segoon@openwall.com> wrote:

>This patch changes mmap base address allocator logic to incline to
>allocate addresses from the first 16 Mbs of address space.  These
>addresses start from zero byte (0x00AABBCC).  Using such addresses
>breaks ret2libc exploits abusing string buffer overflows.  As library
>addresses contain zero byte, addresses of library functions may not
>present in the string, which is used to overflow the buffer.  As a
>result, it makes it impossible to change the return address on the
>stack
>to the address of some library function (e.g. system(3)).
>
>The logic is applied to 32 bit tasks, both for 32 bit kernels and for
>32
>bit tasks running on 64 bit kernels.  64 bit tasks already have zero
>bytes in addresses of library functions.  Other architectures may reuse
>the logic.
>
>The first Mb is excluded from the range because of the compatibility
>with
>programs like Wine and Dosemu.
>
>If the sum of libraries sizes plus executable size doesn't exceed 15
>Mb,
>the only pages out of ASCII-protected range are VDSO and vsyscall.
>However, they don't provide enough material for obtaining arbitrary
>code
>execution and are not dangerous without using other executable pages.
>
>If 16 Mbs are over, we fallback to the old allocation algorithm.
>
>Without the patch:
>
>$ ldd /bin/ls
>	linux-gate.so.1 =>  (0xf779c000)
>        librt.so.1 => /lib/librt.so.1 (0xb7fcf000)
>        libtermcap.so.2 => /lib/libtermcap.so.2 (0xb7fca000)
>        libc.so.6 => /lib/libc.so.6 (0xb7eae000)
>        libpthread.so.0 => /lib/libpthread.so.0 (0xb7e5b000)
>        /lib/ld-linux.so.2 (0xb7fe6000)
>
>With the patch:
>
>$ ldd /bin/ls
>	linux-gate.so.1 =>  (0xf772a000)
>	librt.so.1 => /lib/librt.so.1 (0x0014a000)
>	libtermcap.so.2 => /lib/libtermcap.so.2 (0x0015e000)
>	libc.so.6 => /lib/libc.so.6 (0x00162000)
>	libpthread.so.0 => /lib/libpthread.so.0 (0x00283000)
>	/lib/ld-linux.so.2 (0x00131000)
>
>The same logic was used in -ow patch for 2.0-2.4 kernels and in
>exec-shield for 2.6.x kernels.  Parts of the code were taken from RHEL6
>version of exec-shield.
>
>Signed-off-by: Vasiliy Kulikov <segoon@openwall.com>
>--
> arch/x86/mm/mmap.c       |    5 +++
> include/linux/mm_types.h |    4 +++
> include/linux/sched.h    |    3 ++
>mm/mmap.c                |   66
>++++++++++++++++++++++++++++++++++++++++++---
> 4 files changed, 73 insertions(+), 5 deletions(-)
>
>diff --git a/arch/x86/mm/mmap.c b/arch/x86/mm/mmap.c
>index 1dab519..0c82a94 100644
>--- a/arch/x86/mm/mmap.c
>+++ b/arch/x86/mm/mmap.c
>@@ -131,6 +131,11 @@ void arch_pick_mmap_layout(struct mm_struct *mm)
> 	} else {
> 		mm->mmap_base = mmap_base();
> 		mm->get_unmapped_area = arch_get_unmapped_area_topdown;
>+		if (mmap_is_ia32()) {
>+			mm->get_unmapped_exec_area =
>+				arch_get_unmapped_exec_area;
>+			mm->lib_mmap_base = 0x00110000 + mmap_rnd();
>+		}
> 		mm->unmap_area = arch_unmap_area_topdown;
> 	}
> }
>diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
>index 027935c..5f2dca9 100644
>--- a/include/linux/mm_types.h
>+++ b/include/linux/mm_types.h
>@@ -225,9 +225,13 @@ struct mm_struct {
> 	unsigned long (*get_unmapped_area) (struct file *filp,
> 				unsigned long addr, unsigned long len,
> 				unsigned long pgoff, unsigned long flags);
>+	unsigned long (*get_unmapped_exec_area) (struct file *filp,
>+				unsigned long addr, unsigned long len,
>+				unsigned long pgoff, unsigned long flags);
> 	void (*unmap_area) (struct mm_struct *mm, unsigned long addr);
> #endif
> 	unsigned long mmap_base;		/* base of mmap area */
>+	unsigned long lib_mmap_base;		/* base of mmap libraries area
>(includes zero symbol) */
> 	unsigned long task_size;		/* size of task vm space */
>	unsigned long cached_hole_size; 	/* if non-zero, the largest hole
>below free_area_cache */
>	unsigned long free_area_cache;		/* first hole of size cached_hole_size
>or larger */
>diff --git a/include/linux/sched.h b/include/linux/sched.h
>index f024c63..8feaba9 100644
>--- a/include/linux/sched.h
>+++ b/include/linux/sched.h
>@@ -394,6 +394,9 @@ arch_get_unmapped_area_topdown(struct file *filp,
>unsigned long addr,
> 			  unsigned long flags);
> extern void arch_unmap_area(struct mm_struct *, unsigned long);
>extern void arch_unmap_area_topdown(struct mm_struct *, unsigned long);
>+extern unsigned long
>+arch_get_unmapped_exec_area(struct file *, unsigned long,
>+		unsigned long, unsigned long, unsigned long);
> #else
> static inline void arch_pick_mmap_layout(struct mm_struct *mm) {}
> #endif
>diff --git a/mm/mmap.c b/mm/mmap.c
>index d49736f..489510c 100644
>--- a/mm/mmap.c
>+++ b/mm/mmap.c
>@@ -50,6 +50,10 @@ static void unmap_region(struct mm_struct *mm,
> 		struct vm_area_struct *vma, struct vm_area_struct *prev,
> 		unsigned long start, unsigned long end);
> 
>+static unsigned long
>+get_unmapped_area_prot(struct file *file, unsigned long addr, unsigned
>long len,
>+		unsigned long pgoff, unsigned long flags, bool exec);
>+
> /*
>* WARNING: the debugging will use recursive algorithms so never enable
>this
>  * unless you know what you are doing.
>@@ -989,7 +993,8 @@ unsigned long do_mmap_pgoff(struct file *file,
>unsigned long addr,
> 	/* Obtain the address to map to. we verify (or select) it and ensure
> 	 * that it represents a valid section of the address space.
> 	 */
>-	addr = get_unmapped_area(file, addr, len, pgoff, flags);
>+	addr = get_unmapped_area_prot(file, addr, len, pgoff, flags,
>+			prot & PROT_EXEC);
> 	if (addr & ~PAGE_MASK)
> 		return addr;
> 
>@@ -1528,6 +1533,46 @@ bottomup:
> }
> #endif
> 
>+unsigned long
>+arch_get_unmapped_exec_area(struct file *filp, unsigned long addr0,
>+		unsigned long len, unsigned long pgoff, unsigned long flags)
>+{
>+	unsigned long addr = addr0;
>+	struct mm_struct *mm = current->mm;
>+	struct vm_area_struct *vma;
>+
>+	if (len > TASK_SIZE)
>+		return -ENOMEM;
>+
>+	if (flags & MAP_FIXED)
>+		return addr;
>+
>+	/* We ALWAYS start from the beginning as base addresses
>+	 * with zero high bits is a valued resource */
>+	addr = max_t(unsigned long, mm->lib_mmap_base, mmap_min_addr);
>+
>+	for (vma = find_vma(mm, addr); ; vma = vma->vm_next) {
>+		/* At this point:  (!vma || addr < vma->vm_end). */
>+		if (TASK_SIZE - len < addr)
>+			return -ENOMEM;
>+
>+		/* We don't want to touch brk of not DYNAMIC elf binaries */
>+		if (mm->brk && addr > mm->brk)
>+			goto failed;
>+
>+		if (!vma || addr + len <= vma->vm_start)
>+			return addr;
>+
>+		addr = vma->vm_end;
>+		/* If 0x01000000 is touched, the algo gives up */
>+		if (addr >= 0x01000000)
>+			goto failed;
>+	}
>+
>+failed:
>+	return current->mm->get_unmapped_area(filp, addr0, len, pgoff,
>flags);
>+}
>+
> void arch_unmap_area_topdown(struct mm_struct *mm, unsigned long addr)
> {
> 	/*
>@@ -1541,9 +1586,9 @@ void arch_unmap_area_topdown(struct mm_struct
>*mm, unsigned long addr)
> 		mm->free_area_cache = mm->mmap_base;
> }
> 
>-unsigned long
>-get_unmapped_area(struct file *file, unsigned long addr, unsigned long
>len,
>-		unsigned long pgoff, unsigned long flags)
>+static unsigned long
>+get_unmapped_area_prot(struct file *file, unsigned long addr, unsigned
>long len,
>+		unsigned long pgoff, unsigned long flags, bool exec)
> {
> 	unsigned long (*get_area)(struct file *, unsigned long,
> 				  unsigned long, unsigned long, unsigned long);
>@@ -1556,7 +1601,11 @@ get_unmapped_area(struct file *file, unsigned
>long addr, unsigned long len,
> 	if (len > TASK_SIZE)
> 		return -ENOMEM;
> 
>-	get_area = current->mm->get_unmapped_area;
>+	if (exec && current->mm->get_unmapped_exec_area)
>+		get_area = current->mm->get_unmapped_exec_area;
>+	else
>+		get_area = current->mm->get_unmapped_area;
>+
> 	if (file && file->f_op && file->f_op->get_unmapped_area)
> 		get_area = file->f_op->get_unmapped_area;
> 	addr = get_area(file, addr, len, pgoff, flags);
>@@ -1571,6 +1620,13 @@ get_unmapped_area(struct file *file, unsigned
>long addr, unsigned long len,
> 	return arch_rebalance_pgtables(addr, len);
> }
> 
>+unsigned long
>+get_unmapped_area(struct file *file, unsigned long addr, unsigned long
>len,
>+		unsigned long pgoff, unsigned long flags)
>+{
>+	return get_unmapped_area_prot(file, addr, len, pgoff, flags, false);
>+}
>+
> EXPORT_SYMBOL(get_unmapped_area);
> 
>/* Look up the first VMA which satisfies  addr < vm_end,  NULL if none.
>*/
>---

This also greatly reduces the address space available for randomization, and may get in the way of the default brk.  Is this a net win or lose?  Also, this zero byte is going to be at the last address, which means it might not help.  How about addresses of the form 0xAA00B000 instead?  The last bits are always 000 for a page address, of course...
-- 
Sent from my mobile phone. Please excuse my brevity and lack of formatting.

* Re: [RFC] x86, mm: start mmap allocation for libs from low addresses
  2011-08-12 23:19   ` H. Peter Anvin
  (?)
@ 2011-08-13  6:26     ` Vasiliy Kulikov
  -1 siblings, 0 replies; 47+ messages in thread
From: Vasiliy Kulikov @ 2011-08-13  6:26 UTC (permalink / raw)
  To: H. Peter Anvin
  Cc: Thomas Gleixner, Ingo Molnar, kernel-hardening, Peter Zijlstra,
	Andrew Morton, x86, linux-kernel, linux-mm

On Fri, Aug 12, 2011 at 18:19 -0500, H. Peter Anvin wrote:
> This also greatly reduces the address space available for randomization,
> and may get in the way of the default brk.  Is this a net win or lose?

If the executable image is not randomized and is located outside the
ASCII-armor region, then yes, such allocation doesn't help much.

>  Also, this zero byte is going to be at the last address, which means it might not help.  How about addresses of the form 0xAA00B000 instead?  The last bits are always 000 for a page address, of course...

It leaves only 64 KB of the library protected, which is useless for most
programs.

Thanks,

-- 
Vasiliy Kulikov
http://www.openwall.com - bringing security into open computing environments

^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [RFC] x86, mm: start mmap allocation for libs from low addresses
  2011-08-12 23:19   ` H. Peter Anvin
  (?)
@ 2011-08-16  9:05     ` Vasiliy Kulikov
  -1 siblings, 0 replies; 47+ messages in thread
From: Vasiliy Kulikov @ 2011-08-16  9:05 UTC (permalink / raw)
  To: H. Peter Anvin
  Cc: Thomas Gleixner, Ingo Molnar, kernel-hardening, Peter Zijlstra,
	Andrew Morton, x86, linux-kernel, linux-mm

On Fri, Aug 12, 2011 at 18:19 -0500, H. Peter Anvin wrote:
> Vasiliy Kulikov <segoon@openwall.com> wrote:
> 
> >This patch changes mmap base address allocator logic to incline to
> >allocate addresses from the first 16 Mbs of address space.  These
> >addresses start from zero byte (0x00AABBCC).
...

To make it clear:

The VM space is not significantly reduced - the additional gap, which
is used for the ASCII-protected region, is calculated the same way as
the common mmap gap.  The maximum size of the gap is 1 MB for the
upstream kernel's default ASLR entropy - a trifle IMO.
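
(For reference, the arithmetic behind that figure, assuming the stock
8 bits of mmap randomization for 32-bit tasks: mmap_rnd() yields at most
(2^8 - 1) pages, i.e. 255 * 4096 bytes - just under 1 MB.)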

If the new allocator fails to find an appropriate vma in the protected
zone, the old one tries to do the job.  So, there are no visible changes
for userspace.


As to the benefit:

1) For small PIE programs, which don't use many libraries, all
executable regions are moved to the protected zone.

2) For non-PIE programs whose image starts at a 0x00AABBCC-style address
and fits into the zone, the same rule as for small libraries applies.

3) For non-PIE programs with images above 0x01000000 and/or programs
with many libraries, some code sections are outside of the protected
region.

The protection works for programs of classes (1) and (2).  It doesn't
work for (3).


Case (1) is not too rare.  Programs which need such protection (network
daemons, programs parsing untrusted input, etc.) are usually small
enough.  In our distro, Openwall GNU/*/Linux, almost all daemon programs
fit into the region.
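
(For illustration only, not part of the patch: a quick way to see which
executable mappings of a running process fall outside the armored range
is to scan /proc/self/maps.  A minimal sketch in C, assuming the
0x01000000 boundary used by this patch:)

#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/proc/self/maps", "r");
	char line[512], perms[5];
	unsigned long start, end;

	if (!f)
		return 1;
	while (fgets(line, sizeof(line), f)) {
		if (sscanf(line, "%lx-%lx %4s", &start, &end, perms) != 3)
			continue;
		/* executable mapping above the ASCII-armored range? */
		if (perms[2] == 'x' && start >= 0x01000000UL)
			printf("outside armor: %s", line);
	}
	fclose(f);
	return 0;
}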

As the changes are not intrusive, we'd want to see this feature in the
upstream kernel.  If you know why the patch cannot be a part of the
upstream kernel - please tell me, I'll try to address the issues.

Thanks,

-- 
Vasiliy Kulikov
http://www.openwall.com - bringing security into open computing environments

^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [RFC] x86, mm: start mmap allocation for libs from low addresses
  2011-08-16  9:05     ` Vasiliy Kulikov
  (?)
@ 2011-08-22 10:17       ` Vasiliy Kulikov
  -1 siblings, 0 replies; 47+ messages in thread
From: Vasiliy Kulikov @ 2011-08-22 10:17 UTC (permalink / raw)
  To: H. Peter Anvin
  Cc: Thomas Gleixner, Ingo Molnar, kernel-hardening, Peter Zijlstra,
	Andrew Morton, x86, linux-kernel, linux-mm

Hi Ingo, Peter, Thomas,

On Tue, Aug 16, 2011 at 13:05 +0400, Vasiliy Kulikov wrote:
> As the changes are not intrusive, we'd want to see this feature in the
> upstream kernel.  If you know why the patch cannot be a part of the
> upstream kernel - please tell me, I'll try to address the issues.

Any comments on the RFC?  Otherwise, may I resend it as a PATCH for
inclusion?

Thanks!

-- 
Vasiliy Kulikov
http://www.openwall.com - bringing security into open computing environments

^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [RFC] x86, mm: start mmap allocation for libs from low addresses
  2011-08-22 10:17       ` Vasiliy Kulikov
  (?)
@ 2011-08-22 17:24         ` H. Peter Anvin
  -1 siblings, 0 replies; 47+ messages in thread
From: H. Peter Anvin @ 2011-08-22 17:24 UTC (permalink / raw)
  To: Vasiliy Kulikov
  Cc: Thomas Gleixner, Ingo Molnar, kernel-hardening, Peter Zijlstra,
	Andrew Morton, x86, linux-kernel, linux-mm

On 08/22/2011 03:17 AM, Vasiliy Kulikov wrote:
> Hi Ingo, Peter, Thomas,
> 
> On Tue, Aug 16, 2011 at 13:05 +0400, Vasiliy Kulikov wrote:
>> As the changes are not intrusive, we'd want to see this feature in the
>> upstream kernel.  If you know why the patch cannot be a part of the
>> upstream kernel - please tell me, I'll try to address the issues.
> 
> Any comments on the RFC?  Otherwise, may I resend it as a PATCH for
> inclusion?
> 
> Thanks!

Conceptually:

I also have to admit to being somewhat skeptical of the concept on a
little-endian architecture like x86.

Code-wise:

The code is horrific; it is full of open-coded magic numbers; it also
puts a function called arch_get_unmapped_exec_area() in a generic file,
which could best be described as "WTF" -- the arch_ prefix we use
specifically to denote a per-architecture hook function.

As such, your claim that the changes are not intrusive is plain false.

	-hpa

^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [RFC] x86, mm: start mmap allocation for libs from low addresses
  2011-08-22 17:24         ` H. Peter Anvin
  (?)
@ 2011-08-22 20:14           ` Vasiliy Kulikov
  -1 siblings, 0 replies; 47+ messages in thread
From: Vasiliy Kulikov @ 2011-08-22 20:14 UTC (permalink / raw)
  To: H. Peter Anvin
  Cc: Thomas Gleixner, Ingo Molnar, kernel-hardening, Peter Zijlstra,
	Andrew Morton, x86, linux-kernel, linux-mm

On Mon, Aug 22, 2011 at 10:24 -0700, H. Peter Anvin wrote:
> Conceptually:
> 
> I also have to admit to being somewhat skeptical of the concept on a
> little-endian architecture like x86.

Sorry, I was too brief in my statement.  This is a quote from Solar
Designer:

"Requiring NUL as the most significant byte of a 32-bit address achieves
two things:

1. The overflow length has to be inferred/guessed exactly, because only
one NUL may be written.  Simply using a repeated pattern (with function
address and arguments) no longer works.

2. Passing function arguments in the straightforward manner no longer
works, because copying stops after the NUL.  The attacker's best bet may
be to find an entry point not at function boundary that sets registers
and then proceeds with or branches to the desired library code.  The
easiest way to set registers and branch would be a function epilogue -
pop/pop/.../ret - but then there's the difficulty in passing the address
to ret to (we have just one NUL and we've already used it to get to this
code).  Similarly, even via such pop's we can't pass an argument that
contains a NUL in it - e.g., the address of "/bin/sh" in libc (it
contains a NUL most significant byte too) or a zero value for root's
uid.  A possible bypass is via multiple overflows - if the overflow may
be triggered more than once before the vulnerable function returns, then
multiple NULs may be written, exactly one per overflow.  But this is
hopefully relatively rare."

I'll extend the patch description to explain the motivation more
clearly.
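
(To illustrate the targeted bug class - a contrived sketch, not code
from the patch.  With a library mapped at an ASCII-armored address, the
fake return address the attacker has to write ends in a zero byte in
memory, so a string copy stops there and nothing after the return
address - arguments, a second gadget address - can be supplied in the
same write:)

#include <string.h>

/* Hypothetical vulnerable function.  The overflowing strcpy() can reach
 * the saved return address, but the copy terminates at the first NUL
 * byte - exactly the byte an ASCII-armored library address contains. */
void vulnerable(const char *input)
{
	char buf[64];

	strcpy(buf, input);
}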


> Code-wise:
> 
> The code is horrific; it is full of open-coded magic numbers;

Agreed, the magic needs macro definition and comments.

> it also
> puts a function called arch_get_unmapped_exec_area() in a generic file,
> which could best be described as "WTF" -- the arch_ prefix we use
> specifically to denote a per-architecture hook function.

Agreed.  But I'd want to leave it in mm/mmap.c as it's likely to be used
by other archs - the changes are bitness-specific, not arch-specific.  Is
it OK if I do this?

#ifndef HAVE_ARCH_UNMAPPED_EXEC_AREA
void *arch_get_unmapped_exec_area(...)
{
    ...
}
#endif


Thank you for the review!

-- 
Vasiliy Kulikov
http://www.openwall.com - bringing security into open computing environments

^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [RFC] x86, mm: start mmap allocation for libs from low addresses
  2011-08-22 20:14           ` Vasiliy Kulikov
  (?)
@ 2011-08-22 20:17             ` H. Peter Anvin
  -1 siblings, 0 replies; 47+ messages in thread
From: H. Peter Anvin @ 2011-08-22 20:17 UTC (permalink / raw)
  To: Vasiliy Kulikov
  Cc: Thomas Gleixner, Ingo Molnar, kernel-hardening, Peter Zijlstra,
	Andrew Morton, x86, linux-kernel, linux-mm

On 08/22/2011 01:14 PM, Vasiliy Kulikov wrote:
> 
>> Code-wise:
>>
>> The code is horrific; it is full of open-coded magic numbers;
> 
> Agreed, the magic needs macro definition and comments.
> 
>> it also
>> puts a function called arch_get_unmapped_exec_area() in a generic file,
>> which could best be described as "WTF" -- the arch_ prefix we use
>> specifically to denote a per-architecture hook function.
> 
> Agreed.  But I'd want to leave it in mm/mmap.c as it's likely to be used
> by other archs - the changes are bitness-specific, not arch-specific.  Is
> it OK if I do this?
> 
> #ifndef HAVE_ARCH_UNMAPPED_EXEC_AREA
> void *arch_get_unmapped_exec_area(...)
> {
>     ...
> }
> #endif
> 

Only if this is really an architecture-specific function overridden in
specific architectures.  I'm not so sure that applies here.
Furthermore, I'm not even all that sure what this function *does*.

	-hpa


^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [RFC] x86, mm: start mmap allocation for libs from low addresses
  2011-08-22 20:17             ` H. Peter Anvin
  (?)
@ 2011-08-23  6:41               ` Vasiliy Kulikov
  -1 siblings, 0 replies; 47+ messages in thread
From: Vasiliy Kulikov @ 2011-08-23  6:41 UTC (permalink / raw)
  To: H. Peter Anvin
  Cc: Thomas Gleixner, Ingo Molnar, kernel-hardening, Peter Zijlstra,
	Andrew Morton, x86, linux-kernel, linux-mm

On Mon, Aug 22, 2011 at 13:17 -0700, H. Peter Anvin wrote:
> On 08/22/2011 01:14 PM, Vasiliy Kulikov wrote:
> > 
> >> Code-wise:
> >>
> >> The code is horrific; it is full of open-coded magic numbers;
> > 
> > Agreed, the magic needs macro definition and comments.
> > 
> >> it also
> >> puts a function called arch_get_unmapped_exec_area() in a generic file,
> >> which could best be described as "WTF" -- the arch_ prefix we use
> >> specifically to denote a per-architecture hook function.
> > 
> > Agreed.  But I'd want to leave it in mm/mmap.c as it's likely to be used
> > by other archs - the changes are bitness-specific, not arch-specific.  Is
> > it OK if I do this?
> > 
> > #ifndef HAVE_ARCH_UNMAPPED_EXEC_AREA
> > void *arch_get_unmapped_exec_area(...)
> > {
> >     ...
> > }
> > #endif
> > 
> 
> Only if this is really an architecture-specific function overridden in
> specific architectures.  I'm not so sure that applies here.

It is a more or less generic allocator.  Arch-specific constants will be
moved to arch headers, so it will be a 32-bit-specific function, not an
arch-specific one (64-bit architectures don't need the ASCII shield at
all, as mmap addresses already contain a zero byte).  It will not be
overridden by x86, as it is generic enough for x86.

I've defined it as arch_* by looking at the other allocator
implementations.  All of them are arch_* and are located in mm/mmap.c,
with the ability to override them in architecture-specific files.
Probably nobody will override it, but I tried to make it consistent with
the existing code.  If this HAVE_ARCH_*/arch_* logic is not suitable for
exec_area, I'll remove the arch_ prefix.
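
(For reference, a rough sketch of how that HAVE_ARCH_* convention would
apply here; which arch header the macro would live in is only an
assumption, not something from the patch:)

/* mm/mmap.c: generic definition, compiled unless an architecture
 * provides its own replacement. */
#ifndef HAVE_ARCH_UNMAPPED_EXEC_AREA
unsigned long
get_unmapped_exec_area(struct file *filp, unsigned long addr,
		unsigned long len, unsigned long pgoff, unsigned long flags)
{
	...
}
#endif

/* arch/<arch>/include/asm/pgtable.h (or a similar arch header): an
 * architecture wanting its own allocator defines the macro and supplies
 * its own get_unmapped_exec_area(). */
#define HAVE_ARCH_UNMAPPED_EXEC_AREA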


> Furthermore, I'm not even all that sure what this function *does*.

This is a bottom-up allocator which tries to reuse all holes in the
ASCII-protected region.  It differs from arch_get_unmapped_area() in the
priority given to the first 16 MB - arch_get_unmapped_area() walks
through vmas in the whole VM space, while arch_get_unmapped_exec_area()
tries to reuse all memory from the first 16 MB and only then allocates
arbitrary addresses by falling back to the default allocator (top-down
in the case of x86).

I'll add the comment for the allocator.

Thank you,

-- 
Vasiliy Kulikov
http://www.openwall.com - bringing security into open computing environments

^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [kernel-hardening] [RFC] x86, mm: start mmap allocation for libs from low addresses
  2011-08-12 10:58 ` [kernel-hardening] " Solar Designer
  2011-08-12 11:05   ` Vasiliy Kulikov
@ 2011-08-25 17:19   ` Vasiliy Kulikov
  2011-08-25 17:39     ` Solar Designer
                       ` (2 more replies)
  1 sibling, 3 replies; 47+ messages in thread
From: Vasiliy Kulikov @ 2011-08-25 17:19 UTC (permalink / raw)
  To: kernel-hardening

Solar,

I hope I've addressed Peter's complaints in the following patch.
However, it would be great if you could (re)check the motivation and the
comments.  I tried not to make previous mistakes and worded the
statements more carefully, but an additional review won't hurt :)

Thanks!

------------------------------
From: Vasiliy Kulikov <segoon@openwall.com>

This patch changes mmap base address allocator logic to incline to
allocate addresses for executable pages from the first 16 Mbs of address
space.  These addresses start from zero byte (0x00AABBCC).  Using such
addresses breaks ret2libc exploits abusing string buffer overflows (or
does it much harder).

As x86 architecture is little-endian, this zero byte is the last byte of
the address.  So it's possible to e.g. overwrite a return address on the
stack with the mailformed address.  However, now it's impossible to
additionally overwrite function arguments, which are located after the
function address on the stack.  The attacker's best bet may be to find
an entry point not at function boundary that sets registers and then
proceeds with or branches to the desired library code.  The easiest way
to set registers and branch would be a function epilogue -
pop/pop/.../ret - but then there's the difficulty in passing the address
to ret to (we have just one NUL and we've already used it to get to this
code).  Similarly, even via such pop's we can't pass an argument that
contains a NUL in it - e.g., the address of "/bin/sh" in libc (it
contains a NUL most significant byte too) or a zero value for root's
uid.  A possible bypass is via multiple overflows - if the overflow may
be triggered more than once before the vulnerable function returns, then
multiple NULs may be written, exactly one per overflow.  But this is
hopefully relatively rare.

To fully utilize the protection, the executable image should be
randomized (sysctl kernel.randomize_va_space > 0 and the executable is
compiled as PIE) and the sum of libraries sizes plus executable size
shouldn't exceed 16 Mb.  In this case the only pages out of
ASCII-protected range are VDSO and vsyscall.  However, they don't
provide enough material for obtaining arbitrary code execution and are
not dangerous without using other executable pages.

The logic is applied to x86 32 bit tasks, both for 32 bit kernels and
for 32 bit tasks running on 64 bit kernels.  64 bit tasks already have
zero bytes in addresses of library functions.  Other architectures
(non-x86) may reuse the logic too.

Without the patch:

$ ldd /bin/ls
	linux-gate.so.1 =>  (0xf779c000)
        librt.so.1 => /lib/librt.so.1 (0xb7fcf000)
        libtermcap.so.2 => /lib/libtermcap.so.2 (0xb7fca000)
        libc.so.6 => /lib/libc.so.6 (0xb7eae000)
        libpthread.so.0 => /lib/libpthread.so.0 (0xb7e5b000)
        /lib/ld-linux.so.2 (0xb7fe6000)

With the patch:

$ ldd /bin/ls
	linux-gate.so.1 =>  (0xf772a000)
	librt.so.1 => /lib/librt.so.1 (0x0004a000)
	libtermcap.so.2 => /lib/libtermcap.so.2 (0x0005e000)
	libc.so.6 => /lib/libc.so.6 (0x00062000)
	libpthread.so.0 => /lib/libpthread.so.0 (0x00183000)
	/lib/ld-linux.so.2 (0x00121000)


If CONFIG_VM86=y, the first megabyte is excluded from the potential
range for mmap allocations as it might be used by vm86 code.  If
CONFIG_VM86=n, the allocation begins from the mmap_min_addr.  Regardless
of CONFIG_VM86 the base address is randomized with the same entropy size
as mm->mmap_base.

If 16 Mbs are over, we fallback to the old allocation algorithm.
But, hopefully, programs which need such protection (network daemons,
programs working with untrusted data, etc.) are small enough to utilize
the protection.

The same logic was used in -ow patch for 2.0-2.4 kernels and in
exec-shield for 2.6.x kernels.  Code parts were taken from exec-shield
from RHEL6.

v2 - Added comments, adjusted patch description.
   - s/arch_get_unmapped_exec_area/get_unmapped_exec_area/
   - Don't reserve first Mb if CONFIG_VM86=n.

Signed-off-by: Vasiliy Kulikov <segoon@openwall.com>
--
 arch/x86/mm/mmap.c       |   20 +++++++++++
 include/linux/mm_types.h |    4 ++
 include/linux/sched.h    |    3 ++
 mm/mmap.c                |   82 +++++++++++++++++++++++++++++++++++++++++++---
 4 files changed, 104 insertions(+), 5 deletions(-)

diff --git a/arch/x86/mm/mmap.c b/arch/x86/mm/mmap.c
index 1dab519..4e7a783 100644
--- a/arch/x86/mm/mmap.c
+++ b/arch/x86/mm/mmap.c
@@ -118,6 +118,22 @@ static unsigned long mmap_legacy_base(void)
 		return TASK_UNMAPPED_BASE + mmap_rnd();
 }
 
+#ifdef CONFIG_VM86
+/*
+ * Don't touch any memory that can be used by vm86 apps.
+ * Reserve the first Mb + 64 kb of guard pages.
+ */
+#define ASCII_ARMOR_MIN_ADDR (0x00100000 + 0x00010000)
+#else
+/* No special users of low addresses.  Start just after mmap_min_addr. */
+#define ASCII_ARMOR_MIN_ADDR 0
+#endif
+
+static unsigned long mmap_lib_base(void)
+{
+	return ASCII_ARMOR_MIN_ADDR + mmap_rnd();
+}
+
 /*
  * This function, called very early during the creation of a new
  * process VM image, sets up which VM layout function to use:
@@ -131,6 +147,10 @@ void arch_pick_mmap_layout(struct mm_struct *mm)
 	} else {
 		mm->mmap_base = mmap_base();
 		mm->get_unmapped_area = arch_get_unmapped_area_topdown;
+		if (mmap_is_ia32()) {
+			mm->get_unmapped_exec_area = get_unmapped_exec_area;
+			mm->lib_mmap_base = mmap_lib_base();
+		}
 		mm->unmap_area = arch_unmap_area_topdown;
 	}
 }
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 027935c..68fc216 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -225,9 +225,13 @@ struct mm_struct {
 	unsigned long (*get_unmapped_area) (struct file *filp,
 				unsigned long addr, unsigned long len,
 				unsigned long pgoff, unsigned long flags);
+	unsigned long (*get_unmapped_exec_area) (struct file *filp,
+				unsigned long addr, unsigned long len,
+				unsigned long pgoff, unsigned long flags);
 	void (*unmap_area) (struct mm_struct *mm, unsigned long addr);
 #endif
 	unsigned long mmap_base;		/* base of mmap area */
+	unsigned long lib_mmap_base;		/* base of mmap libraries area (for get_unmapped_exec_area()) */
 	unsigned long task_size;		/* size of task vm space */
 	unsigned long cached_hole_size; 	/* if non-zero, the largest hole below free_area_cache */
 	unsigned long free_area_cache;		/* first hole of size cached_hole_size or larger */
diff --git a/include/linux/sched.h b/include/linux/sched.h
index f024c63..ef9024f 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -394,6 +394,9 @@ arch_get_unmapped_area_topdown(struct file *filp, unsigned long addr,
 			  unsigned long flags);
 extern void arch_unmap_area(struct mm_struct *, unsigned long);
 extern void arch_unmap_area_topdown(struct mm_struct *, unsigned long);
+extern unsigned long
+get_unmapped_exec_area(struct file *, unsigned long,
+		unsigned long, unsigned long, unsigned long);
 #else
 static inline void arch_pick_mmap_layout(struct mm_struct *mm) {}
 #endif
diff --git a/mm/mmap.c b/mm/mmap.c
index d49736f..2828322 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -50,6 +50,10 @@ static void unmap_region(struct mm_struct *mm,
 		struct vm_area_struct *vma, struct vm_area_struct *prev,
 		unsigned long start, unsigned long end);
 
+static unsigned long
+get_unmapped_area_prot(struct file *file, unsigned long addr, unsigned long len,
+		unsigned long pgoff, unsigned long flags, bool exec);
+
 /*
  * WARNING: the debugging will use recursive algorithms so never enable this
  * unless you know what you are doing.
@@ -989,7 +993,8 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
 	/* Obtain the address to map to. we verify (or select) it and ensure
 	 * that it represents a valid section of the address space.
 	 */
-	addr = get_unmapped_area(file, addr, len, pgoff, flags);
+	addr = get_unmapped_area_prot(file, addr, len, pgoff, flags,
+			prot & PROT_EXEC);
 	if (addr & ~PAGE_MASK)
 		return addr;
 
@@ -1528,6 +1533,62 @@ bottomup:
 }
 #endif
 
+/* Addresses before this value contain at least one zero byte. */
+#define ASCII_ARMOR_MAX_ADDR 0x01000000
+
+/*
+ * This function finds the first unmapped region inside of
+ * [mm->lib_mmap_base; ASCII_ARMOR_MAX_ADDR) region.  Addresses from this
+ * region contain at least one zero byte, which greatly complicates
+ * exploitation of C string buffer overflows (C strings cannot contain zero
+ * byte inside) in return to libc class of attacks.
+ *
+ * This allocator is bottom up allocator like arch_get_unmapped_area(), but
+ * it differs from the latter.  get_unmapped_exec_area() does its best to
+ * allocate as low address as possible.
+ */
+unsigned long
+get_unmapped_exec_area(struct file *filp, unsigned long addr0,
+		unsigned long len, unsigned long pgoff, unsigned long flags)
+{
+	unsigned long addr = addr0;
+	struct mm_struct *mm = current->mm;
+	struct vm_area_struct *vma;
+
+	if (len > TASK_SIZE)
+		return -ENOMEM;
+
+	if (flags & MAP_FIXED)
+		return addr;
+
+	/* We ALWAYS start from the beginning as base addresses
+	 * with zero high bits is a valued resource */
+	addr = max_t(unsigned long, mm->lib_mmap_base, mmap_min_addr);
+
+	for (vma = find_vma(mm, addr); ; vma = vma->vm_next) {
+		/* At this point:  (!vma || addr < vma->vm_end). */
+		if (TASK_SIZE - len < addr)
+			return -ENOMEM;
+
+		/* We don't want to reduce brk area of not DYNAMIC elf binaries
+		 * with sysctl kernel.randomize_va_space < 2. */
+		if (mm->brk && addr > mm->brk)
+			goto failed;
+
+		if (!vma || addr + len <= vma->vm_start)
+			return addr;
+
+		addr = vma->vm_end;
+
+		/* If ACSII-armor area is over, the algo gives up */
+		if (addr >= ASCII_ARMOR_MAX_ADDR)
+			goto failed;
+	}
+
+failed:
+	return current->mm->get_unmapped_area(filp, addr0, len, pgoff, flags);
+}
+
 void arch_unmap_area_topdown(struct mm_struct *mm, unsigned long addr)
 {
 	/*
@@ -1541,9 +1602,9 @@ void arch_unmap_area_topdown(struct mm_struct *mm, unsigned long addr)
 		mm->free_area_cache = mm->mmap_base;
 }
 
-unsigned long
-get_unmapped_area(struct file *file, unsigned long addr, unsigned long len,
-		unsigned long pgoff, unsigned long flags)
+static unsigned long
+get_unmapped_area_prot(struct file *file, unsigned long addr, unsigned long len,
+		unsigned long pgoff, unsigned long flags, bool exec)
 {
 	unsigned long (*get_area)(struct file *, unsigned long,
 				  unsigned long, unsigned long, unsigned long);
@@ -1556,7 +1617,11 @@ get_unmapped_area(struct file *file, unsigned long addr, unsigned long len,
 	if (len > TASK_SIZE)
 		return -ENOMEM;
 
-	get_area = current->mm->get_unmapped_area;
+	if (exec && current->mm->get_unmapped_exec_area)
+		get_area = current->mm->get_unmapped_exec_area;
+	else
+		get_area = current->mm->get_unmapped_area;
+
 	if (file && file->f_op && file->f_op->get_unmapped_area)
 		get_area = file->f_op->get_unmapped_area;
 	addr = get_area(file, addr, len, pgoff, flags);
@@ -1571,6 +1636,13 @@ get_unmapped_area(struct file *file, unsigned long addr, unsigned long len,
 	return arch_rebalance_pgtables(addr, len);
 }
 
+unsigned long
+get_unmapped_area(struct file *file, unsigned long addr, unsigned long len,
+		unsigned long pgoff, unsigned long flags)
+{
+	return get_unmapped_area_prot(file, addr, len, pgoff, flags, false);
+}
+
 EXPORT_SYMBOL(get_unmapped_area);
 
 /* Look up the first VMA which satisfies  addr < vm_end,  NULL if none. */
-- 
Vasiliy

^ permalink raw reply related	[flat|nested] 47+ messages in thread

* Re: [kernel-hardening] [RFC] x86, mm: start mmap allocation for libs from low addresses
  2011-08-25 17:19   ` Vasiliy Kulikov
@ 2011-08-25 17:39     ` Solar Designer
  2011-09-02 18:29     ` Solar Designer
  2011-09-02 23:34     ` Solar Designer
  2 siblings, 0 replies; 47+ messages in thread
From: Solar Designer @ 2011-08-25 17:39 UTC (permalink / raw)
  To: kernel-hardening

Vasiliy,

On Thu, Aug 25, 2011 at 09:19:34PM +0400, Vasiliy Kulikov wrote:
> I hope I've addressed Peter's complaints in the following patch.
> However, it would be great if you could (re)check the motivation and the
> comments.  I tried not to make previous mistakes and worded the
> statements more carefully, but an additional review won't hurt :)

Yes, I will have comments and will propose changes.  Unfortunately, I
don't have time right now.  Please don't post this to LKML yet - please
wait for me.

Thanks,

Alexander

^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [kernel-hardening] [RFC] x86, mm: start mmap allocation for libs from low addresses
  2011-08-25 17:19   ` Vasiliy Kulikov
  2011-08-25 17:39     ` Solar Designer
@ 2011-09-02 18:29     ` Solar Designer
  2011-09-03 11:18       ` Vasiliy Kulikov
  2011-09-02 23:34     ` Solar Designer
  2 siblings, 1 reply; 47+ messages in thread
From: Solar Designer @ 2011-09-02 18:29 UTC (permalink / raw)
  To: kernel-hardening

Vasiliy,

Some nitpicking on the patch description:

On Thu, Aug 25, 2011 at 09:19:34PM +0400, Vasiliy Kulikov wrote:
> This patch changes mmap base address allocator logic to incline to
> allocate addresses for executable pages from the first 16 Mbs of address

s/Mbs/MiB/ (or just MB, whereas Mb is sometimes used to denote megabits)

> space.  These addresses start from zero byte (0x00AABBCC).  Using such
> addresses breaks ret2libc exploits abusing string buffer overflows (or
> does it much harder).

s,does it much harder,makes such attacks harder and/or less reliable,

> As x86 architecture is little-endian, this zero byte is the last byte of
> the address.  So it's possible to e.g. overwrite a return address on the
> stack with the mailformed address.  However, now it's impossible to

s/mailformed/malformed/

> additionally overwrite function arguments, which are located after the
> function address on the stack.  The attacker's best bet may be to find
> an entry point not at function boundary that sets registers and then
> proceeds with or branches to the desired library code.  The easiest way
> to set registers and branch would be a function epilogue -
> pop/pop/.../ret - but then there's the difficulty in passing the address
> to ret to (we have just one NUL and we've already used it to get to this
> code).  Similarly, even via such pop's we can't pass an argument that
> contains a NUL in it - e.g., the address of "/bin/sh" in libc (it
> contains a NUL most significant byte too) or a zero value for root's
> uid.

The above was partially flawed logic on my part - as written above
(without further detail), the pop/pop/.../ret thing doesn't apply
because those pop's would read stack right after the just-used return
address - that is, the same stack locations that we presumably could not
write to in order to pass the arguments in a more straightforward
fashion.  So this trick would be of no help, and thus its other
limitations would be of no relevance.

However, in practice similar approaches involving function epilogues
could be of relevance.  For example, a certain build of glibc has this
instruction sequence:

mov    0xfffffff0(%ebp),%eax
lea    0xfffffff4(%ebp),%esp
pop    %ebx
pop    %esi
pop    %edi
pop    %ebp
ret

So the reads may actually be from different stack locations - perhaps on
the calling function's stack frame (since our vulnerable function likely
similarly restored %ebp prior to returning).

In practice, an exploit could try to pass these values via inputs other
than the string used to cause the overflow, if such user inputs are
available in a given program and attack vector.  These other inputs may
or may not make it difficult to pass NULs, and the number of NULs that
may be passed will vary.

Trying to explain it in detail is beyond scope of the patch description,
and we might get it partially wrong again.

I propose that we keep this portion almost as-is:

> additionally overwrite function arguments, which are located after the
> function address on the stack.  The attacker's best bet may be to find
> an entry point not at function boundary that sets registers and then
> proceeds with or branches to the desired library code.  The easiest way

with only one change - s/overwrite/provide/

However, we just end the sentence here:

"to set registers and branch would be a function epilogue."

Instead of this piece:

> pop/pop/.../ret - but then there's the difficulty in passing the address
> to ret to (we have just one NUL and we've already used it to get to this
> code).  Similarly, even via such pop's we can't pass an argument that
> contains a NUL in it - e.g., the address of "/bin/sh" in libc (it
> contains a NUL most significant byte too) or a zero value for root's
> uid.

We may write:

"Then it may be similarly difficult to reliably pass register values and
a further address to branch to, because the desired values for these
will also tend to contain NULs - e.g., the address of "/bin/sh" in libc
or a zero value for root's uid."
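
As an aside (not text for the description): the property itself is easy
to see with a made-up userland test like the one below (untested as
written, assumes an i386 build running with the patch):

#include <stdio.h>
#include <stdlib.h>

int main(void)
{
	unsigned long p = (unsigned long)system;

	/* With libc mapped below 0x01000000, the most significant byte of
	 * p is 0, so this address cannot appear inside an overflowing C
	 * string. */
	printf("system() is at %p, top byte 0x%02lx\n", (void *)p, p >> 24);
	return 0;
}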

> A possible bypass is via multiple overflows - if the overflow may
> be triggered more than once before the vulnerable function returns, then
> multiple NULs may be written, exactly one per overflow.  But this is
> hopefully relatively rare.

This is OK to keep.
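
For context, the "exactly one per overflow" part follows directly from
how C string copies terminate.  A minimal made-up example (illustrative
only, not from any real code):

#include <string.h>

static void vulnerable(const char *input)
{
	char buf[64];

	/* strcpy() writes the attacker-supplied bytes plus exactly one
	 * terminating NUL, so a single overflow can place at most one zero
	 * byte past the buffer - at the very end of the overwrite. */
	strcpy(buf, input);
}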

> To fully utilize the protection, the executable image should be
> randomized (sysctl kernel.randomize_va_space > 0 and the executable is
> compiled as PIE) and the sum of libraries sizes plus executable size
> shouldn't exceed 16 Mb.

Same comment re: "Mb".

> In this case the only pages out of
> ASCII-protected range are VDSO and vsyscall.  However, they don't
> provide enough material for obtaining arbitrary code execution and are
> not dangerous without using other executable pages.

I did not research this.  Perhaps you're right.

I think there are also cases when those pages are not present - e.g.,
they don't appear to be present for 32-bit processes on Owl now, both
with 32-bit and 64-bit kernels (Owl default builds).
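
For whoever wants to check on their own system, a quick throwaway test
(untested as written) that just greps the current process's map:

#include <stdio.h>
#include <string.h>

int main(void)
{
	char line[256];
	FILE *f = fopen("/proc/self/maps", "r");

	if (!f)
		return 1;
	while (fgets(line, sizeof(line), f))
		if (strstr(line, "vdso") || strstr(line, "vsyscall"))
			fputs(line, stdout);
	fclose(f);
	return 0;
}

No output would mean no such pages in that process.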

> The logic is applied to x86 32 bit tasks, both for 32 bit kernels and
> for 32 bit tasks running on 64 bit kernels.  64 bit tasks already have
> zero bytes in addresses of library functions.  Other architectures
> (non-x86) may reuse the logic too.

OK.

> Without the patch:
> 
> $ ldd /bin/ls
> 	linux-gate.so.1 =>  (0xf779c000)
>         librt.so.1 => /lib/librt.so.1 (0xb7fcf000)
>         libtermcap.so.2 => /lib/libtermcap.so.2 (0xb7fca000)
>         libc.so.6 => /lib/libc.so.6 (0xb7eae000)
>         libpthread.so.0 => /lib/libpthread.so.0 (0xb7e5b000)
>         /lib/ld-linux.so.2 (0xb7fe6000)
> 
> With the patch:
> 
> $ ldd /bin/ls
> 	linux-gate.so.1 =>  (0xf772a000)
> 	librt.so.1 => /lib/librt.so.1 (0x0004a000)
> 	libtermcap.so.2 => /lib/libtermcap.so.2 (0x0005e000)
> 	libc.so.6 => /lib/libc.so.6 (0x00062000)
> 	libpthread.so.0 => /lib/libpthread.so.0 (0x00183000)
> 	/lib/ld-linux.so.2 (0x00121000)

Yes, we're getting output similar to the latter on Owl for i686 now
(32-bit RHEL5'ish kernel), but without linux-gate.  Ditto for -ow
patches for older kernels.

> If CONFIG_VM86=y, the first megabyte is excluded from the potential
> range for mmap allocations as it might be used by vm86 code.  If
> CONFIG_VM86=n, the allocation begins from the mmap_min_addr.  Regardless
> of CONFIG_VM86 the base address is randomized with the same entropy size
> as mm->mmap_base.

OK.  Shouldn't CONFIG_VM86 be a sysctl, though?

Does the kernel have CONFIG_VM86 already?  I thought we merely
discussed it, but you never submitted it to LKML?  If/when CONFIG_VM86
or an equivalent sysctl is added, it should also control the vm86(2) and
vm86old(2) syscalls.

> If 16 Mbs are over, we fallback to the old allocation algorithm.
> But, hopefully, programs which need such protection (network daemons,
> programs working with untrusted data, etc.) are small enough to utilize
> the protection.

OK.  Same comment about "Mbs".

> The same logic was used in -ow patch for 2.0-2.4 kernels and in
> exec-shield for 2.6.x kernels.  Code parts were taken from exec-shield
> from RHEL6.
> 
> v2 - Added comments, adjusted patch description.
>    - s/arch_get_unmapped_exec_area/get_unmapped_exec_area/
>    - Don't reserve first Mb if CONFIG_VM86=n.

s/first Mb/first 1 MiB + 64 KiB/

> Signed-off-by: Vasiliy Kulikov <segoon@openwall.com>
> --
>  arch/x86/mm/mmap.c       |   20 +++++++++++
>  include/linux/mm_types.h |    4 ++
>  include/linux/sched.h    |    3 ++
>  mm/mmap.c                |   82 +++++++++++++++++++++++++++++++++++++++++++---
>  4 files changed, 104 insertions(+), 5 deletions(-)
> 
> diff --git a/arch/x86/mm/mmap.c b/arch/x86/mm/mmap.c
> index 1dab519..4e7a783 100644
> --- a/arch/x86/mm/mmap.c
> +++ b/arch/x86/mm/mmap.c
> @@ -118,6 +118,22 @@ static unsigned long mmap_legacy_base(void)
>  		return TASK_UNMAPPED_BASE + mmap_rnd();
>  }
>  
> +#ifdef CONFIG_VM86
> +/*
> + * Don't touch any memory that can be used by vm86 apps.
> + * Reserve the first Mb + 64 kb of guard pages.
> + */
> +#define ASCII_ARMOR_MIN_ADDR (0x00100000 + 0x00010000)

The extra 64 KiB are not "guard pages".  The rationale is that these
are addresses available from VM86 mode, as well as from real mode with
line A20 enabled on 286+ machines (dosemu may run DOS drivers and
programs from that era).  This was a way to get slightly more of usable
memory under DOS - IIRC, I had something like 900+ KB total DOS memory,
736 KB free after full bootup on a 386 (with proper drivers and
configuration tweaks).  Not everyone was happy with a mere 640 KB, of
which only 550 KB or so would be free after DOS bootup.

http://en.wikipedia.org/wiki/Real_mode#Addressing_capacity

"So, the actual amount of memory addressable by the 80286 and later x86
CPUs in real mode is 1 MiB + 64 KiB - 16 B = 1114096 B."

Alexander

^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [kernel-hardening] [RFC] x86, mm: start mmap allocation for libs from low addresses
  2011-08-25 17:19   ` Vasiliy Kulikov
  2011-08-25 17:39     ` Solar Designer
  2011-09-02 18:29     ` Solar Designer
@ 2011-09-02 23:34     ` Solar Designer
  2011-09-03 12:12       ` Vasiliy Kulikov
  2 siblings, 1 reply; 47+ messages in thread
From: Solar Designer @ 2011-09-02 23:34 UTC (permalink / raw)
  To: kernel-hardening

Vasiliy,

Here's another set of comments from me (I ran out of time when writing
the previous message on this).

On Thu, Aug 25, 2011 at 09:19:34PM +0400, Vasiliy Kulikov wrote:
> +++ b/arch/x86/mm/mmap.c
> @@ -118,6 +118,22 @@ static unsigned long mmap_legacy_base(void)
[...]
> +static unsigned long mmap_lib_base(void)
> +{
> +	return ASCII_ARMOR_MIN_ADDR + mmap_rnd();
> +}

I was about to suggest that you declare this function inline, but then I
noticed that all other tiny and single-use functions in
arch/x86/mm/mmap.c are also non-inline.  Weird.  You might want to bring
this up on LKML separately.

> @@ -131,6 +147,10 @@ void arch_pick_mmap_layout(struct mm_struct *mm)
>  	} else {
>  		mm->mmap_base = mmap_base();
>  		mm->get_unmapped_area = arch_get_unmapped_area_topdown;
> +		if (mmap_is_ia32()) {
> +			mm->get_unmapped_exec_area = get_unmapped_exec_area;
> +			mm->lib_mmap_base = mmap_lib_base();
> +		}
>  		mm->unmap_area = arch_unmap_area_topdown;
>  	}

Are you sure it's safe to leave get_unmapped_exec_area and lib_mmap_base
uninitialized when !mmap_is_ia32()?  Is there implicit initialization to
NULL before this point maybe?  (I did not check.)

> +/* Addresses before this value contain at least one zero byte. */
> +#define ASCII_ARMOR_MAX_ADDR 0x01000000
> +
> +/*
> + * This function finds the first unmapped region inside of
> + * [mm->lib_mmap_base; ASCII_ARMOR_MAX_ADDR) region.  Addresses from this
> + * region contain at least one zero byte, which greatly complicates

Maybe s/greatly // because this word is subjective, and the actual
difficulty varies for different scenarios.

> + * exploitation of C string buffer overflows (C strings cannot contain zero
> + * byte inside) in return to libc class of attacks.
> + *
> + * This allocator is bottom up allocator like arch_get_unmapped_area(), but
> + * it differs from the latter.  get_unmapped_exec_area() does its best to
> + * allocate as low address as possible.
> + */
> +unsigned long
> +get_unmapped_exec_area(struct file *filp, unsigned long addr0,
> +		unsigned long len, unsigned long pgoff, unsigned long flags)
> +{
> +	unsigned long addr = addr0;
> +	struct mm_struct *mm = current->mm;
> +	struct vm_area_struct *vma;
> +
> +	if (len > TASK_SIZE)
> +		return -ENOMEM;
> +
> +	if (flags & MAP_FIXED)
> +		return addr;

In the MAP_FIXED case, don't we need to make sure that addr is actually
available - that is, do a "goto failed" here (fallback to old handling)?

And if so, don't we need to make it the very first check - before the
would-be-redundant len check?

> +	/* We ALWAYS start from the beginning as base addresses
> +	 * with zero high bits is a valued resource */
> +	addr = max_t(unsigned long, mm->lib_mmap_base, mmap_min_addr);
> +
> +	for (vma = find_vma(mm, addr); ; vma = vma->vm_next) {
> +		/* At this point:  (!vma || addr < vma->vm_end). */
> +		if (TASK_SIZE - len < addr)
> +			return -ENOMEM;

Since this check is primarily of addr, I'd write it as:

		if (addr > TASK_SIZE - len)
			return -ENOMEM;

Also, I hope the len == 0 && addr == TASK_SIZE case that this check
happens to permit is irrelevant here.  I naively hope (well, maybe not)
that addr has been checked for "< TASK_SIZE" at a higher level. ;-)
You may want to review the current mm code for this.

> +		/* We don't want to reduce brk area of not DYNAMIC elf binaries
> +		 * with sysctl kernel.randomize_va_space < 2. */
> +		if (mm->brk && addr > mm->brk)
> +			goto failed;

Does this check come from RHEL?  I don't fully understand it.  We also
check for "vma->vm_end >= ASCII_ARMOR_MAX_ADDR" below.  Does this imply
that we choose to handle the case of mm->brk being lower than
ASCII_ARMOR_MAX_ADDR here?  Is it a practically relevant case?  Or was
this check possibly less redundant on RHEL?

(If these questions are time-consuming to find answers to, feel free to
disregard them for now.)

> +		if (!vma || addr + len <= vma->vm_start)
> +			return addr;
> +
> +		addr = vma->vm_end;
> +
> +		/* If ACSII-armor area is over, the algo gives up */
> +		if (addr >= ASCII_ARMOR_MAX_ADDR)
> +			goto failed;
> +	}
> +
> +failed:
> +	return current->mm->get_unmapped_area(filp, addr0, len, pgoff, flags);
> +}

Now, the main question: what's wrong with doing something as simple as:

unsigned long
get_unmapped_exec_area(struct file *filp, unsigned long addr,
		unsigned long len, unsigned long pgoff, unsigned long flags)
{
	return current->mm->get_unmapped_area(filp, ((flags & MAP_FIXED) || len >= 0x00ef0000UL) ? addr : max_t(unsigned long, mm->lib_mmap_base, mmap_min_addr), len, pgoff, flags);
}

Of course, written in a cleaner and more generic fashion.
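
Roughly like this (untested sketch, just to illustrate what I mean):

unsigned long
get_unmapped_exec_area(struct file *filp, unsigned long addr,
		unsigned long len, unsigned long pgoff, unsigned long flags)
{
	struct mm_struct *mm = current->mm;

	/* leave MAP_FIXED and too-large requests alone */
	if (!(flags & MAP_FIXED) && len < 0x00ef0000UL)
		addr = max_t(unsigned long, mm->lib_mmap_base, mmap_min_addr);

	return mm->get_unmapped_area(filp, addr, len, pgoff, flags);
}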

This is similar to what I did in -ow patches, where I only needed to
adjust TASK_UNMAPPED_BASE turning it from a constant to a trivial macro
accepting the allocation size as input.

Doesn't current->mm->get_unmapped_area() contain a similar loop to find
the nearest hole to the supplied address?

OK, I guess this is an attempt to avoid the risk of mmap'ing something
right after the current end of the brk area, which would prevent its
further expansion, if we have lots of small executable mappings.  With
-ow patches, this risk is probably present (although I've never seen
this happen in practice).  Is that the only reason?

If so, would it possibly be cleaner (less invasive) to call
current->mm->get_unmapped_area() first (adjusting the hint like -ow
patches did), then see if the found address is suitable for use (below
brk)?  In the unlikely case when the returned address is not suitable
for use, drop the hint and call again.  OK, someone might not like the
maybe 2x slowdown in the unlikely case that the address is unsuitable.
But then, your code also involves walking a portion of the vma list
twice in case you fall back to the old handling.  So it may be the same
speed, but more functionality duplication.
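
In rough form (untested sketch; addr_ok() is just a placeholder for
whatever suitability test we'd settle on, e.g. the brk check):

unsigned long
get_unmapped_exec_area(struct file *filp, unsigned long addr,
		unsigned long len, unsigned long pgoff, unsigned long flags)
{
	struct mm_struct *mm = current->mm;
	unsigned long hint, found;

	if (flags & MAP_FIXED)
		return mm->get_unmapped_area(filp, addr, len, pgoff, flags);

	/* first try with the low hint, like -ow adjusted TASK_UNMAPPED_BASE */
	hint = max_t(unsigned long, mm->lib_mmap_base, mmap_min_addr);
	found = mm->get_unmapped_area(filp, hint, len, pgoff, flags);

	/* unlikely case: the result is unsuitable, retry with the original hint */
	if (!(found & ~PAGE_MASK) && !addr_ok(mm, found, len))
		found = mm->get_unmapped_area(filp, addr, len, pgoff, flags);

	return found;
}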

I am not exactly proposing this change.  Rather, I mention it as a
possibility for your consideration.

What do you think?

Alexander

^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [kernel-hardening] [RFC] x86, mm: start mmap allocation for libs from low addresses
  2011-09-02 18:29     ` Solar Designer
@ 2011-09-03 11:18       ` Vasiliy Kulikov
  2011-09-03 23:57         ` Solar Designer
  0 siblings, 1 reply; 47+ messages in thread
From: Vasiliy Kulikov @ 2011-09-03 11:18 UTC (permalink / raw)
  To: kernel-hardening

On Fri, Sep 02, 2011 at 22:29 +0400, Solar Designer wrote:
> On Thu, Aug 25, 2011 at 09:19:34PM +0400, Vasiliy Kulikov wrote:
> > This patch changes mmap base address allocator logic to incline to
> > allocate addresses for executable pages from the first 16 Mbs of address
> 
> s/Mbs/MiB/ (or just MB, whereas Mb is sometimes used to denote megabits)

OK.

> > space.  These addresses start from zero byte (0x00AABBCC).  Using such
> > addresses breaks ret2libc exploits abusing string buffer overflows (or
> > does it much harder).
> 
> s,does it much harder,makes such attacks harder and/or less reliable,

OK.

> > As x86 architecture is little-endian, this zero byte is the last byte of
> > the address.  So it's possible to e.g. overwrite a return address on the
> > stack with the mailformed address.  However, now it's impossible to
> 
> s/mailformed/malformed/

Oops.

> > additionally overwrite function arguments, which are located after the
> > function address on the stack.  The attacker's best bet may be to find
> > an entry point not at function boundary that sets registers and then
> > proceeds with or branches to the desired library code.  The easiest way
> > to set registers and branch would be a function epilogue -
> > pop/pop/.../ret - but then there's the difficulty in passing the address
> > to ret to (we have just one NUL and we've already used it to get to this
> > code).  Similarly, even via such pop's we can't pass an argument that
> > contains a NUL in it - e.g., the address of "/bin/sh" in libc (it
> > contains a NUL most significant byte too) or a zero value for root's
> > uid.
> 
> The above was partially flawed logic on my part - as written above
> (without further detail), the pop/pop/.../ret thing doesn't apply
> because those pop's would read stack right after the just-used return
> address - that is, the same stack locations that we presumably could not
> write to in order to pass the arguments in a more straightforward
> fashion.  So this trick would be of no help, and thus its other
> limitations would be of no relevance.

Why not?  If function address contains NUL, the overflow stops at this
address.  If it doesn't contain NUL, but an argument contains NUL, it is the
last argument an attacker can use (therefore, it would be the last
used code chunk).  So, it has some value even if he can somehow write
the ret address (e.g. it is out of 16 MBs).


> > If CONFIG_VM86=y, the first megabyte is excluded from the potential
> > range for mmap allocations as it might be used by vm86 code.  If
> > CONFIG_VM86=n, the allocation begins from the mmap_min_addr.  Regardless
> > of CONFIG_VM86 the base address is randomized with the same entropy size
> > as mm->mmap_base.
> 
> OK.  Shouldn't CONFIG_VM86 be a sysctl, though?

This is not a hardening setting that was present in -ow, but an existing
config to disable vm86/vm86_old at the compile time.  It was added for
EMBEDDED.


> > +#ifdef CONFIG_VM86
> > +/*
> > + * Don't touch any memory that can be used by vm86 apps.
> > + * Reserve the first Mb + 64 kb of guard pages.
> > + */
> > +#define ASCII_ARMOR_MIN_ADDR (0x00100000 + 0x00010000)
> 
> The extra 64 KiB are not "guard pages".  The rationale is that these
> are addresses available from VM86 mode, as well as from real mode with
> line A20 enabled on 286+ machines (dosemu may run DOS drivers and
> programs from that era).  This was a way to get slightly more of usable
> memory under DOS - IIRC, I had something like 900+ KB total DOS memory,
> 736 KB free after full bootup on a 386 (with proper drivers and
> configuration tweaks).  Not everyone was happy with a mere 640 KB, of
> which only 550 KB or so would be free after DOS bootup.
> 
> http://en.wikipedia.org/wiki/Real_mode#Addressing_capacity
> 
> "So, the actual amount of memory addressable by the 80286 and later x86
> CPUs in real mode is 1 MiB + 64 KiB - 16 B = 1114096 B."

Yes, my bad.

Thanks,

-- 
Vasiliy

^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [kernel-hardening] [RFC] x86, mm: start mmap allocation for libs from low addresses
  2011-09-02 23:34     ` Solar Designer
@ 2011-09-03 12:12       ` Vasiliy Kulikov
  2011-09-03 23:40         ` Solar Designer
  2011-09-04  7:21         ` Vasiliy Kulikov
  0 siblings, 2 replies; 47+ messages in thread
From: Vasiliy Kulikov @ 2011-09-03 12:12 UTC (permalink / raw)
  To: kernel-hardening

On Sat, Sep 03, 2011 at 03:34 +0400, Solar Designer wrote:
> > +static unsigned long mmap_lib_base(void)
> > +{
> > +	return ASCII_ARMOR_MIN_ADDR + mmap_rnd();
> > +}
> 
> I was about to suggest that you declare this function inline, but then I
> noticed that all other tiny and single-use functions in
> arch/x86/mm/mmap.c are also non-inline.  Weird.  You might want to bring
> this up on LKML separately.

Why?  They are called once per process when mm_struct is initialized.
Inlining would not boost the code.


> > @@ -131,6 +147,10 @@ void arch_pick_mmap_layout(struct mm_struct *mm)
> >  	} else {
> >  		mm->mmap_base = mmap_base();
> >  		mm->get_unmapped_area = arch_get_unmapped_area_topdown;
> > +		if (mmap_is_ia32()) {
> > +			mm->get_unmapped_exec_area = get_unmapped_exec_area;
> > +			mm->lib_mmap_base = mmap_lib_base();
> > +		}
> >  		mm->unmap_area = arch_unmap_area_topdown;
> >  	}
> 
> Are you sure it's safe to leave get_unmapped_exec_area and lib_mmap_base
> uninitialized when !mmap_is_ia32()?  Is there implicit initialization to
> NULL before this point maybe?  (I did not check.)

There is:

struct mm_struct * mm_alloc(void)
{
	struct mm_struct * mm;

	mm = allocate_mm();
	if (mm) {
		memset(mm, 0, sizeof(*mm));
...


> > +/* Addresses before this value contain at least one zero byte. */
> > +#define ASCII_ARMOR_MAX_ADDR 0x01000000
> > +
> > +/*
> > + * This function finds the first unmapped region inside of
> > + * [mm->lib_mmap_base; ASCII_ARMOR_MAX_ADDR) region.  Addresses from this
> > + * region contain at least one zero byte, which greatly complicates
> 
> Maybe s/greatly // because this word is subjective, and the actual
> difficulty varies for different scenarios.

OK.


> > +unsigned long
> > +get_unmapped_exec_area(struct file *filp, unsigned long addr0,
> > +		unsigned long len, unsigned long pgoff, unsigned long flags)
> > +{
> > +	unsigned long addr = addr0;
> > +	struct mm_struct *mm = current->mm;
> > +	struct vm_area_struct *vma;
> > +
> > +	if (len > TASK_SIZE)
> > +		return -ENOMEM;
> > +
> > +	if (flags & MAP_FIXED)
> > +		return addr;
> 
> In the MAP_FIXED case, don't we need to make sure that addr is actually
> available - that is, do a "goto failed" here (fallback to old handling)?

No, what can old handling do?  If MAP_FIXED is passed, we may not change
the behaviour.  If the region is already owned by someone, checks in the
upper mm code would return error.  Also all other allocators just return
addr in this case.


> 
> > +	/* We ALWAYS start from the beginning as base addresses
> > +	 * with zero high bits is a valued resource */
> > +	addr = max_t(unsigned long, mm->lib_mmap_base, mmap_min_addr);
> > +
> > +	for (vma = find_vma(mm, addr); ; vma = vma->vm_next) {
> > +		/* At this point:  (!vma || addr < vma->vm_end). */
> > +		if (TASK_SIZE - len < addr)
> > +			return -ENOMEM;
> 
> Since this check is primarily of addr, I'd write it as:
> 
> 		if (addr > TASK_SIZE - len)
> 			return -ENOMEM;
> 
> Also, I hope the len == 0 && addr == TASK_SIZE case that this check
> happens to permit is irrelevant here.  I naively hope (well, maybe not)
> that addr has been checked for "< TASK_SIZE" at a higher level. ;-)
> You could want to review the current mm code for this.

len == 0 is checked in the callers - do_mmap_pgoff() and __do_brk().

"< TASK_SIZE":

	/* Careful about overflows.. */
	if (len > TASK_SIZE)
		return -ENOMEM;
    ...
	if (addr > TASK_SIZE - len)
		return -ENOMEM;

> > +		/* We don't want to reduce brk area of not DYNAMIC elf binaries
> > +		 * with sysctl kernel.randomize_va_space < 2. */
> > +		if (mm->brk && addr > mm->brk)
> > +			goto failed;
> 
> Does this check come from RHEL?

Partly, see below.


>  I don't fully understand it.  We also
> check for "vma->vm_end >= ASCII_ARMOR_MAX_ADDR" below.  Does this imply
> that we choose to handle the case of mm->brk being lower than
> ASCII_ARMOR_MAX_ADDR here?  Is it a practically relevant case?

It's possible to have a weird case: PIE is disabled, exec image is lower
than 0x01000000, kernel.randomize_va_space=0.  It means brk area starts
in ASCII-armor zone.  If we try to jump over brk, then next brk growth
would fail as we've loaded some library just after the last brk page.

(Also it would touch brk guard pages, which I didn't investigate.)


>  Or was
> this check possibly less redundant on RHEL?


At first, RHEL tries to allocate a region without brk check:

		addr = !should_randomize() ? SHLIB_BASE :
			randomize_range(SHLIB_BASE, 0x01000000, len);


Then if it fails, exec-shield tries to find a region in a cycle, without
the brk check.  Then if it overruns 0x01000000, it starts to do brk
check:

			if (addr >= 0x01000000 && should_randomize()) {
				tmp = randomize_range(0x01000000,
					PAGE_ALIGN(max(mm->start_brk,
					(unsigned long)0x08000000)), len);

So, they don't care about this rare case.

I heard many times about legacy randomize_va_space=0 systems, which
disable brk randomization because of some ancient proprietary software.
Non-PIE binaries are very common nowadays.  I didn't see these 3 cases at
once (non-PIE, low mmap exec base, no brk randomization), but I don't
see why it's impossible.  If you know why it is, I'll remove the check.


> (If these questions are time-consuming to find answers to, feel free to
> disregard them for now.)
> 
> > +		if (!vma || addr + len <= vma->vm_start)
> > +			return addr;
> > +
> > +		addr = vma->vm_end;
> > +
> > +		/* If ACSII-armor area is over, the algo gives up */
> > +		if (addr >= ASCII_ARMOR_MAX_ADDR)
> > +			goto failed;
> > +	}
> > +
> > +failed:
> > +	return current->mm->get_unmapped_area(filp, addr0, len, pgoff, flags);
> > +}
> 
> Now, the main question: what's wrong with doing something as simple as:
> 
> unsigned long
> get_unmapped_exec_area(struct file *filp, unsigned long addr,
> 		unsigned long len, unsigned long pgoff, unsigned long flags)
> {
> 	return current->mm->get_unmapped_area(filp, ((flags & MAP_FIXED) || len >= 0x00ef0000UL) ? addr : max_t(unsigned long, mm->lib_mmap_base, mmap_min_addr), len, pgoff, flags);
> }
> 
> Of course, written in a cleaner and more generic fashion.

1) get_unmapped_area may ignore the hint if the address is already owned.
Then it would start from the cached hole address rather than ->lib_mmap_base.

2) my code contains brk check in ASCII zone.


> This is similar to what I did in -ow patches, where I only needed to
> adjust TASK_UNMAPPED_BASE turning it from a constant to a trivial macro
> accepting the allocation size as input.

In the days of the -ow patch the allocator was simpler :)  Now it differs
from arch to arch, and it can be top-down or bottom-up.  More than that:
currently the bottom-up allocator doesn't respect randomization.

We could reuse/refactor the bottom-up allocator to use randomization, but
that's likely to be seen as an invasive change.

Changing the top-down allocator is likely to be rejected too, as it might
significantly slow down the case when addr and the nearby zones are owned,
but the cache hint is just OK.  Note that we don't want to change
anything for non-32-bit systems.


I'd want to reduce the added code size too, but it is rather tricky to
do it with mm code.  AFAICS, the way RHEL did it (separate
->get_unmapped_exec_area allocator) is the easiest one.


Thanks,

-- 
Vasiliy

^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [kernel-hardening] [RFC] x86, mm: start mmap allocation for libs from low addresses
  2011-09-03 12:12       ` Vasiliy Kulikov
@ 2011-09-03 23:40         ` Solar Designer
  2011-09-04  7:21         ` Vasiliy Kulikov
  1 sibling, 0 replies; 47+ messages in thread
From: Solar Designer @ 2011-09-03 23:40 UTC (permalink / raw)
  To: kernel-hardening

Vasiliy,

On Sat, Sep 03, 2011 at 04:12:06PM +0400, Vasiliy Kulikov wrote:
> On Sat, Sep 03, 2011 at 03:34 +0400, Solar Designer wrote:
> > > +static unsigned long mmap_lib_base(void)
> > > +{
> > > +	return ASCII_ARMOR_MIN_ADDR + mmap_rnd();
> > > +}
> > 
> > I was about to suggest that you declare this function inline, but then I
> > noticed that all other tiny and single-use functions in
> > arch/x86/mm/mmap.c are also non-inline.  Weird.  You might want to bring
> > this up on LKML separately.
> 
> Why?  They are called once per process when mm_struct is initialized.
> Inlining would not boost the code.

Besides the minor and maybe unimportant speed difference, inlining of
functions invoked from just one place may reduce total code size.  Of
course, it's a very minor issue.

> > > +unsigned long
> > > +get_unmapped_exec_area(struct file *filp, unsigned long addr0,
> > > +		unsigned long len, unsigned long pgoff, unsigned long flags)
> > > +{
> > > +	unsigned long addr = addr0;
> > > +	struct mm_struct *mm = current->mm;
> > > +	struct vm_area_struct *vma;
> > > +
> > > +	if (len > TASK_SIZE)
> > > +		return -ENOMEM;
> > > +
> > > +	if (flags & MAP_FIXED)
> > > +		return addr;
> > 
> > In the MAP_FIXED case, don't we need to make sure that addr is actually
> > available - that is, do a "goto failed" here (fallback to old handling)?
> 
> No, what can old handling do?  If MAP_FIXED is passed, we may not change
> the behaviour.  If the region is already owned by someone, checks in the
> upper mm code would return error.  Also all other allocators just return
> addr in this case.

OK.  I'll trust you on this.

> It's possible to have a weird case: PIE is disabled, exec image is lower
> than 0x01000000, kernel.randomize_va_space=0.  It means brk area starts
> in ASCII-armor zone.  If we try to jump over brk, then next brk growth
> would fail as we've loaded some library just after the last brk page.

OK.  Maybe you can make the comment as clear as the above paragraph?
The way it's written currently, it is unclear what it talks about -
e.g., why "not DYNAMIC elf binaries"?  In fact, it doesn't even look
consistent with what you wrote above.

> At first, RHEL tries to allocate a region without brk check:
> 
> 		addr = !should_randomize() ? SHLIB_BASE :
> 			randomize_range(SHLIB_BASE, 0x01000000, len);
> 
> 
> Then if it fails, exec-shield tries to find a region in a cycle, without
> the brk check.  Then if it overruns 0x01000000, it starts to do brk
> check:
> 
> 			if (addr >= 0x01000000 && should_randomize()) {
> 				tmp = randomize_range(0x01000000,
> 					PAGE_ALIGN(max(mm->start_brk,
> 					(unsigned long)0x08000000)), len);
> 
> So, they don't care about this rare case.
> 
> I heard many times about legacy randomize_va_space=0 systems, which
> disable brk randomization because of some ancient proprietary software.
> Non-PIE binaries are very common nowadays.  I didn't see these 3 cases at
> once (non-PIE, low mmap exec base, no brk randomization), but I don't
> see why it's impossible.  If you know why it is, I'll remove the check.

I think non-PIE binaries with their fixed base address other than the
standard 0x08048000 are very rare - possibly they only exist in kernel
vulnerability exploits. ;-)  But I don't know for sure.  I don't mind
the check, but please make the comment descriptive and correct.

> > Now, the main question: what's wrong with doing something as simple as:
> > 
> > unsigned long
> > get_unmapped_exec_area(struct file *filp, unsigned long addr,
> > 		unsigned long len, unsigned long pgoff, unsigned long flags)
> > {
> > 	return current->mm->get_unmapped_area(filp, ((flags & MAP_FIXED) || len >= 0x00ef0000UL) ? addr : max_t(unsigned long, mm->lib_mmap_base, mmap_min_addr), len, pgoff, flags);
> > }
> > 
> > Of course, written in a cleaner and more generic fashion.
> 
> 1) get_unmapped_area may ignore the hint if the address is already owned.
> Then it would start from the cached hole address rather than ->lib_mmap_base.

OK.  This cached hole address might be a post-2.4 thing - at least, I
haven't seen this behavior on 2.4 and prior.

> 2) my code contains brk check in ASCII zone.

OK.

> I'd want to reduce the added code size too, but it is rather tricky to
> do it with mm code.  AFAICS, the way RHEL did it (separate
> ->get_unmapped_exec_area allocator) is the easiest one.

I am fine with this approach.  One nice aspect of it is that there's no
performance impact for non-exec mmap()'s.

Alexander

^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [kernel-hardening] [RFC] x86, mm: start mmap allocation for libs from low addresses
  2011-09-03 11:18       ` Vasiliy Kulikov
@ 2011-09-03 23:57         ` Solar Designer
  2011-09-05 12:46           ` Vasiliy Kulikov
  0 siblings, 1 reply; 47+ messages in thread
From: Solar Designer @ 2011-09-03 23:57 UTC (permalink / raw)
  To: kernel-hardening

Vasiliy,

On Sat, Sep 03, 2011 at 03:18:49PM +0400, Vasiliy Kulikov wrote:
> On Fri, Sep 02, 2011 at 22:29 +0400, Solar Designer wrote:
> > On Thu, Aug 25, 2011 at 09:19:34PM +0400, Vasiliy Kulikov wrote:
> > > additionally overwrite function arguments, which are located after the
> > > function address on the stack.  The attacker's best bet may be to find
> > > an entry point not at function boundary that sets registers and then
> > > proceeds with or branches to the desired library code.  The easiest way
> > > to set registers and branch would be a function epilogue -
> > > pop/pop/.../ret - but then there's the difficulty in passing the address
> > > to ret to (we have just one NUL and we've already used it to get to this
> > > code).  Similarly, even via such pop's we can't pass an argument that
> > > contains a NUL in it - e.g., the address of "/bin/sh" in libc (it
> > > contains a NUL most significant byte too) or a zero value for root's
> > > uid.
> > 
> > The above was partially flawed logic on my part - as written above
> > (without further detail), the pop/pop/.../ret thing doesn't apply
> > because those pop's would read stack right after the just-used return
> > address - that is, the same stack locations that we presumably could not
> > write to in order to pass the arguments in a more straightforward
> > fashion.  So this trick would be of no help, and thus its other
> > limitations would be of no relevance.
> 
> Why not?

I am not sure what exactly your "why not" applies to.  What I said was
that the trick of returning specifically to pop/pop/.../ret would be of
no help to an exploit writer trying to bypass ASCII armor, and I
explained why not in the paragraph you quoted.  So the exploit writer
would use some other trick, possibly just slightly different - to give
an example (just to you, not for LKML), I included an instruction
sequence from a glibc build that would be a better target to return to
(note how it is not limited to pop and ret instructions), and that would
actually make the limitations being talked about relevant.

> If function address contains NUL, the overflow stops at this
> address.  If it doesn't contain NUL, but an argument contains NUL, it is the
> last argument an attacker can use

Right.

> (therefore, it would be the last used code chunk).

I don't understand this bit.

> So, it has some value even if he can somehow write
> the ret address (e.g. it is out of 16 MBs).

Right.  Once again, what I said is that this limitation becomes relevant
in certain cases other than returning to a trivial pop/ret sequence.
Namely, it is relevant when returning straight to a function entry, and
it is relevant when returning to certain other instruction sequences.
Just not when returning specifically to pop/ret, which is of no help in
an attack trying to bypass ASCII armor anyway.

To summarize: we happened to give a poor example in the patch
description, and I'd like to correct that by reducing the level of
detail.  (The alternative would have been to go deeper into detail.)

> > > If CONFIG_VM86=y, the first megabyte is excluded from the potential
> > > range for mmap allocations as it might be used by vm86 code.  If
> > > CONFIG_VM86=n, the allocation begins from the mmap_min_addr.  Regardless
> > > of CONFIG_VM86 the base address is randomized with the same entropy size
> > > as mm->mmap_base.
> > 
> > OK.  Shouldn't CONFIG_VM86 be a sysctl, though?
> 
> This is not a hardening setting that was present in -ow, but an existing
> config to disable vm86/vm86_old at the compile time.  It was added for
> EMBEDDED.

Oh, I was not aware of that.

solar@host:~/kernel/mainline/linux-3.0.4 $ fgrep -rl CONFIG_VM86 .
./arch/x86/kernel/Makefile
./arch/x86/kernel/entry_32.S
./arch/x86/include/asm/vm86.h
./arch/x86/include/asm/processor-flags.h

Looks like there's no Kconfig option for this - perhaps add it with a
separate patch?

Thanks,

Alexander

^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [kernel-hardening] [RFC] x86, mm: start mmap allocation for libs from low addresses
  2011-09-03 12:12       ` Vasiliy Kulikov
  2011-09-03 23:40         ` Solar Designer
@ 2011-09-04  7:21         ` Vasiliy Kulikov
  1 sibling, 0 replies; 47+ messages in thread
From: Vasiliy Kulikov @ 2011-09-04  7:21 UTC (permalink / raw)
  To: kernel-hardening

On Sat, Sep 03, 2011 at 16:12 +0400, Vasiliy Kulikov wrote:
> > > +		/* We don't want to reduce brk area of not DYNAMIC elf binaries
> > > +		 * with sysctl kernel.randomize_va_space < 2. */
> > > +		if (mm->brk && addr > mm->brk)
> > > +			goto failed;
> > 
> > Does this check come from RHEL?
> 
> Partly, see below.
> 
> 
> >  I don't fully understand it.  We also
> > check for "vma->vm_end >= ASCII_ARMOR_MAX_ADDR" below.  Does this imply
> > that we choose to handle the case of mm->brk being lower than
> > ASCII_ARMOR_MAX_ADDR here?  Is it a practically relevant case?
> 
> It's possible to have a weird case: PIE is disabled, exec image is lower
> than 0x01000000, kernel.randomize_va_space=0.  It means brk area starts
> in ASCII-armor zone.  If we try to jump over brk, then next brk growth
> would fail as we've loaded some library just after the last brk page.
> 
> (Also it would touch brk guard pages, which I didn't investigate.)
> 
> 
> >  Or was
> > this check possibly less redundant on RHEL?
> 

Sorry, I got distracted and mixed things up.  This should be read as:

> At first, RHEL tries to allocate a region without brk check:
> 
> 		addr = !should_randomize() ? SHLIB_BASE :
> 			randomize_range(SHLIB_BASE, 0x01000000, len);
> 
> 
> Then if it fails, exec-shield tries to find a region in a cycle, without
> the brk check.

s/without/with/

			/*
			 * Must not let a PROT_EXEC mapping get into the
			 * brk area:
			 */
			if (addr + len > mm->brk)
				goto failed;

>  Then if it overruns 0x01000000, it starts to do brk
> check:
> 
> 			if (addr >= 0x01000000 && should_randomize()) {
> 				tmp = randomize_range(0x01000000,
> 					PAGE_ALIGN(max(mm->start_brk,
> 					(unsigned long)0x08000000)), len);
> 
> So, they don't care about this rare case.

They do care, but it is inconsistent - sometimes they check it, sometimes
not.  If the first guess fails, they do; otherwise they don't.


> I heard many times about legacy randomize_va_space=0 systems, which
> disable brk randomization because of some ancient proprietary software.
> Non-PIE binaries are very common nowadays.  I didn't see these 3 cases at
> once (non-PIE, low mmap exec base, no brk randomization), but I don't
> see why it's impossible.  If you know why it is, I'll remove the check.

-- 
Vasiliy Kulikov
http://www.openwall.com - bringing security into open computing environments

^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [kernel-hardening] [RFC] x86, mm: start mmap allocation for libs from low addresses
  2011-09-03 23:57         ` Solar Designer
@ 2011-09-05 12:46           ` Vasiliy Kulikov
  2011-09-06  5:05             ` Solar Designer
  0 siblings, 1 reply; 47+ messages in thread
From: Vasiliy Kulikov @ 2011-09-05 12:46 UTC (permalink / raw)
  To: kernel-hardening

Solar,

On Sun, Sep 04, 2011 at 03:57 +0400, Solar Designer wrote:
> On Sat, Sep 03, 2011 at 03:18:49PM +0400, Vasiliy Kulikov wrote:
> > On Fri, Sep 02, 2011 at 22:29 +0400, Solar Designer wrote:
> > > On Thu, Aug 25, 2011 at 09:19:34PM +0400, Vasiliy Kulikov wrote:
> > > > additionally overwrite function arguments, which are located after the
> > > > function address on the stack.  The attacker's best bet may be to find
> > > > an entry point not at function boundary that sets registers and then
> > > > proceeds with or branches to the desired library code.  The easiest way
> > > > to set registers and branch would be a function epilogue -
> > > > pop/pop/.../ret - but then there's the difficulty in passing the address
> > > > to ret to (we have just one NUL and we've already used it to get to this
> > > > code).  Similarly, even via such pop's we can't pass an argument that
> > > > contains a NUL in it - e.g., the address of "/bin/sh" in libc (it
> > > > contains a NUL most significant byte too) or a zero value for root's
> > > > uid.
> > > 
> > > The above was partially flawed logic on my part - as written above
> > > (without further detail), the pop/pop/.../ret thing doesn't apply
> > > because those pop's would read stack right after the just-used return
> > > address - that is, the same stack locations that we presumably could not
> > > write to in order to pass the arguments in a more straightforward
> > > fashion.  So this trick would be of no help, and thus its other
> > > limitations would be of no relevance.
> > 
> > Why not?
> 
> I am not sure what exactly your "why not" applies to.

Hmm, yes, looks like I've lost the thread some time ago :(  A good
description definitely needs a much longer and more scrupulous analysis.
Probably there is a public paper with a review/analysis/benefits of
ASCII-armor that we could refer to in the patch description?  I cannot
find any rigorous paper, unfortunately.


> > > > If CONFIG_VM86=y, the first megabyte is excluded from the potential
> > > > range for mmap allocations as it might be used by vm86 code.  If
> > > > CONFIG_VM86=n, the allocation begins from the mmap_min_addr.  Regardless
> > > > of CONFIG_VM86 the base address is randomized with the same entropy size
> > > > as mm->mmap_base.
> > > 
> > > OK.  Shouldn't CONFIG_VM86 be a sysctl, though?
> > 
> > This is not a hardening setting that was present in -ow, but an existing
> > config to disable vm86/vm86_old at the compile time.  It was added for
> > EMBEDDED.
> 
> Oh, I was not aware of that.
> 
> solar@host:~/kernel/mainline/linux-3.0.4 $ fgrep -rl CONFIG_VM86 .
> ./arch/x86/kernel/Makefile
> ./arch/x86/kernel/entry_32.S
> ./arch/x86/include/asm/vm86.h
> ./arch/x86/include/asm/processor-flags.h
> 
> Looks like there's no Kconfig option for this - perhaps add it with a
> separate patch?

Since 2.6.x CONFIG_ prefix is not used in Kconfig files:

$ grep -w VM86 arch/x86/Kconfig
config VM86
    bool "Enable VM86 support" if EXPERT


Thanks,

-- 
Vasiliy

^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [kernel-hardening] [RFC] x86, mm: start mmap allocation for libs from low addresses
  2011-09-05 12:46           ` Vasiliy Kulikov
@ 2011-09-06  5:05             ` Solar Designer
  2011-09-07  9:09               ` Vasiliy Kulikov
  0 siblings, 1 reply; 47+ messages in thread
From: Solar Designer @ 2011-09-06  5:05 UTC (permalink / raw)
  To: kernel-hardening

On Mon, Sep 05, 2011 at 04:46:47PM +0400, Vasiliy Kulikov wrote:
> Hmm, yes, looks like I've lost the thread some time ago :(  A good
> description definitely needs a much longer and more scrupulous analysis.

Or alternatively it may choose not to go into detail at all, which is a
safer bet. ;-)

> Probably there is a public paper with a review/analysis/benefits of
> ASCII-armor that we could refer to in the patch description?  I cannot
> find any rigorous paper, unfortunately.

I'm not aware of such paper.

> On Sun, Sep 04, 2011 at 03:57 +0400, Solar Designer wrote:
> > solar@host:~/kernel/mainline/linux-3.0.4 $ fgrep -rl CONFIG_VM86 .
> > ./arch/x86/kernel/Makefile
> > ./arch/x86/kernel/entry_32.S
> > ./arch/x86/include/asm/vm86.h
> > ./arch/x86/include/asm/processor-flags.h
> > 
> > Looks like there's no Kconfig option for this - perhaps add it with a
> > separate patch?
> 
> Since 2.6.x CONFIG_ prefix is not used in Kconfig files:
> 
> $ grep -w VM86 arch/x86/Kconfig
> config VM86
>     bool "Enable VM86 support" if EXPERT

Oh, of course.  Sometimes I say/do dumb things.

Turns out this stuff is also present in RHEL5/OpenVZ kernels, so we
might want to turn on CONFIG_EMBEDDED and disable CONFIG_VM86 in Owl
kernel builds, even before we move to newer kernels.

Alexander

^ permalink raw reply	[flat|nested] 47+ messages in thread

* [kernel-hardening] [RFC] x86, mm: start mmap allocation for libs from low addresses
  2011-09-06  5:05             ` Solar Designer
@ 2011-09-07  9:09               ` Vasiliy Kulikov
  2011-09-07  9:30                 ` Solar Designer
  0 siblings, 1 reply; 47+ messages in thread
From: Vasiliy Kulikov @ 2011-09-07  9:09 UTC (permalink / raw)
  To: kernel-hardening

Solar,

I've updated the patch description, code comments, and the "if"
condition.  If there are no other objections, I'll post it as RFCv2 on
LKML.


===================

This patch changes mmap base address allocator logic to incline to
allocate addresses for executable pages from the first 16 MB of address
space.  These addresses start from zero byte (0x00AABBCC).  Using such
addresses breaks ret2libc exploits abusing string buffer overflows (or
makes such attacks harder and/or less reliable).

As x86 architecture is little-endian, this zero byte is the last byte of
the address.  So it's possible to e.g. overwrite a return address on the
stack with the malformed address.  However, now it's impossible to
additionally provide function arguments, which are located after the
function address on the stack.  The attacker's best bet may be to find
an entry point not at function boundary that sets registers and then
proceeds with or branches to the desired library code.  The easiest way
to set registers and branch would be a function epilogue.  Then it may
be similarly difficult to reliably pass register values and a further
address to branch to, because the desired values for these will also
tend to contain NULs - e.g., the address of "/bin/sh" in libc or a zero
value for root's uid.  A possible bypass is via multiple overflows - if
the overflow may be triggered more than once before the vulnerable
function returns, then multiple NULs may be written, exactly one per
overflow.  But this is hopefully relatively rare.

To fully utilize the protection, the executable image should be
randomized (sysctl kernel.randomize_va_space > 0 and the executable is
compiled as PIE) and the sum of libraries sizes plus executable size
shouldn't exceed 16 MB.  In this case the only pages out of
ASCII-protected range are VDSO and vsyscall pages.  However, they don't
provide enough material for obtaining arbitrary code execution and are
not dangerous without using other executable pages.

The logic is applied to x86 32 bit tasks, both for 32 bit kernels and
for 32 bit tasks running on 64 bit kernels.  64 bit tasks already have
zero bytes in addresses of library functions.  Other architectures
(non-x86) may reuse the logic too.

Without the patch:

$ ldd /bin/ls
	linux-gate.so.1 =>  (0xf779c000)
        librt.so.1 => /lib/librt.so.1 (0xb7fcf000)
        libtermcap.so.2 => /lib/libtermcap.so.2 (0xb7fca000)
        libc.so.6 => /lib/libc.so.6 (0xb7eae000)
        libpthread.so.0 => /lib/libpthread.so.0 (0xb7e5b000)
        /lib/ld-linux.so.2 (0xb7fe6000)

With the patch:

$ ldd /bin/ls
	linux-gate.so.1 =>  (0xf772a000)
	librt.so.1 => /lib/librt.so.1 (0x0004a000)
	libtermcap.so.2 => /lib/libtermcap.so.2 (0x0005e000)
	libc.so.6 => /lib/libc.so.6 (0x00062000)
	libpthread.so.0 => /lib/libpthread.so.0 (0x00183000)
	/lib/ld-linux.so.2 (0x00121000)


If CONFIG_VM86=y, the first megabyte is excluded from the potential
range for mmap allocations as it might be used by vm86 code.  If
CONFIG_VM86=n, the allocation begins from the mmap_min_addr.  Regardless
of CONFIG_VM86 the base address is randomized with the same entropy size
as mm->mmap_base.

If the 16 MB are exhausted, we fall back to the old allocation algorithm.
But, hopefully, programs which need such protection (network daemons,
programs working with untrusted data, etc.) are small enough to utilize
the protection.

The same logic was used in -ow patch for 2.0-2.4 kernels and in
exec-shield for 2.6.x kernels.  Code parts were taken from exec-shield
from RHEL6.


v2 - Added comments, adjusted patch description.
   - s/arch_get_unmapped_exec_area/get_unmapped_exec_area/
   - Don't reserve first 1 MiB + 64 KiB if CONFIG_VM86=n.

Signed-off-by: Vasiliy Kulikov <segoon@openwall.com>
---
 arch/x86/mm/mmap.c       |   20 ++++++++++
 include/linux/mm_types.h |    4 ++
 include/linux/sched.h    |    3 ++
 mm/mmap.c                |   87 +++++++++++++++++++++++++++++++++++++++++++---
 4 files changed, 109 insertions(+), 5 deletions(-)

diff --git a/arch/x86/mm/mmap.c b/arch/x86/mm/mmap.c
index 1dab519..20b2eae 100644
--- a/arch/x86/mm/mmap.c
+++ b/arch/x86/mm/mmap.c
@@ -118,6 +118,22 @@ static unsigned long mmap_legacy_base(void)
 		return TASK_UNMAPPED_BASE + mmap_rnd();
 }
 
+#ifdef CONFIG_VM86
+/*
+ * Don't touch any memory that can be addressed by vm86 apps.
+ * Reserve the first 1 MiB + 64 KiB.
+ */
+#define ASCII_ARMOR_MIN_ADDR 0x00110000
+#else
+/* No special users of low addresses.  Start just after mmap_min_addr. */
+#define ASCII_ARMOR_MIN_ADDR 0
+#endif
+
+static unsigned long mmap_lib_base(void)
+{
+	return ASCII_ARMOR_MIN_ADDR + mmap_rnd();
+}
+
 /*
  * This function, called very early during the creation of a new
  * process VM image, sets up which VM layout function to use:
@@ -131,6 +147,10 @@ void arch_pick_mmap_layout(struct mm_struct *mm)
 	} else {
 		mm->mmap_base = mmap_base();
 		mm->get_unmapped_area = arch_get_unmapped_area_topdown;
+		if (mmap_is_ia32()) {
+			mm->get_unmapped_exec_area = get_unmapped_exec_area;
+			mm->lib_mmap_base = mmap_lib_base();
+		}
 		mm->unmap_area = arch_unmap_area_topdown;
 	}
 }
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 027935c..68fc216 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -225,9 +225,13 @@ struct mm_struct {
 	unsigned long (*get_unmapped_area) (struct file *filp,
 				unsigned long addr, unsigned long len,
 				unsigned long pgoff, unsigned long flags);
+	unsigned long (*get_unmapped_exec_area) (struct file *filp,
+				unsigned long addr, unsigned long len,
+				unsigned long pgoff, unsigned long flags);
 	void (*unmap_area) (struct mm_struct *mm, unsigned long addr);
 #endif
 	unsigned long mmap_base;		/* base of mmap area */
+	unsigned long lib_mmap_base;		/* base of mmap libraries area (for get_unmapped_exec_area()) */
 	unsigned long task_size;		/* size of task vm space */
 	unsigned long cached_hole_size; 	/* if non-zero, the largest hole below free_area_cache */
 	unsigned long free_area_cache;		/* first hole of size cached_hole_size or larger */
diff --git a/include/linux/sched.h b/include/linux/sched.h
index f024c63..ef9024f 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -394,6 +394,9 @@ arch_get_unmapped_area_topdown(struct file *filp, unsigned long addr,
 			  unsigned long flags);
 extern void arch_unmap_area(struct mm_struct *, unsigned long);
 extern void arch_unmap_area_topdown(struct mm_struct *, unsigned long);
+extern unsigned long
+get_unmapped_exec_area(struct file *, unsigned long,
+		unsigned long, unsigned long, unsigned long);
 #else
 static inline void arch_pick_mmap_layout(struct mm_struct *mm) {}
 #endif
diff --git a/mm/mmap.c b/mm/mmap.c
index d49736f..593f557 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -50,6 +50,10 @@ static void unmap_region(struct mm_struct *mm,
 		struct vm_area_struct *vma, struct vm_area_struct *prev,
 		unsigned long start, unsigned long end);
 
+static unsigned long
+get_unmapped_area_prot(struct file *file, unsigned long addr, unsigned long len,
+		unsigned long pgoff, unsigned long flags, bool exec);
+
 /*
  * WARNING: the debugging will use recursive algorithms so never enable this
  * unless you know what you are doing.
@@ -989,7 +993,8 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
 	/* Obtain the address to map to. we verify (or select) it and ensure
 	 * that it represents a valid section of the address space.
 	 */
-	addr = get_unmapped_area(file, addr, len, pgoff, flags);
+	addr = get_unmapped_area_prot(file, addr, len, pgoff, flags,
+			prot & PROT_EXEC);
 	if (addr & ~PAGE_MASK)
 		return addr;
 
@@ -1528,6 +1533,67 @@ bottomup:
 }
 #endif
 
+/* Addresses before this value contain at least one zero byte. */
+#define ASCII_ARMOR_MAX_ADDR 0x01000000
+
+/*
+ * This function finds the first unmapped region inside of
+ * [mm->lib_mmap_base; ASCII_ARMOR_MAX_ADDR) region.  Addresses from this
+ * region contain at least one zero byte, which complicates
+ * exploitation of C string buffer overflows (C strings cannot contain zero
+ * byte inside) in return to libc class of attacks.
+ *
+ * This allocator is bottom up allocator like arch_get_unmapped_area(), but
+ * it differs from the latter.  get_unmapped_exec_area() does its best to
+ * allocate as low address as possible.
+ */
+unsigned long
+get_unmapped_exec_area(struct file *filp, unsigned long addr0,
+		unsigned long len, unsigned long pgoff, unsigned long flags)
+{
+	unsigned long addr = addr0;
+	struct mm_struct *mm = current->mm;
+	struct vm_area_struct *vma;
+
+	if (len > TASK_SIZE)
+		return -ENOMEM;
+
+	if (flags & MAP_FIXED)
+		return addr;
+
+	/* We ALWAYS start from the beginning as base addresses
+	 * with zero high bits is a valued resource */
+	addr = max_t(unsigned long, mm->lib_mmap_base, mmap_min_addr);
+
+	for (vma = find_vma(mm, addr); ; vma = vma->vm_next) {
+		/* At this point:  (!vma || addr < vma->vm_end). */
+		if (addr > TASK_SIZE - len)
+			return -ENOMEM;
+
+		/*
+		 * If kernel.randomize_va_space < 2, the executable is build as
+		 * non-PIE, and exec image base is lower than ASCII_ARMOR_MAX_ADDR,
+		 * it's possible to touch or overrun brk area in ASCII-armor
+		 * zone.  We don't want to reduce future brk growth, so we
+		 * fallback to the default allocator in this case.
+		 */
+		if (mm->brk && addr + len > mm->brk)
+			goto failed;
+
+		if (!vma || addr + len <= vma->vm_start)
+			return addr;
+
+		addr = vma->vm_end;
+
+		/* If the ASCII-armor area is exhausted, the algo gives up */
+		if (addr >= ASCII_ARMOR_MAX_ADDR)
+			goto failed;
+	}
+
+failed:
+	return current->mm->get_unmapped_area(filp, addr0, len, pgoff, flags);
+}
+
 void arch_unmap_area_topdown(struct mm_struct *mm, unsigned long addr)
 {
 	/*
@@ -1541,9 +1607,9 @@ void arch_unmap_area_topdown(struct mm_struct *mm, unsigned long addr)
 		mm->free_area_cache = mm->mmap_base;
 }
 
-unsigned long
-get_unmapped_area(struct file *file, unsigned long addr, unsigned long len,
-		unsigned long pgoff, unsigned long flags)
+static unsigned long
+get_unmapped_area_prot(struct file *file, unsigned long addr, unsigned long len,
+		unsigned long pgoff, unsigned long flags, bool exec)
 {
 	unsigned long (*get_area)(struct file *, unsigned long,
 				  unsigned long, unsigned long, unsigned long);
@@ -1556,7 +1622,11 @@ get_unmapped_area(struct file *file, unsigned long addr, unsigned long len,
 	if (len > TASK_SIZE)
 		return -ENOMEM;
 
-	get_area = current->mm->get_unmapped_area;
+	if (exec && current->mm->get_unmapped_exec_area)
+		get_area = current->mm->get_unmapped_exec_area;
+	else
+		get_area = current->mm->get_unmapped_area;
+
 	if (file && file->f_op && file->f_op->get_unmapped_area)
 		get_area = file->f_op->get_unmapped_area;
 	addr = get_area(file, addr, len, pgoff, flags);
@@ -1571,6 +1641,13 @@ get_unmapped_area(struct file *file, unsigned long addr, unsigned long len,
 	return arch_rebalance_pgtables(addr, len);
 }
 
+unsigned long
+get_unmapped_area(struct file *file, unsigned long addr, unsigned long len,
+		unsigned long pgoff, unsigned long flags)
+{
+	return get_unmapped_area_prot(file, addr, len, pgoff, flags, false);
+}
+
 EXPORT_SYMBOL(get_unmapped_area);
 
 /* Look up the first VMA which satisfies  addr < vm_end,  NULL if none. */
---

^ permalink raw reply related	[flat|nested] 47+ messages in thread

* Re: [kernel-hardening] [RFC] x86, mm: start mmap allocation for libs from low addresses
  2011-09-07  9:09               ` Vasiliy Kulikov
@ 2011-09-07  9:30                 ` Solar Designer
  2011-09-07  9:34                   ` Vasiliy Kulikov
  0 siblings, 1 reply; 47+ messages in thread
From: Solar Designer @ 2011-09-07  9:30 UTC (permalink / raw)
  To: kernel-hardening

Vasiliy,

On Wed, Sep 07, 2011 at 01:09:00PM +0400, Vasiliy Kulikov wrote:
> I've updated the patch description, code comments, and the "if"
> condition.  If there are no other objections, I'll post it as RFCv2 on
> LKML.

I've included some minor corrections below.  Please make changes
accordingly and post to LKML.

> +#ifdef CONFIG_VM86
> +/*
> + * Don't touch any memory that can be addressed by vm86 apps.
> + * Reserve the first 1 MiB + 64 KiB.
> + */
> +#define ASCII_ARMOR_MIN_ADDR 0x00110000
> +#else
> +/* No special users of low addresses.  Start just after mmap_min_addr. */
> +#define ASCII_ARMOR_MIN_ADDR 0
> +#endif

What if mmap_min_addr is set really low, or is even 0?  I think we want to
skip low addresses even if processes are permitted to use those.
(Permitted does not mean encouraged.)  So how about ASCII_ARMOR_MIN_ADDR
0x19000 (100 KB) when !CONFIG_VM86?
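
That is, something like (sketch only):

#ifdef CONFIG_VM86
/* Don't touch any memory that can be addressed by vm86 apps. */
#define ASCII_ARMOR_MIN_ADDR 0x00110000
#else
/*
 * No vm86 users, but still stay away from the lowest addresses so that
 * inadvertent NULL pointer dereferences with moderate offsets keep
 * faulting regardless of the mmap_min_addr setting.
 */
#define ASCII_ARMOR_MIN_ADDR 0x00019000	/* 100 KB */
#endif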

> +	/* We ALWAYS start from the beginning as base addresses
> +	 * with zero high bits is a valued resource */

s/valued/scarce and valuable/

> +		 * If kernel.randomize_va_space < 2, the executable is build as

s/build/built/

Thanks,

Alexander

^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [kernel-hardening] [RFC] x86, mm: start mmap allocation for libs from low addresses
  2011-09-07  9:30                 ` Solar Designer
@ 2011-09-07  9:34                   ` Vasiliy Kulikov
  2011-09-07  9:43                     ` Solar Designer
  0 siblings, 1 reply; 47+ messages in thread
From: Vasiliy Kulikov @ 2011-09-07  9:34 UTC (permalink / raw)
  To: kernel-hardening

Solar,

On Wed, Sep 07, 2011 at 13:30 +0400, Solar Designer wrote:
> > +#ifdef CONFIG_VM86
> > +/*
> > + * Don't touch any memory that can be addressed by vm86 apps.
> > + * Reserve the first 1 MiB + 64 KiB.
> > + */
> > +#define ASCII_ARMOR_MIN_ADDR 0x00110000
> > +#else
> > +/* No special users of low addresses.  Start just after mmap_min_addr. */
> > +#define ASCII_ARMOR_MIN_ADDR 0
> > +#endif
> 
> What if mmap_min_addr is set really low, or is even 0?  I think we want to
> skip low addresses even if processes are permitted to use those.
> (Permitted does not mean encouraged.)  So how about ASCII_ARMOR_MIN_ADDR
> 0x19000 (100 KB) when !CONFIG_VM86?

Are you talking about safety with NULL pointer dereferencing?


> > +	/* We ALWAYS start from the beginning as base addresses
> > +	 * with zero high bits is a valued resource */
> 
> s/valued/scarce and valuable/
> 
> > +		 * If kernel.randomize_va_space < 2, the executable is build as
> 
> s/build/built/

Right, thank you!

-- 
Vasiliy

^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [kernel-hardening] [RFC] x86, mm: start mmap allocation for libs from low addresses
  2011-09-07  9:34                   ` Vasiliy Kulikov
@ 2011-09-07  9:43                     ` Solar Designer
  2011-09-07  9:55                       ` Vasiliy Kulikov
  0 siblings, 1 reply; 47+ messages in thread
From: Solar Designer @ 2011-09-07  9:43 UTC (permalink / raw)
  To: kernel-hardening

Vasiliy,

On Wed, Sep 07, 2011 at 01:34:11PM +0400, Vasiliy Kulikov wrote:
> On Wed, Sep 07, 2011 at 13:30 +0400, Solar Designer wrote:
> > What if mmap_min_addr is set really low, or is even 0?  I think we want to
> > skip low addresses even if processes are permitted to use those.
> > (Permitted does not mean encouraged.)  So how about ASCII_ARMOR_MIN_ADDR
> > 0x19000 (100 KB) when !CONFIG_VM86?
> 
> Are you talking about safety with NULL pointer dereferencing?

Yes.  Besides deliberate attacks, we want inadvertent NULL pointer
dereferences to be detected by the kernel as such.  This wouldn't happen
if the kernel itself happens to map something at NULL in almost every
process just because someone set mmap_min_addr to 0 e.g. to be able to
occasionally run a certain userspace app that needs that.

Then, there may be attack scenarios where the attacker is not able to
invoke mmap() in the process via which the attack on the kernel is
initiated - e.g., some daemon process over which the attacker has only
partial control via a certain protocol.  Thus, low mmap_min_addr
wouldn't be directly exploitable by the attacker, and we shouldn't be
making matters worse with this patch.
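
As a small illustration (the structure and offsets below are
hypothetical, not from the patch), consider how a field access through
a NULL pointer lands at a low address:

/* Hypothetical structure, roughly 40 KiB in size. */
struct request {
	char	header[64];
	char	payload[40 * 1024];
	int	status;			/* about 40 KiB past the struct start */
};

int handle(struct request *req)
{
	/*
	 * If req is NULL, this loads from address 0xa040 (64 + 40 * 1024).
	 * With mmap_min_addr at 0 and a library mapped that low, the load
	 * can silently hit mapped memory instead of faulting, hiding the
	 * bug or making it easier to exploit.
	 */
	return req->status;
}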

Alexander


* Re: [kernel-hardening] [RFC] x86, mm: start mmap allocation for libs from low addresses
  2011-09-07  9:43                     ` Solar Designer
@ 2011-09-07  9:55                       ` Vasiliy Kulikov
  2011-09-07 10:16                         ` Solar Designer
  0 siblings, 1 reply; 47+ messages in thread
From: Vasiliy Kulikov @ 2011-09-07  9:55 UTC (permalink / raw)
  To: kernel-hardening

Solar,

On Wed, Sep 07, 2011 at 13:43 +0400, Solar Designer wrote:
> On Wed, Sep 07, 2011 at 01:34:11PM +0400, Vasiliy Kulikov wrote:
> > On Wed, Sep 07, 2011 at 13:30 +0400, Solar Designer wrote:
> > > What if mmap_min_addr is set really low, or is even 0?  I think we want to
> > > skip low addresses even if processes are permitted to use those.
> > > (Permitted does not mean encouraged.)  So how about ASCII_ARMOR_MIN_ADDR
> > > 0x19000 (100 KB) when !CONFIG_VM86?
> > 
> > > Are you talking about safety against NULL pointer dereferences?
> 
> Yes.

OK, fully agree.  But why 100 KB?  How about 0x10000 (64 KB) instead?  It
looks nicer and less magic.

-- 
Vasiliy


* Re: [kernel-hardening] [RFC] x86, mm: start mmap allocation for libs from low addresses
  2011-09-07  9:55                       ` Vasiliy Kulikov
@ 2011-09-07 10:16                         ` Solar Designer
  2011-09-07 11:01                           ` Vasiliy Kulikov
  0 siblings, 1 reply; 47+ messages in thread
From: Solar Designer @ 2011-09-07 10:16 UTC (permalink / raw)
  To: kernel-hardening

On Wed, Sep 07, 2011 at 01:55:08PM +0400, Vasiliy Kulikov wrote:
> OK, fully agree.  But why 100 KB?  How about 0x10000 (64 KB) instead?  It
> looks nicer and less magic.

Well, on Owl we have mmap_min_addr at 96 KB, which is sufficient e.g. in
case we have a struct field offset not larger than 32 KB and the field
itself is an array indexed by a 16-bit value.  Or if the field offset is
not larger than 64 KB and the index is a signed 16-bit value.

100 KB is a very cheap enhancement of the above, also allowing for two
levels of indirection (up to one 16-bit signed and one 16-bit unsigned)
relative to a fixed offset that fits in 4 KB.

Maybe we should move from 96 KB to 100 KB for Owl's mmap_min_addr
default.  Or maybe we should use 132 KB (4+64+64).

Oh, this assumes arrays of char, or our 16-bit variable being a byte
offset rather than an index.  132 KB would also support arrays of 16-bit
words, and even 16-bit signed indexes into arrays of 32-bit words.

OK, maybe I am imagining these possibilities, but to me these values
feel a little bit more reasonable than a mere 64 KB, which might be
just insufficient e.g. if we have a 16-bit unsigned byte offset variable
and the array itself is a struct field.  Even 68 KB would be a lot more
likely to help then.
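
Spelling the arithmetic out (the macro names below are made up for
illustration and do not appear in the patch):

#define KB(x)	((x) * 1024UL)

/* 32 KB field offset + unsigned 16-bit byte index: just under 96 KB */
#define REACH_A	(KB(32) + 0xffffUL)			/*  98303 */

/* 4 KB fixed offset + one signed and one unsigned 16-bit step: ~100 KB */
#define REACH_B	(KB(4) + 0x7fffUL + 0xffffUL)		/* 102398 */

/* 4 KB fixed offset + two full 64 KB steps: 132 KB exactly */
#define REACH_C	(KB(4) + KB(64) + KB(64))		/* 135168 */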

Alexander


* Re: [kernel-hardening] [RFC] x86, mm: start mmap allocation for libs from low addresses
  2011-09-07 10:16                         ` Solar Designer
@ 2011-09-07 11:01                           ` Vasiliy Kulikov
  0 siblings, 0 replies; 47+ messages in thread
From: Vasiliy Kulikov @ 2011-09-07 11:01 UTC (permalink / raw)
  To: kernel-hardening

Solar,

On Wed, Sep 07, 2011 at 14:16 +0400, Solar Designer wrote:
> On Wed, Sep 07, 2011 at 01:55:08PM +0400, Vasiliy Kulikov wrote:
> > OK, fully agree.  But why 100 KB?  How about 0x10000 (64 KB) instead?  It
> > looks nicer and less magic.
> 
> Well, on Owl we have mmap_min_addr at 96 KB, which is sufficient e.g. in
> case we have a struct field offset not larger than 32 KB and the field
> itself is an array indexed by a 16-bit value.  Or if the field offset is
> not larger than 64 KB and the index is a signed 16-bit value.
> 
> 100 KB is a very cheap enhancement of the above, also allowing for two
> levels of indirection (up to one 16-bit signed and one 16-bit unsigned)
> relative to a fixed offset that fits in 4 KB.
> 
> Maybe we should move from 96 KB to 100 KB for Owl's mmap_min_addr
> default.  Or maybe we should use 132 KB (4+64+64).
> 
> Oh, this assumes arrays of char, or our 16-bit variable being a byte
> offset rather than an index.  132 KB would also support arrays of 16-bit
> words, and even 16-bit signed indexes into arrays of 32-bit words.
> 
> OK, maybe I am imagining these possibilities, but to me these values
> feel a little bit more reasonable than a mere 64 KB, which might be
> just insufficient e.g. if we have a 16-bit unsigned byte offset variable
> and the array itself is a struct field.  Even 68 KB would be a lot more
> likely to help then.

This calculation makes sense IMHO, but it assumes a rather infrequent
scenario, and I definitely don't want to start this discussion on LKML
(at least not now).  0x20000 (128 KB) should be enough for our needs
(the byte at the 128 KB boundary belongs to an ELF header, which is not
writable), and it doesn't look like a very magic number (any magic
should be described in a comment, which we don't do exhaustively even
in the rest of the patch ;) ).

So, I'll proceed with 0x20000.
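
Assuming the !CONFIG_VM86 branch simply gets the new value, the macro
would presumably end up along these lines (a sketch of the value settled
on here, not a quote of a later patch revision):

#ifdef CONFIG_VM86
#define ASCII_ARMOR_MIN_ADDR	0x00110000	/* 1 MiB + 64 KiB, unchanged */
#else
#define ASCII_ARMOR_MIN_ADDR	0x20000		/* 128 KB == 128 * 1024 */
#endif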

Thanks,

-- 
Vasiliy


end of thread (newest message: 2011-09-07 11:01 UTC)

Thread overview: 47+ messages
2011-08-12 10:29 [RFC] x86, mm: start mmap allocation for libs from low addresses Vasiliy Kulikov
2011-08-12 10:29 ` [kernel-hardening] " Vasiliy Kulikov
2011-08-12 10:29 ` Vasiliy Kulikov
2011-08-12 10:58 ` [kernel-hardening] " Solar Designer
2011-08-12 11:05   ` Vasiliy Kulikov
2011-08-25 17:19   ` Vasiliy Kulikov
2011-08-25 17:39     ` Solar Designer
2011-09-02 18:29     ` Solar Designer
2011-09-03 11:18       ` Vasiliy Kulikov
2011-09-03 23:57         ` Solar Designer
2011-09-05 12:46           ` Vasiliy Kulikov
2011-09-06  5:05             ` Solar Designer
2011-09-07  9:09               ` Vasiliy Kulikov
2011-09-07  9:30                 ` Solar Designer
2011-09-07  9:34                   ` Vasiliy Kulikov
2011-09-07  9:43                     ` Solar Designer
2011-09-07  9:55                       ` Vasiliy Kulikov
2011-09-07 10:16                         ` Solar Designer
2011-09-07 11:01                           ` Vasiliy Kulikov
2011-09-02 23:34     ` Solar Designer
2011-09-03 12:12       ` Vasiliy Kulikov
2011-09-03 23:40         ` Solar Designer
2011-09-04  7:21         ` Vasiliy Kulikov
2011-08-12 23:19 ` H. Peter Anvin
2011-08-12 23:19   ` [kernel-hardening] " H. Peter Anvin
2011-08-12 23:19   ` H. Peter Anvin
2011-08-13  6:26   ` Vasiliy Kulikov
2011-08-13  6:26     ` [kernel-hardening] " Vasiliy Kulikov
2011-08-13  6:26     ` Vasiliy Kulikov
2011-08-16  9:05   ` Vasiliy Kulikov
2011-08-16  9:05     ` [kernel-hardening] " Vasiliy Kulikov
2011-08-16  9:05     ` Vasiliy Kulikov
2011-08-22 10:17     ` Vasiliy Kulikov
2011-08-22 10:17       ` [kernel-hardening] " Vasiliy Kulikov
2011-08-22 10:17       ` Vasiliy Kulikov
2011-08-22 17:24       ` H. Peter Anvin
2011-08-22 17:24         ` [kernel-hardening] " H. Peter Anvin
2011-08-22 17:24         ` H. Peter Anvin
2011-08-22 20:14         ` Vasiliy Kulikov
2011-08-22 20:14           ` [kernel-hardening] " Vasiliy Kulikov
2011-08-22 20:14           ` Vasiliy Kulikov
2011-08-22 20:17           ` H. Peter Anvin
2011-08-22 20:17             ` [kernel-hardening] " H. Peter Anvin
2011-08-22 20:17             ` H. Peter Anvin
2011-08-23  6:41             ` Vasiliy Kulikov
2011-08-23  6:41               ` [kernel-hardening] " Vasiliy Kulikov
2011-08-23  6:41               ` Vasiliy Kulikov
