All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 0/1] Fixup write permission of TLB on powerpc e500 core
@ 2011-07-15  8:07 ` Shan Hai
  0 siblings, 0 replies; 138+ messages in thread
From: Shan Hai @ 2011-07-15  8:07 UTC (permalink / raw)
  To: benh, paulus
  Cc: tglx, walken, dhowells, cmetcalf, tony.luck, akpm, a.p.zijlstra,
	linuxppc-dev, linux-kernel

The following test case can reveal a bug in futex_lock_pi().

BUG: On FUTEX_LOCK_PI, there is an infinite loop in futex_lock_pi()
	on the PowerPC e500 core.
Cause: The Linux kernel on the e500 core has no write permission on
	the COW page; refer to the head comment of the following test code.
 
ftrace on test case:
[000]   353.990181: futex_lock_pi_atomic <-futex_lock_pi
[000]   353.990185: cmpxchg_futex_value_locked <-futex_lock_pi_atomic
[snip]
[000]   353.990191: do_page_fault <-handle_page_fault
[000]   353.990192: bad_page_fault <-handle_page_fault
[000]   353.990193: search_exception_tables <-bad_page_fault
[snip]
[000]   353.990199: get_user_pages <-fault_in_user_writeable
[snip]
[000]   353.990208: mark_page_accessed <-follow_page
[000]   353.990222: futex_lock_pi_atomic <-futex_lock_pi
[snip]
[000]   353.990230: cmpxchg_futex_value_locked <-futex_lock_pi_atomic
[ a loop occurs here ]


/* 
 * A test case for revealing an infinite loop in futex_lock_pi().
 * - there are 2 processes, a parent and a child
 * - the parent process allocates and initializes a pthread_mutex MUTEX in a 
 *	shared memory region
 * - the parent process holds the MUTEX and performs a long-running computation
 * - the child process tries to acquire the MUTEX while the parent is holding
 *	it and traps into the kernel to wait on the MUTEX because of contention
 * - the kernel loops in futex_lock_pi()
 * - the 'top' command shows that system CPU usage is 100%
 */

#include <errno.h>
#include <fcntl.h>
#include <pthread.h>
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/ipc.h>
#include <sys/mman.h>
#include <sys/shm.h>
#include <sys/types.h>
#include <unistd.h>

/* Argument to shmem_init()/shmem_mutex_init(): SHM_INIT zero-fills the
 * mapping and initializes the mutex, SHM_GET only attaches to it. */
enum { SHM_INIT, SHM_GET };
/* Role passed to long_running_task(); selects the sleep duration. */
enum { PARENT, CHILD };

/* Address handed to mmap() together with MAP_FIXED. */
#define FIXED_MMAP_ADDR 0x20000000
/* Size of the shared mapping in bytes (0x2000000 = 32 MiB). */
#define MMAP_SIZE	0x2000000

/* Descriptor returned by shm_open(); closed in sig_handler(). */
static int shmid;
/* Name of the POSIX shared memory object, filled in by shmem_init(). */
static char shm_name[100];
/* Base delay in microseconds; callers scale it before passing it to usleep(). */
static int sleep_period = 100000;

/*
 * Open (creating it if necessary) the POSIX shared memory object
 * "/shmem_1234", size it to MMAP_SIZE bytes and map it read/write at
 * FIXED_MMAP_ADDR with MAP_SHARED | MAP_FIXED.
 *
 * flag: SHM_INIT zero-fills the mapping once it is established; any other
 *       value (SHM_GET) leaves the contents untouched.
 *
 * Returns the mapped address, or NULL on failure.  On success the
 * descriptor stays open in the file-scope 'shmid'.  On every failure after
 * a successful shm_open() the descriptor is closed before returning.
 */
void * shmem_init(int flag)
{
	/* Go through unsigned long so the integer-to-pointer cast is size-safe. */
	void *start = (void *)(unsigned long)FIXED_MMAP_ADDR;
	size_t memory_size = MMAP_SIZE;
	mode_t mode = 0666;
	void *addr;

	snprintf(shm_name, sizeof(shm_name), "/shmem_1234");

	shmid = shm_open(shm_name, O_RDWR | O_EXCL | O_CREAT | O_TRUNC, mode);
	if (shmid < 0) {
		if (errno != EEXIST) {
			printf("failed to shm_open, err=%s\n", strerror(errno));
			return NULL;
		}

		/* The object already exists: open it without O_CREAT | O_EXCL. */
		printf("shm_open: %s\n", strerror(errno));
		shmid = shm_open(shm_name, O_RDWR, mode);
		if (shmid < 0) {
			printf("failed to shm_open, err=%s\n", strerror(errno));
			return NULL;
		}
	}

	if (fcntl(shmid, F_SETFD, FD_CLOEXEC) < 0) {
		printf("fcntl: %s\n", strerror(errno));
		close(shmid);
		return NULL;
	}

	if (ftruncate(shmid, (off_t)memory_size) < 0) {
		printf("ftruncate: %s\n", strerror(errno));
		close(shmid);
		return NULL;
	}

	addr = mmap(start, memory_size, PROT_READ | PROT_WRITE,
			MAP_SHARED | MAP_FIXED, shmid, 0);
	if (addr == MAP_FAILED) {
		printf("mmap: %s\n", strerror(errno));
		close(shmid);
		shm_unlink(shm_name);
		return NULL;
	}

	if (flag == SHM_INIT)
		memset(addr, 0, memory_size);

	/* With MAP_FIXED a successful mmap() returns the requested address. */
	return addr;
}

/*
 * Map the shared memory region via shmem_init() and treat its start as a
 * pthread_mutex_t.
 *
 * flag: with SHM_INIT the mutex is initialized as process-shared,
 *       priority-inheriting, stalled-robust and error-checking; with any
 *       other value the mapping is returned without touching the mutex.
 *
 * Returns the mutex pointer, or NULL if the mapping or the mutex
 * initialization failed.
 */
pthread_mutex_t * shmem_mutex_init(int flag)
{
	pthread_mutex_t *pmutex = shmem_init(flag);
	pthread_mutexattr_t attr;
	int err;

	/* Never hand a failed mapping to pthread_mutex_init(). */
	if (pmutex == NULL)
		return NULL;

	if (flag != SHM_INIT)
		return pmutex;

	err = pthread_mutexattr_init(&attr);
	if (err != 0) {
		printf("Init mutex failed, err=%s\n", strerror(err));
		return NULL;
	}

	pthread_mutexattr_setpshared(&attr, PTHREAD_PROCESS_SHARED);
	pthread_mutexattr_setprotocol(&attr, PTHREAD_PRIO_INHERIT);
	pthread_mutexattr_setrobust_np(&attr, PTHREAD_MUTEX_STALLED_NP);
	pthread_mutexattr_settype(&attr, PTHREAD_MUTEX_ERRORCHECK);

	/* pthread functions report failure through their return value, not errno. */
	err = pthread_mutex_init(pmutex, &attr);

	/* The attribute object is no longer needed on either path. */
	pthread_mutexattr_destroy(&attr);

	if (err != 0) {
		printf("Init mutex failed, err=%s\n", strerror(err));
		return NULL;
	}

	return pmutex;
}

/*
 * Stand-in for a long computation: sleep for a role-dependent multiple of
 * sleep_period (5x for PARENT, 3x otherwise), then print the caller's pid
 * together with a counter that advances on every call and wraps at 100.
 */
void long_running_task(int flag)
{
	static int counter = 0;
	int multiplier;

	switch (flag) {
	case PARENT:
		multiplier = 5;
		break;
	default:
		multiplier = 3;
		break;
	}
	usleep(multiplier*sleep_period);

	counter++;
	counter %= 100;
	printf("%d: completed %d computing\n", getpid(), counter);
}

/*
 * Signal handler: close the shared memory descriptor, remove the shared
 * memory object and terminate the process with status 0.
 *
 * signum: the delivered signal number (unused).
 */
void sig_handler(int signum)
{
	(void)signum;

	close(shmid);
	/* NOTE(review): shm_unlink() is not on the POSIX async-signal-safe
	 * list (unlink() is); kept here to preserve the cleanup behavior. */
	shm_unlink(shm_name);

	/* exit() flushes stdio and runs atexit handlers, which is not
	 * async-signal-safe; the signal may arrive while the main loop is
	 * inside printf().  _exit() terminates without touching stdio. */
	_exit(0);
}

/*
 * Endlessly acquire the shared mutex, run long_running_task() for the given
 * role while holding it, and release it again.  Never returns; exits with
 * EXIT_FAILURE if locking or unlocking reports an error.
 */
static void contend_for_mutex(pthread_mutex_t *mutex, int role)
{
	int err;

	while (1) {
		printf("%d: try to hold the lock\n", getpid());
		err = pthread_mutex_lock(mutex);
		if (err != 0) {
			printf("%d: pthread_mutex_lock: %s\n", getpid(),
			       strerror(err));
			exit(EXIT_FAILURE);
		}
		printf("%d: got the lock\n", getpid());
		long_running_task(role);
		err = pthread_mutex_unlock(mutex);
		if (err != 0) {
			printf("%d: pthread_mutex_unlock: %s\n", getpid(),
			       strerror(err));
			exit(EXIT_FAILURE);
		}
		printf("%d: released the lock\n", getpid());
	}
}

/*
 * Install the SIGUSR1 handler, fork, and let parent and child contend for
 * the mutex in shared memory.  The parent creates and initializes the mutex
 * (SHM_INIT); the child sleeps for sleep_period first and then only attaches
 * to it (SHM_GET).
 */
int main(int argc, char *argv[])
{
	pthread_mutex_t *mutex;
	pid_t pid;

	(void)argc;
	(void)argv;

	signal(SIGUSR1, sig_handler);

	pid = fork();
	if (pid < 0) {
		/* Without a child there is no contention to reproduce. */
		printf("fork: %s\n", strerror(errno));
		exit(EXIT_FAILURE);
	}

	if (pid > 0) { /* parent process */
		mutex = shmem_mutex_init(SHM_INIT);
		if (mutex == NULL) {
			printf("failed to get the shmem_mutex\n");
			exit(EXIT_FAILURE);
		}

		contend_for_mutex(mutex, PARENT);
	} else { /* child process */
		usleep(sleep_period);
		mutex = shmem_mutex_init(SHM_GET);
		if (mutex == NULL) {
			printf("failed to get the shmem_mutex\n");
			exit(EXIT_FAILURE);
		}

		contend_for_mutex(mutex, CHILD);
	}

	return 0;
}

---
 arch/powerpc/include/asm/futex.h |   11 ++++++++++-
 arch/powerpc/include/asm/tlb.h   |   25 +++++++++++++++++++++++++
 2 files changed, 35 insertions(+), 1 deletions(-)


^ permalink raw reply	[flat|nested] 138+ messages in thread

* [PATCH 0/1] Fixup write permission of TLB on powerpc e500 core
@ 2011-07-15  8:07 ` Shan Hai
  0 siblings, 0 replies; 138+ messages in thread
From: Shan Hai @ 2011-07-15  8:07 UTC (permalink / raw)
  To: benh, paulus
  Cc: tony.luck, a.p.zijlstra, linux-kernel, cmetcalf, dhowells, tglx,
	walken, linuxppc-dev, akpm

The following test case can reveal a bug in futex_lock_pi().

BUG: On FUTEX_LOCK_PI, there is an infinite loop in futex_lock_pi()
	on the PowerPC e500 core.
Cause: The Linux kernel on the e500 core has no write permission on
	the COW page; refer to the head comment of the following test code.
 
ftrace on test case:
[000]   353.990181: futex_lock_pi_atomic <-futex_lock_pi
[000]   353.990185: cmpxchg_futex_value_locked <-futex_lock_pi_atomic
[snip]
[000]   353.990191: do_page_fault <-handle_page_fault
[000]   353.990192: bad_page_fault <-handle_page_fault
[000]   353.990193: search_exception_tables <-bad_page_fault
[snip]
[000]   353.990199: get_user_pages <-fault_in_user_writeable
[snip]
[000]   353.990208: mark_page_accessed <-follow_page
[000]   353.990222: futex_lock_pi_atomic <-futex_lock_pi
[snip]
[000]   353.990230: cmpxchg_futex_value_locked <-futex_lock_pi_atomic
[ a loop occurs here ]


/* 
 * A test case for revealing an infinite loop in futex_lock_pi().
 * - there are 2 processes, a parent and a child
 * - the parent process allocates and initializes a pthread_mutex MUTEX in a 
 *	shared memory region
 * - the parent process holds the MUTEX and performs a long-running computation
 * - the child process tries to acquire the MUTEX while the parent is holding
 *	it and traps into the kernel to wait on the MUTEX because of contention
 * - the kernel loops in futex_lock_pi()
 * - the 'top' command shows that system CPU usage is 100%
 */

#include <errno.h>
#include <fcntl.h>
#include <pthread.h>
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/ipc.h>
#include <sys/mman.h>
#include <sys/shm.h>
#include <sys/types.h>
#include <unistd.h>

/* Argument to shmem_init()/shmem_mutex_init(): SHM_INIT zero-fills the
 * mapping and initializes the mutex, SHM_GET only attaches to it. */
enum { SHM_INIT, SHM_GET };
/* Role passed to long_running_task(); selects the sleep duration. */
enum { PARENT, CHILD };

/* Address handed to mmap() together with MAP_FIXED. */
#define FIXED_MMAP_ADDR 0x20000000
/* Size of the shared mapping in bytes (0x2000000 = 32 MiB). */
#define MMAP_SIZE	0x2000000

/* Descriptor returned by shm_open(); closed in sig_handler(). */
static int shmid;
/* Name of the POSIX shared memory object, filled in by shmem_init(). */
static char shm_name[100];
/* Base delay in microseconds; callers scale it before passing it to usleep(). */
static int sleep_period = 100000;

/*
 * Open (creating it if necessary) the POSIX shared memory object
 * "/shmem_1234", size it to MMAP_SIZE bytes and map it read/write at
 * FIXED_MMAP_ADDR with MAP_SHARED | MAP_FIXED.
 *
 * flag: SHM_INIT zero-fills the mapping once it is established; any other
 *       value (SHM_GET) leaves the contents untouched.
 *
 * Returns the mapped address, or NULL on failure.  On success the
 * descriptor stays open in the file-scope 'shmid'.  On every failure after
 * a successful shm_open() the descriptor is closed before returning.
 */
void * shmem_init(int flag)
{
	/* Go through unsigned long so the integer-to-pointer cast is size-safe. */
	void *start = (void *)(unsigned long)FIXED_MMAP_ADDR;
	size_t memory_size = MMAP_SIZE;
	mode_t mode = 0666;
	void *addr;

	snprintf(shm_name, sizeof(shm_name), "/shmem_1234");

	shmid = shm_open(shm_name, O_RDWR | O_EXCL | O_CREAT | O_TRUNC, mode);
	if (shmid < 0) {
		if (errno != EEXIST) {
			printf("failed to shm_open, err=%s\n", strerror(errno));
			return NULL;
		}

		/* The object already exists: open it without O_CREAT | O_EXCL. */
		printf("shm_open: %s\n", strerror(errno));
		shmid = shm_open(shm_name, O_RDWR, mode);
		if (shmid < 0) {
			printf("failed to shm_open, err=%s\n", strerror(errno));
			return NULL;
		}
	}

	if (fcntl(shmid, F_SETFD, FD_CLOEXEC) < 0) {
		printf("fcntl: %s\n", strerror(errno));
		close(shmid);
		return NULL;
	}

	if (ftruncate(shmid, (off_t)memory_size) < 0) {
		printf("ftruncate: %s\n", strerror(errno));
		close(shmid);
		return NULL;
	}

	addr = mmap(start, memory_size, PROT_READ | PROT_WRITE,
			MAP_SHARED | MAP_FIXED, shmid, 0);
	if (addr == MAP_FAILED) {
		printf("mmap: %s\n", strerror(errno));
		close(shmid);
		shm_unlink(shm_name);
		return NULL;
	}

	if (flag == SHM_INIT)
		memset(addr, 0, memory_size);

	/* With MAP_FIXED a successful mmap() returns the requested address. */
	return addr;
}

/*
 * Map the shared memory region via shmem_init() and treat its start as a
 * pthread_mutex_t.
 *
 * flag: with SHM_INIT the mutex is initialized as process-shared,
 *       priority-inheriting, stalled-robust and error-checking; with any
 *       other value the mapping is returned without touching the mutex.
 *
 * Returns the mutex pointer, or NULL if the mapping or the mutex
 * initialization failed.
 */
pthread_mutex_t * shmem_mutex_init(int flag)
{
	pthread_mutex_t *pmutex = shmem_init(flag);
	pthread_mutexattr_t attr;
	int err;

	/* Never hand a failed mapping to pthread_mutex_init(). */
	if (pmutex == NULL)
		return NULL;

	if (flag != SHM_INIT)
		return pmutex;

	err = pthread_mutexattr_init(&attr);
	if (err != 0) {
		printf("Init mutex failed, err=%s\n", strerror(err));
		return NULL;
	}

	pthread_mutexattr_setpshared(&attr, PTHREAD_PROCESS_SHARED);
	pthread_mutexattr_setprotocol(&attr, PTHREAD_PRIO_INHERIT);
	pthread_mutexattr_setrobust_np(&attr, PTHREAD_MUTEX_STALLED_NP);
	pthread_mutexattr_settype(&attr, PTHREAD_MUTEX_ERRORCHECK);

	/* pthread functions report failure through their return value, not errno. */
	err = pthread_mutex_init(pmutex, &attr);

	/* The attribute object is no longer needed on either path. */
	pthread_mutexattr_destroy(&attr);

	if (err != 0) {
		printf("Init mutex failed, err=%s\n", strerror(err));
		return NULL;
	}

	return pmutex;
}

/*
 * Stand-in for a long computation: sleep for a role-dependent multiple of
 * sleep_period (5x for PARENT, 3x otherwise), then print the caller's pid
 * together with a counter that advances on every call and wraps at 100.
 */
void long_running_task(int flag)
{
	static int counter = 0;
	int multiplier;

	switch (flag) {
	case PARENT:
		multiplier = 5;
		break;
	default:
		multiplier = 3;
		break;
	}
	usleep(multiplier*sleep_period);

	counter++;
	counter %= 100;
	printf("%d: completed %d computing\n", getpid(), counter);
}

/*
 * Signal handler: close the shared memory descriptor, remove the shared
 * memory object and terminate the process with status 0.
 *
 * signum: the delivered signal number (unused).
 */
void sig_handler(int signum)
{
	(void)signum;

	close(shmid);
	/* NOTE(review): shm_unlink() is not on the POSIX async-signal-safe
	 * list (unlink() is); kept here to preserve the cleanup behavior. */
	shm_unlink(shm_name);

	/* exit() flushes stdio and runs atexit handlers, which is not
	 * async-signal-safe; the signal may arrive while the main loop is
	 * inside printf().  _exit() terminates without touching stdio. */
	_exit(0);
}

/*
 * Endlessly acquire the shared mutex, run long_running_task() for the given
 * role while holding it, and release it again.  Never returns; exits with
 * EXIT_FAILURE if locking or unlocking reports an error.
 */
static void contend_for_mutex(pthread_mutex_t *mutex, int role)
{
	int err;

	while (1) {
		printf("%d: try to hold the lock\n", getpid());
		err = pthread_mutex_lock(mutex);
		if (err != 0) {
			printf("%d: pthread_mutex_lock: %s\n", getpid(),
			       strerror(err));
			exit(EXIT_FAILURE);
		}
		printf("%d: got the lock\n", getpid());
		long_running_task(role);
		err = pthread_mutex_unlock(mutex);
		if (err != 0) {
			printf("%d: pthread_mutex_unlock: %s\n", getpid(),
			       strerror(err));
			exit(EXIT_FAILURE);
		}
		printf("%d: released the lock\n", getpid());
	}
}

/*
 * Install the SIGUSR1 handler, fork, and let parent and child contend for
 * the mutex in shared memory.  The parent creates and initializes the mutex
 * (SHM_INIT); the child sleeps for sleep_period first and then only attaches
 * to it (SHM_GET).
 */
int main(int argc, char *argv[])
{
	pthread_mutex_t *mutex;
	pid_t pid;

	(void)argc;
	(void)argv;

	signal(SIGUSR1, sig_handler);

	pid = fork();
	if (pid < 0) {
		/* Without a child there is no contention to reproduce. */
		printf("fork: %s\n", strerror(errno));
		exit(EXIT_FAILURE);
	}

	if (pid > 0) { /* parent process */
		mutex = shmem_mutex_init(SHM_INIT);
		if (mutex == NULL) {
			printf("failed to get the shmem_mutex\n");
			exit(EXIT_FAILURE);
		}

		contend_for_mutex(mutex, PARENT);
	} else { /* child process */
		usleep(sleep_period);
		mutex = shmem_mutex_init(SHM_GET);
		if (mutex == NULL) {
			printf("failed to get the shmem_mutex\n");
			exit(EXIT_FAILURE);
		}

		contend_for_mutex(mutex, CHILD);
	}

	return 0;
}

---
 arch/powerpc/include/asm/futex.h |   11 ++++++++++-
 arch/powerpc/include/asm/tlb.h   |   25 +++++++++++++++++++++++++
 2 files changed, 35 insertions(+), 1 deletions(-)

^ permalink raw reply	[flat|nested] 138+ messages in thread

* [PATCH 1/1] Fixup write permission of TLB on powerpc e500 core
  2011-07-15  8:07 ` Shan Hai
@ 2011-07-15  8:07   ` Shan Hai
  -1 siblings, 0 replies; 138+ messages in thread
From: Shan Hai @ 2011-07-15  8:07 UTC (permalink / raw)
  To: benh, paulus
  Cc: tglx, walken, dhowells, cmetcalf, tony.luck, akpm, a.p.zijlstra,
	linuxppc-dev, linux-kernel, Shan Hai

By default the kernel has no write permission on COW pages on the e500 core.
This causes an endless loop in futex_lock_pi(), because the futex code assumes
that the kernel has write permission on COW pages. Grant the kernel write
permission on COW pages when an access-violation page fault occurs.

Signed-off-by: Shan Hai <haishan.bai@gmail.com>
---
 arch/powerpc/include/asm/futex.h |   11 ++++++++++-
 arch/powerpc/include/asm/tlb.h   |   25 +++++++++++++++++++++++++
 2 files changed, 35 insertions(+), 1 deletions(-)

diff --git a/arch/powerpc/include/asm/futex.h b/arch/powerpc/include/asm/futex.h
index c94e4a3..54c3e74 100644
--- a/arch/powerpc/include/asm/futex.h
+++ b/arch/powerpc/include/asm/futex.h
@@ -8,6 +8,7 @@
 #include <asm/errno.h>
 #include <asm/synch.h>
 #include <asm/asm-compat.h>
+#include <asm/tlb.h>
 
 #define __futex_atomic_op(insn, ret, oldval, uaddr, oparg) \
   __asm__ __volatile ( \
@@ -113,7 +114,15 @@ futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr,
         : "cc", "memory");
 
 	*uval = prev;
-        return ret;
+
+	/* Futex assumes the kernel has permission to write to
+	 * COW pages, grant the kernel write permission on COW
+	 * pages because it has none by default.
+	 */
+	if (ret == -EFAULT)
+		__tlb_fixup_write_permission(current->mm, (unsigned long)uaddr);
+
+	return ret;
 }
 
 #endif /* __KERNEL__ */
diff --git a/arch/powerpc/include/asm/tlb.h b/arch/powerpc/include/asm/tlb.h
index e2b428b..3863c6a 100644
--- a/arch/powerpc/include/asm/tlb.h
+++ b/arch/powerpc/include/asm/tlb.h
@@ -45,5 +45,30 @@ static inline void __tlb_remove_tlb_entry(struct mmu_gather *tlb, pte_t *ptep,
 #endif
 }
 
+/* Grant write permission to the kernel on a page. */
+static inline void __tlb_fixup_write_permission(struct mm_struct *mm,
+						unsigned long address)
+{
+#if defined(CONFIG_FSL_BOOKE)
+	/* Grant write permission to the kernel on a page by setting TLB.SW
+	 * bit, the bit setting operation is tricky here, calling
+	 * handle_mm_fault with FAULT_FLAG_WRITE causes _PAGE_DIRTY bit of
+	 * the pte to be set, the _PAGE_DIRTY of the pte is translated into
+	 * TLB.SW on Powerpc e500 core.
+	 */
+
+	struct vm_area_struct *vma;
+
+	vma = find_vma(mm, address);
+	if (likely(vma)) {
+		/* only fixup present page */
+		if (follow_page(vma, address, FOLL_WRITE)) {
+			handle_mm_fault(mm, vma, address, FAULT_FLAG_WRITE);
+			flush_tlb_page(vma, address);
+		}
+	}
+#endif
+}
+
 #endif /* __KERNEL__ */
 #endif /* __ASM_POWERPC_TLB_H */
-- 
1.7.1


^ permalink raw reply	[flat|nested] 138+ messages in thread

* [PATCH 1/1] Fixup write permission of TLB on powerpc e500 core
@ 2011-07-15  8:07   ` Shan Hai
  0 siblings, 0 replies; 138+ messages in thread
From: Shan Hai @ 2011-07-15  8:07 UTC (permalink / raw)
  To: benh, paulus
  Cc: tony.luck, a.p.zijlstra, Shan Hai, linux-kernel, cmetcalf,
	dhowells, tglx, walken, linuxppc-dev, akpm

By default the kernel has no write permission on COW pages on the e500 core.
This causes an endless loop in futex_lock_pi(), because the futex code assumes
that the kernel has write permission on COW pages. Grant the kernel write
permission on COW pages when an access-violation page fault occurs.

Signed-off-by: Shan Hai <haishan.bai@gmail.com>
---
 arch/powerpc/include/asm/futex.h |   11 ++++++++++-
 arch/powerpc/include/asm/tlb.h   |   25 +++++++++++++++++++++++++
 2 files changed, 35 insertions(+), 1 deletions(-)

diff --git a/arch/powerpc/include/asm/futex.h b/arch/powerpc/include/asm/futex.h
index c94e4a3..54c3e74 100644
--- a/arch/powerpc/include/asm/futex.h
+++ b/arch/powerpc/include/asm/futex.h
@@ -8,6 +8,7 @@
 #include <asm/errno.h>
 #include <asm/synch.h>
 #include <asm/asm-compat.h>
+#include <asm/tlb.h>
 
 #define __futex_atomic_op(insn, ret, oldval, uaddr, oparg) \
   __asm__ __volatile ( \
@@ -113,7 +114,15 @@ futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr,
         : "cc", "memory");
 
 	*uval = prev;
-        return ret;
+
+	/* Futex assumes the kernel has permission to write to
+	 * COW pages, grant the kernel write permission on COW
+	 * pages because it has none by default.
+	 */
+	if (ret == -EFAULT)
+		__tlb_fixup_write_permission(current->mm, (unsigned long)uaddr);
+
+	return ret;
 }
 
 #endif /* __KERNEL__ */
diff --git a/arch/powerpc/include/asm/tlb.h b/arch/powerpc/include/asm/tlb.h
index e2b428b..3863c6a 100644
--- a/arch/powerpc/include/asm/tlb.h
+++ b/arch/powerpc/include/asm/tlb.h
@@ -45,5 +45,30 @@ static inline void __tlb_remove_tlb_entry(struct mmu_gather *tlb, pte_t *ptep,
 #endif
 }
 
+/* Grant write permission to the kernel on a page. */
+static inline void __tlb_fixup_write_permission(struct mm_struct *mm,
+						unsigned long address)
+{
+#if defined(CONFIG_FSL_BOOKE)
+	/* Grant write permission to the kernel on a page by setting TLB.SW
+	 * bit, the bit setting operation is tricky here, calling
+	 * handle_mm_fault with FAULT_FLAG_WRITE causes _PAGE_DIRTY bit of
+	 * the pte to be set, the _PAGE_DIRTY of the pte is translated into
+	 * TLB.SW on Powerpc e500 core.
+	 */
+
+	struct vm_area_struct *vma;
+
+	vma = find_vma(mm, address);
+	if (likely(vma)) {
+		/* only fixup present page */
+		if (follow_page(vma, address, FOLL_WRITE)) {
+			handle_mm_fault(mm, vma, address, FAULT_FLAG_WRITE);
+			flush_tlb_page(vma, address);
+		}
+	}
+#endif
+}
+
 #endif /* __KERNEL__ */
 #endif /* __ASM_POWERPC_TLB_H */
-- 
1.7.1

^ permalink raw reply	[flat|nested] 138+ messages in thread

* Re: [PATCH 0/1] Fixup write permission of TLB on powerpc e500 core
  2011-07-15  8:07 ` Shan Hai
@ 2011-07-15  8:20   ` Peter Zijlstra
  -1 siblings, 0 replies; 138+ messages in thread
From: Peter Zijlstra @ 2011-07-15  8:20 UTC (permalink / raw)
  To: Shan Hai
  Cc: benh, paulus, tglx, walken, dhowells, cmetcalf, tony.luck, akpm,
	linuxppc-dev, linux-kernel

On Fri, 2011-07-15 at 16:07 +0800, Shan Hai wrote:
> 
> The following test case could reveal a bug in the futex_lock_pi()
> 
> BUG: On FUTEX_LOCK_PI, there is a infinite loop in the futex_lock_pi() 
>         on Powerpc e500 core.
> Cause: The linux kernel on the e500 core has no write permission on
>         the COW page, refer the head comment of the following test code.
>  
> ftrace on test case:
> [000]   353.990181: futex_lock_pi_atomic <-futex_lock_pi
> [000]   353.990185: cmpxchg_futex_value_locked <-futex_lock_pi_atomic
> [snip]
> [000]   353.990191: do_page_fault <-handle_page_fault
> [000]   353.990192: bad_page_fault <-handle_page_fault
> [000]   353.990193: search_exception_tables <-bad_page_fault
> [snip]
> [000]   353.990199: get_user_pages <-fault_in_user_writeable
> [snip]
> [000]   353.990208: mark_page_accessed <-follow_page
> [000]   353.990222: futex_lock_pi_atomic <-futex_lock_pi
> [snip]
> [000]   353.990230: cmpxchg_futex_value_locked <-futex_lock_pi_atomic
> [ a loop occures here ]
> 


But but but but, that get_user_pages(.write=1, .force=0) should result
in a COW break, getting our own writable page.

What is this e500 thing smoking that this doesn't work?

^ permalink raw reply	[flat|nested] 138+ messages in thread

* Re: [PATCH 0/1] Fixup write permission of TLB on powerpc e500 core
@ 2011-07-15  8:20   ` Peter Zijlstra
  0 siblings, 0 replies; 138+ messages in thread
From: Peter Zijlstra @ 2011-07-15  8:20 UTC (permalink / raw)
  To: Shan Hai
  Cc: tony.luck, linux-kernel, cmetcalf, dhowells, paulus, tglx,
	walken, linuxppc-dev, akpm

On Fri, 2011-07-15 at 16:07 +0800, Shan Hai wrote:
> 
> The following test case could reveal a bug in the futex_lock_pi()
> 
> BUG: On FUTEX_LOCK_PI, there is a infinite loop in the futex_lock_pi() 
>         on Powerpc e500 core.
> Cause: The linux kernel on the e500 core has no write permission on
>         the COW page, refer the head comment of the following test code.
>  
> ftrace on test case:
> [000]   353.990181: futex_lock_pi_atomic <-futex_lock_pi
> [000]   353.990185: cmpxchg_futex_value_locked <-futex_lock_pi_atomic
> [snip]
> [000]   353.990191: do_page_fault <-handle_page_fault
> [000]   353.990192: bad_page_fault <-handle_page_fault
> [000]   353.990193: search_exception_tables <-bad_page_fault
> [snip]
> [000]   353.990199: get_user_pages <-fault_in_user_writeable
> [snip]
> [000]   353.990208: mark_page_accessed <-follow_page
> [000]   353.990222: futex_lock_pi_atomic <-futex_lock_pi
> [snip]
> [000]   353.990230: cmpxchg_futex_value_locked <-futex_lock_pi_atomic
> [ a loop occures here ]
> 


But but but but, that get_user_pages(.write=1, .force=0) should result
in a COW break, getting our own writable page.

What is this e500 thing smoking that this doesn't work?

^ permalink raw reply	[flat|nested] 138+ messages in thread

* Re: [PATCH 0/1] Fixup write permission of TLB on powerpc e500 core
  2011-07-15  8:20   ` Peter Zijlstra
@ 2011-07-15  8:38     ` MailingLists
  -1 siblings, 0 replies; 138+ messages in thread
From: MailingLists @ 2011-07-15  8:38 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: benh, paulus, tglx, walken, dhowells, cmetcalf, tony.luck, akpm,
	linuxppc-dev, linux-kernel

On 07/15/2011 04:20 PM, Peter Zijlstra wrote:
> On Fri, 2011-07-15 at 16:07 +0800, Shan Hai wrote:
>> The following test case could reveal a bug in the futex_lock_pi()
>>
>> BUG: On FUTEX_LOCK_PI, there is a infinite loop in the futex_lock_pi()
>>          on Powerpc e500 core.
>> Cause: The linux kernel on the e500 core has no write permission on
>>          the COW page, refer the head comment of the following test code.
>>
>> ftrace on test case:
>> [000]   353.990181: futex_lock_pi_atomic<-futex_lock_pi
>> [000]   353.990185: cmpxchg_futex_value_locked<-futex_lock_pi_atomic
>> [snip]
>> [000]   353.990191: do_page_fault<-handle_page_fault
>> [000]   353.990192: bad_page_fault<-handle_page_fault
>> [000]   353.990193: search_exception_tables<-bad_page_fault
>> [snip]
>> [000]   353.990199: get_user_pages<-fault_in_user_writeable
>> [snip]
>> [000]   353.990208: mark_page_accessed<-follow_page
>> [000]   353.990222: futex_lock_pi_atomic<-futex_lock_pi
>> [snip]
>> [000]   353.990230: cmpxchg_futex_value_locked<-futex_lock_pi_atomic
>> [ a loop occures here ]
>>
>
> But but but but, that get_user_pages(.write=1, .force=0) should result
> in a COW break, getting our own writable page.
>
> What is this e500 thing smoking that this doesn't work?

On the e500, a page can be made read-only for the kernel (the supervisor, in
PowerPC terminology), and that is what the kernel does. The SW (supervisor
write) bit in the TLB entry must be set to grant the kernel write permission
on a page.

Furthermore, the SW bit is set according to the DIRTY flag of the PTE, and
PTE.DIRTY is set in do_page_fault(). Because futex_lock_pi() has page faults
disabled, PTE.DIRTY can never be set, and so neither can the SW bit; the COW
can never be broken, and an infinite loop follows.

Thanks
Shan Hai

^ permalink raw reply	[flat|nested] 138+ messages in thread

* Re: [PATCH 0/1] Fixup write permission of TLB on powerpc e500 core
@ 2011-07-15  8:38     ` MailingLists
  0 siblings, 0 replies; 138+ messages in thread
From: MailingLists @ 2011-07-15  8:38 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: tony.luck, linux-kernel, cmetcalf, dhowells, paulus, tglx,
	walken, linuxppc-dev, akpm

On 07/15/2011 04:20 PM, Peter Zijlstra wrote:
> On Fri, 2011-07-15 at 16:07 +0800, Shan Hai wrote:
>> The following test case could reveal a bug in the futex_lock_pi()
>>
>> BUG: On FUTEX_LOCK_PI, there is a infinite loop in the futex_lock_pi()
>>          on Powerpc e500 core.
>> Cause: The linux kernel on the e500 core has no write permission on
>>          the COW page, refer the head comment of the following test code.
>>
>> ftrace on test case:
>> [000]   353.990181: futex_lock_pi_atomic<-futex_lock_pi
>> [000]   353.990185: cmpxchg_futex_value_locked<-futex_lock_pi_atomic
>> [snip]
>> [000]   353.990191: do_page_fault<-handle_page_fault
>> [000]   353.990192: bad_page_fault<-handle_page_fault
>> [000]   353.990193: search_exception_tables<-bad_page_fault
>> [snip]
>> [000]   353.990199: get_user_pages<-fault_in_user_writeable
>> [snip]
>> [000]   353.990208: mark_page_accessed<-follow_page
>> [000]   353.990222: futex_lock_pi_atomic<-futex_lock_pi
>> [snip]
>> [000]   353.990230: cmpxchg_futex_value_locked<-futex_lock_pi_atomic
>> [ a loop occures here ]
>>
>
> But but but but, that get_user_pages(.write=1, .force=0) should result
> in a COW break, getting our own writable page.
>
> What is this e500 thing smoking that this doesn't work?

On the e500, a page can be made read-only for the kernel (the supervisor, in
PowerPC terminology), and that is what the kernel does. The SW (supervisor
write) bit in the TLB entry must be set to grant the kernel write permission
on a page.

Furthermore, the SW bit is set according to the DIRTY flag of the PTE, and
PTE.DIRTY is set in do_page_fault(). Because futex_lock_pi() has page faults
disabled, PTE.DIRTY can never be set, and so neither can the SW bit; the COW
can never be broken, and an infinite loop follows.

Thanks
Shan Hai

^ permalink raw reply	[flat|nested] 138+ messages in thread

* Re: [PATCH 0/1] Fixup write permission of TLB on powerpc e500 core
  2011-07-15  8:38     ` MailingLists
@ 2011-07-15  8:44       ` Peter Zijlstra
  -1 siblings, 0 replies; 138+ messages in thread
From: Peter Zijlstra @ 2011-07-15  8:44 UTC (permalink / raw)
  To: MailingLists
  Cc: benh, paulus, tglx, walken, dhowells, cmetcalf, tony.luck, akpm,
	linuxppc-dev, linux-kernel

On Fri, 2011-07-15 at 16:38 +0800, MailingLists wrote:
> On 07/15/2011 04:20 PM, Peter Zijlstra wrote:
> > On Fri, 2011-07-15 at 16:07 +0800, Shan Hai wrote:
> >> The following test case could reveal a bug in the futex_lock_pi()
> >>
> >> BUG: On FUTEX_LOCK_PI, there is a infinite loop in the futex_lock_pi()
> >>          on Powerpc e500 core.
> >> Cause: The linux kernel on the e500 core has no write permission on
> >>          the COW page, refer the head comment of the following test code.
> >>
> >> ftrace on test case:
> >> [000]   353.990181: futex_lock_pi_atomic<-futex_lock_pi
> >> [000]   353.990185: cmpxchg_futex_value_locked<-futex_lock_pi_atomic
> >> [snip]
> >> [000]   353.990191: do_page_fault<-handle_page_fault
> >> [000]   353.990192: bad_page_fault<-handle_page_fault
> >> [000]   353.990193: search_exception_tables<-bad_page_fault
> >> [snip]
> >> [000]   353.990199: get_user_pages<-fault_in_user_writeable
> >> [snip]
> >> [000]   353.990208: mark_page_accessed<-follow_page
> >> [000]   353.990222: futex_lock_pi_atomic<-futex_lock_pi
> >> [snip]
> >> [000]   353.990230: cmpxchg_futex_value_locked<-futex_lock_pi_atomic
> >> [ a loop occures here ]
> >>
> >
> > But but but but, that get_user_pages(.write=1, .force=0) should result
> > in a COW break, getting our own writable page.
> >
> > What is this e500 thing smoking that this doesn't work?
> 
> A page could be set to read only by the kernel (supervisor in the powerpc
> literature) on the e500, and that's what the kernel do. Set SW(supervisor
> write) bit in the TLB entry to grant write permission to the kernel on a
> page.
> 
> And further the SW bit is set according to the DIRTY flag of the PTE,
> PTE.DIRTY is set in the do_page_fault(), the futex_lock_pi() disabled
> page fault, the PTE.DIRTY never can be set, so do the SW bit, unbreakable
> COW occurred, infinite loop followed.

I'm fairly sure fault_in_user_writeable() has PF enabled as it takes
mmap_sem, and pagefault_disable() is akin to preempt_disable() on mainline.

Also get_user_pages() fully expects to be able to schedule, and in fact
can call the full pf handler path all by its lonesome self.

^ permalink raw reply	[flat|nested] 138+ messages in thread

* Re: [PATCH 0/1] Fixup write permission of TLB on powerpc e500 core
@ 2011-07-15  8:44       ` Peter Zijlstra
  0 siblings, 0 replies; 138+ messages in thread
From: Peter Zijlstra @ 2011-07-15  8:44 UTC (permalink / raw)
  To: MailingLists
  Cc: tony.luck, linux-kernel, cmetcalf, dhowells, paulus, tglx,
	walken, linuxppc-dev, akpm

On Fri, 2011-07-15 at 16:38 +0800, MailingLists wrote:
> On 07/15/2011 04:20 PM, Peter Zijlstra wrote:
> > On Fri, 2011-07-15 at 16:07 +0800, Shan Hai wrote:
> >> The following test case could reveal a bug in the futex_lock_pi()
> >>
> >> BUG: On FUTEX_LOCK_PI, there is a infinite loop in the futex_lock_pi()
> >>          on Powerpc e500 core.
> >> Cause: The linux kernel on the e500 core has no write permission on
> >>          the COW page, refer the head comment of the following test code.
> >>
> >> ftrace on test case:
> >> [000]   353.990181: futex_lock_pi_atomic<-futex_lock_pi
> >> [000]   353.990185: cmpxchg_futex_value_locked<-futex_lock_pi_atomic
> >> [snip]
> >> [000]   353.990191: do_page_fault<-handle_page_fault
> >> [000]   353.990192: bad_page_fault<-handle_page_fault
> >> [000]   353.990193: search_exception_tables<-bad_page_fault
> >> [snip]
> >> [000]   353.990199: get_user_pages<-fault_in_user_writeable
> >> [snip]
> >> [000]   353.990208: mark_page_accessed<-follow_page
> >> [000]   353.990222: futex_lock_pi_atomic<-futex_lock_pi
> >> [snip]
> >> [000]   353.990230: cmpxchg_futex_value_locked<-futex_lock_pi_atomic
> >> [ a loop occures here ]
> >>
> >
> > But but but but, that get_user_pages(.write=1, .force=0) should result
> > in a COW break, getting our own writable page.
> >
> > What is this e500 thing smoking that this doesn't work?
> 
> A page could be set to read only by the kernel (supervisor in the powerpc
> literature) on the e500, and that's what the kernel do. Set SW(supervisor
> write) bit in the TLB entry to grant write permission to the kernel on a
> page.
> 
> And further the SW bit is set according to the DIRTY flag of the PTE,
> PTE.DIRTY is set in the do_page_fault(), the futex_lock_pi() disabled
> page fault, the PTE.DIRTY never can be set, so do the SW bit, unbreakable
> COW occurred, infinite loop followed.

I'm fairly sure fault_in_user_writeable() has PF enabled as it takes
mmap_sem, and pagefault_disable() is akin to preempt_disable() on mainline.

Also get_user_pages() fully expects to be able to schedule, and in fact
can call the full pf handler path all by its lonesome self.

^ permalink raw reply	[flat|nested] 138+ messages in thread

* Re: [PATCH 0/1] Fixup write permission of TLB on powerpc e500 core
  2011-07-15  8:20   ` Peter Zijlstra
@ 2011-07-15  9:05     ` Benjamin Herrenschmidt
  -1 siblings, 0 replies; 138+ messages in thread
From: Benjamin Herrenschmidt @ 2011-07-15  9:05 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Shan Hai, paulus, tglx, walken, dhowells, cmetcalf, tony.luck,
	akpm, linuxppc-dev, linux-kernel

On Fri, 2011-07-15 at 10:20 +0200, Peter Zijlstra wrote:
> But but but but, that get_user_pages(.write=1, .force=0) should result
> in a COW break, getting our own writable page.
> 
> What is this e500 thing smoking that this doesn't work? 

Right. That should have triggered the cow & flushed the TLB entry.... 

Cheers,
Ben.



^ permalink raw reply	[flat|nested] 138+ messages in thread

* Re: [PATCH 0/1] Fixup write permission of TLB on powerpc e500 core
@ 2011-07-15  9:05     ` Benjamin Herrenschmidt
  0 siblings, 0 replies; 138+ messages in thread
From: Benjamin Herrenschmidt @ 2011-07-15  9:05 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: tony.luck, Shan Hai, linux-kernel, cmetcalf, dhowells, paulus,
	tglx, walken, linuxppc-dev, akpm

On Fri, 2011-07-15 at 10:20 +0200, Peter Zijlstra wrote:
> But but but but, that get_user_pages(.write=1, .force=0) should result
> in a COW break, getting our own writable page.
> 
> What is this e500 thing smoking that this doesn't work? 

Right. That should have triggered the cow & flushed the TLB entry.... 

Cheers,
Ben.

^ permalink raw reply	[flat|nested] 138+ messages in thread

* Re: [PATCH 0/1] Fixup write permission of TLB on powerpc e500 core
  2011-07-15  8:38     ` MailingLists
@ 2011-07-15  9:07       ` Benjamin Herrenschmidt
  -1 siblings, 0 replies; 138+ messages in thread
From: Benjamin Herrenschmidt @ 2011-07-15  9:07 UTC (permalink / raw)
  To: MailingLists
  Cc: Peter Zijlstra, paulus, tglx, walken, dhowells, cmetcalf,
	tony.luck, akpm, linuxppc-dev, linux-kernel

On Fri, 2011-07-15 at 16:38 +0800, MailingLists wrote:
> A page could be set to read only by the kernel (supervisor in the
> powerpc
> literature) on the e500, and that's what the kernel do. Set
> SW(supervisor
> write) bit in the TLB entry to grant write permission to the kernel on
> a
> page.
> 
> And further the SW bit is set according to the DIRTY flag of the PTE,
> PTE.DIRTY is set in the do_page_fault(), the futex_lock_pi() disabled
> page fault, the PTE.DIRTY never can be set, so do the SW bit,
> unbreakable
> COW occurred, infinite loop followed. 

That would be it ... the SW dirty and young tracking relies on faults to
fixup things in handle_pte_fault(). If the "disable page fault" thingy
happens before we get there, then we have a pretty nasty bug. Note that
this will hit more than just e500 (and in fact any architecture that
relies on SW to do dirty and young tracking).

Cheers,
Ben.


^ permalink raw reply	[flat|nested] 138+ messages in thread

* Re: [PATCH 0/1] Fixup write permission of TLB on powerpc e500 core
@ 2011-07-15  9:07       ` Benjamin Herrenschmidt
  0 siblings, 0 replies; 138+ messages in thread
From: Benjamin Herrenschmidt @ 2011-07-15  9:07 UTC (permalink / raw)
  To: MailingLists
  Cc: tony.luck, Peter Zijlstra, linux-kernel, cmetcalf, dhowells,
	paulus, tglx, walken, linuxppc-dev, akpm

On Fri, 2011-07-15 at 16:38 +0800, MailingLists wrote:
> A page could be set to read only by the kernel (supervisor in the
> powerpc
> literature) on the e500, and that's what the kernel do. Set
> SW(supervisor
> write) bit in the TLB entry to grant write permission to the kernel on
> a
> page.
> 
> And further the SW bit is set according to the DIRTY flag of the PTE,
> PTE.DIRTY is set in the do_page_fault(), the futex_lock_pi() disabled
> page fault, the PTE.DIRTY never can be set, so do the SW bit,
> unbreakable
> COW occurred, infinite loop followed. 

That would be it ... the SW dirty and young tracking relies on faults to
fixup things in handle_pte_fault(). If the "disable page fault" thingy
happens before we get there, then we have a pretty nasty bug. Note that
this will hit more than just e500 (and in fact any architecture that
relies on SW to do dirty and young tracking).

Cheers,
Ben.

^ permalink raw reply	[flat|nested] 138+ messages in thread

* Re: [PATCH 0/1] Fixup write permission of TLB on powerpc e500 core
  2011-07-15  8:44       ` Peter Zijlstra
@ 2011-07-15  9:08         ` Shan Hai
  -1 siblings, 0 replies; 138+ messages in thread
From: Shan Hai @ 2011-07-15  9:08 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: benh, paulus, tglx, walken, dhowells, cmetcalf, tony.luck, akpm,
	linuxppc-dev, linux-kernel

On 07/15/2011 04:44 PM, Peter Zijlstra wrote:
> On Fri, 2011-07-15 at 16:38 +0800, MailingLists wrote:
>> On 07/15/2011 04:20 PM, Peter Zijlstra wrote:
>>> On Fri, 2011-07-15 at 16:07 +0800, Shan Hai wrote:
>>>> The following test case could reveal a bug in the futex_lock_pi()
>>>>
>>>> BUG: On FUTEX_LOCK_PI, there is a infinite loop in the futex_lock_pi()
>>>>           on Powerpc e500 core.
>>>> Cause: The linux kernel on the e500 core has no write permission on
>>>>           the COW page, refer the head comment of the following test code.
>>>>
>>>> ftrace on test case:
>>>> [000]   353.990181: futex_lock_pi_atomic<-futex_lock_pi
>>>> [000]   353.990185: cmpxchg_futex_value_locked<-futex_lock_pi_atomic
>>>> [snip]
>>>> [000]   353.990191: do_page_fault<-handle_page_fault
>>>> [000]   353.990192: bad_page_fault<-handle_page_fault
>>>> [000]   353.990193: search_exception_tables<-bad_page_fault
>>>> [snip]
>>>> [000]   353.990199: get_user_pages<-fault_in_user_writeable
>>>> [snip]
>>>> [000]   353.990208: mark_page_accessed<-follow_page
>>>> [000]   353.990222: futex_lock_pi_atomic<-futex_lock_pi
>>>> [snip]
>>>> [000]   353.990230: cmpxchg_futex_value_locked<-futex_lock_pi_atomic
>>>> [ a loop occures here ]
>>>>
>>> But but but but, that get_user_pages(.write=1, .force=0) should result
>>> in a COW break, getting our own writable page.
>>>
>>> What is this e500 thing smoking that this doesn't work?
>> A page could be set to read only by the kernel (supervisor in the powerpc
>> literature) on the e500, and that's what the kernel do. Set SW(supervisor
>> write) bit in the TLB entry to grant write permission to the kernel on a
>> page.
>>
>> And further the SW bit is set according to the DIRTY flag of the PTE,
>> PTE.DIRTY is set in the do_page_fault(), the futex_lock_pi() disabled
>> page fault, the PTE.DIRTY never can be set, so do the SW bit, unbreakable
>> COW occurred, infinite loop followed.
> I'm fairly sure fault_in_user_writeable() has PF enabled as it takes
> mmap_sem, an pagefaul_disable() is akin to preemp_disable() on mainline.
>
> Also get_user_pages() fully expects to be able to schedule, and in fact
> can call the full pf handler path all by its lonesome self.

The whole scenario should be,
- the child process triggers a page fault at the first time access to
     the lock, and it got its own writable page, but its *clean* for
     the reason just for checking the status of the lock.
     I am sorry for above "unbreakable COW".
- the futex_lock_pi() is invoked because of the lock contention,
     and the futex_atomic_cmpxchg_inatomic() tries to get the lock,
     it found out the lock is free so tries to write to the lock for
     reservation, a page fault occurs, because the page is read only
     for kernel(e500 specific), and returns -EFAULT to the caller
- the fault_in_user_writeable() tries to fix the fault,
     but from the get_user_pages() view everything is ok, because
     the COW was already broken, retry futex_lock_pi_atomic()
- futex_lock_pi_atomic() --> futex_atomic_cmpxchg_inatomic(),
     another write protection page fault
- infinite loop

Thanks
Shan Hai



^ permalink raw reply	[flat|nested] 138+ messages in thread

* Re: [PATCH 0/1] Fixup write permission of TLB on powerpc e500 core
@ 2011-07-15  9:08         ` Shan Hai
  0 siblings, 0 replies; 138+ messages in thread
From: Shan Hai @ 2011-07-15  9:08 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: tony.luck, linux-kernel, cmetcalf, dhowells, paulus, tglx,
	walken, linuxppc-dev, akpm

On 07/15/2011 04:44 PM, Peter Zijlstra wrote:
> On Fri, 2011-07-15 at 16:38 +0800, MailingLists wrote:
>> On 07/15/2011 04:20 PM, Peter Zijlstra wrote:
>>> On Fri, 2011-07-15 at 16:07 +0800, Shan Hai wrote:
>>>> The following test case could reveal a bug in the futex_lock_pi()
>>>>
>>>> BUG: On FUTEX_LOCK_PI, there is a infinite loop in the futex_lock_pi()
>>>>           on Powerpc e500 core.
>>>> Cause: The linux kernel on the e500 core has no write permission on
>>>>           the COW page, refer the head comment of the following test code.
>>>>
>>>> ftrace on test case:
>>>> [000]   353.990181: futex_lock_pi_atomic<-futex_lock_pi
>>>> [000]   353.990185: cmpxchg_futex_value_locked<-futex_lock_pi_atomic
>>>> [snip]
>>>> [000]   353.990191: do_page_fault<-handle_page_fault
>>>> [000]   353.990192: bad_page_fault<-handle_page_fault
>>>> [000]   353.990193: search_exception_tables<-bad_page_fault
>>>> [snip]
>>>> [000]   353.990199: get_user_pages<-fault_in_user_writeable
>>>> [snip]
>>>> [000]   353.990208: mark_page_accessed<-follow_page
>>>> [000]   353.990222: futex_lock_pi_atomic<-futex_lock_pi
>>>> [snip]
>>>> [000]   353.990230: cmpxchg_futex_value_locked<-futex_lock_pi_atomic
>>>> [ a loop occures here ]
>>>>
>>> But but but but, that get_user_pages(.write=1, .force=0) should result
>>> in a COW break, getting our own writable page.
>>>
>>> What is this e500 thing smoking that this doesn't work?
>> A page could be set to read only by the kernel (supervisor in the powerpc
>> literature) on the e500, and that's what the kernel do. Set SW(supervisor
>> write) bit in the TLB entry to grant write permission to the kernel on a
>> page.
>>
>> And further the SW bit is set according to the DIRTY flag of the PTE,
>> PTE.DIRTY is set in the do_page_fault(), the futex_lock_pi() disabled
>> page fault, the PTE.DIRTY never can be set, so do the SW bit, unbreakable
>> COW occurred, infinite loop followed.
> I'm fairly sure fault_in_user_writeable() has PF enabled as it takes
> mmap_sem, an pagefaul_disable() is akin to preemp_disable() on mainline.
>
> Also get_user_pages() fully expects to be able to schedule, and in fact
> can call the full pf handler path all by its lonesome self.

The whole scenario should be,
- the child process triggers a page fault at the first time access to
     the lock, and it got its own writable page, but its *clean* for
     the reason just for checking the status of the lock.
     I am sorry for above "unbreakable COW".
- the futex_lock_pi() is invoked because of the lock contention,
     and the futex_atomic_cmpxchg_inatomic() tries to get the lock,
     it found out the lock is free so tries to write to the lock for
     reservation, a page fault occurs, because the page is read only
     for kernel(e500 specific), and returns -EFAULT to the caller
- the fault_in_user_writeable() tries to fix the fault,
     but from the get_user_pages() view everything is ok, because
     the COW was already broken, retry futex_lock_pi_atomic()
- futex_lock_pi_atomic() --> futex_atomic_cmpxchg_inatomic(),
     another write protection page fault
- infinite loop

Thanks
Shan Hai

^ permalink raw reply	[flat|nested] 138+ messages in thread

* Re: [PATCH 0/1] Fixup write permission of TLB on powerpc e500 core
  2011-07-15  9:08         ` Shan Hai
@ 2011-07-15  9:12           ` Benjamin Herrenschmidt
  -1 siblings, 0 replies; 138+ messages in thread
From: Benjamin Herrenschmidt @ 2011-07-15  9:12 UTC (permalink / raw)
  To: Shan Hai
  Cc: Peter Zijlstra, paulus, tglx, walken, dhowells, cmetcalf,
	tony.luck, akpm, linuxppc-dev, linux-kernel

On Fri, 2011-07-15 at 17:08 +0800, Shan Hai wrote:
> The whole scenario should be,
> - the child process triggers a page fault at the first time access to
>      the lock, and it got its own writable page, but its *clean* for
>      the reason just for checking the status of the lock.
>      I am sorry for above "unbreakable COW".
> - the futex_lock_pi() is invoked because of the lock contention,
>      and the futex_atomic_cmpxchg_inatomic() tries to get the lock,
>      it found out the lock is free so tries to write to the lock for
>      reservation, a page fault occurs, because the page is read only
>      for kernel(e500 specific), and returns -EFAULT to the caller

There is nothing e500 specific about user read only pages being read
only for kernel. All architectures behave the same way here afaik.

_However_ there is something not totally x86-like in the fact that we
require handle_mm_fault() to deal with dirty and young tracking, which
means that we -will- fault for a non-dirty writeable page or for any
non-young page. It's quite possible that the page fault disabling occurs
before that and thus breaks those architectures (it's not only e500 and
afaik not only powerpc) while x86 works fine due to HW update of dirty
and young.

It might be something to look into.

Cheers,
Ben.

> - the fault_in_user_writeable() tries to fix the fault,
>      but from the get_user_pages() view everything is ok, because
>      the COW was already broken, retry futex_lock_pi_atomic()
> - futex_lock_pi_atomic() --> futex_atomic_cmpxchg_inatomic(),
>      another write protection page fault
> - infinite loop
> 
> 


^ permalink raw reply	[flat|nested] 138+ messages in thread

* Re: [PATCH 0/1] Fixup write permission of TLB on powerpc e500 core
@ 2011-07-15  9:12           ` Benjamin Herrenschmidt
  0 siblings, 0 replies; 138+ messages in thread
From: Benjamin Herrenschmidt @ 2011-07-15  9:12 UTC (permalink / raw)
  To: Shan Hai
  Cc: tony.luck, Peter Zijlstra, linux-kernel, cmetcalf, dhowells,
	paulus, tglx, walken, linuxppc-dev, akpm

On Fri, 2011-07-15 at 17:08 +0800, Shan Hai wrote:
> The whole scenario should be,
> - the child process triggers a page fault at the first time access to
>      the lock, and it got its own writable page, but its *clean* for
>      the reason just for checking the status of the lock.
>      I am sorry for above "unbreakable COW".
> - the futex_lock_pi() is invoked because of the lock contention,
>      and the futex_atomic_cmpxchg_inatomic() tries to get the lock,
>      it found out the lock is free so tries to write to the lock for
>      reservation, a page fault occurs, because the page is read only
>      for kernel(e500 specific), and returns -EFAULT to the caller

There is nothing e500 specific about user read only pages being read
only for kernel. All architectures behave the same way here afaik.

_However_ there is something not totally x86-like in the fact that we
require handle_mm_fault() to deal with dirty and young tracking, which
means that we -will- fault for a non-dirty writeable page or for any
non-young page. It's quite possible that the page fault disabling occurs
before that and thus breaks those architectures (it's not only e500 and
afaik not only powerpc) while x86 works fine due to HW update of dirty
and young.

It might be something to look into.

Cheers,
Ben.

> - the fault_in_user_writeable() tries to fix the fault,
>      but from the get_user_pages() view everything is ok, because
>      the COW was already broken, retry futex_lock_pi_atomic()
> - futex_lock_pi_atomic() --> futex_atomic_cmpxchg_inatomic(),
>      another write protection page fault
> - infinite loop
> 
> 

^ permalink raw reply	[flat|nested] 138+ messages in thread

* Re: [PATCH 0/1] Fixup write permission of TLB on powerpc e500 core
  2011-07-15  9:08         ` Shan Hai
@ 2011-07-15  9:50           ` Peter Zijlstra
  -1 siblings, 0 replies; 138+ messages in thread
From: Peter Zijlstra @ 2011-07-15  9:50 UTC (permalink / raw)
  To: Shan Hai
  Cc: benh, paulus, tglx, walken, dhowells, cmetcalf, tony.luck, akpm,
	linuxppc-dev, linux-kernel

On Fri, 2011-07-15 at 17:08 +0800, Shan Hai wrote:
> The whole scenario should be,
> - the child process triggers a page fault at the first time access to
>      the lock, and it got its own writable page, but its *clean* for
>      the reason just for checking the status of the lock.
>      I am sorry for above "unbreakable COW".
> - the futex_lock_pi() is invoked because of the lock contention,
>      and the futex_atomic_cmpxchg_inatomic() tries to get the lock,
>      it found out the lock is free so tries to write to the lock for
>      reservation, a page fault occurs, because the page is read only
>      for kernel(e500 specific), and returns -EFAULT to the caller
> - the fault_in_user_writeable() tries to fix the fault,
>      but from the get_user_pages() view everything is ok, because
>      the COW was already broken, retry futex_lock_pi_atomic()

but that's a bug right there, gup(.write=1) _should_ be a complete write
fault, and as such toggle your sw dirty/young tracking.

> - futex_lock_pi_atomic() --> futex_atomic_cmpxchg_inatomic(),
>      another write protection page fault
> - infinite loop


^ permalink raw reply	[flat|nested] 138+ messages in thread

* Re: [PATCH 0/1] Fixup write permission of TLB on powerpc e500 core
@ 2011-07-15  9:50           ` Peter Zijlstra
  0 siblings, 0 replies; 138+ messages in thread
From: Peter Zijlstra @ 2011-07-15  9:50 UTC (permalink / raw)
  To: Shan Hai
  Cc: tony.luck, linux-kernel, cmetcalf, dhowells, paulus, tglx,
	walken, linuxppc-dev, akpm

On Fri, 2011-07-15 at 17:08 +0800, Shan Hai wrote:
> The whole scenario should be,
> - the child process triggers a page fault at the first time access to
>      the lock, and it got its own writable page, but its *clean* for
>      the reason just for checking the status of the lock.
>      I am sorry for above "unbreakable COW".
> - the futex_lock_pi() is invoked because of the lock contention,
>      and the futex_atomic_cmpxchg_inatomic() tries to get the lock,
>      it found out the lock is free so tries to write to the lock for
>      reservation, a page fault occurs, because the page is read only
>      for kernel(e500 specific), and returns -EFAULT to the caller
> - the fault_in_user_writeable() tries to fix the fault,
>      but from the get_user_pages() view everything is ok, because
>      the COW was already broken, retry futex_lock_pi_atomic()

but that's a bug right there, gup(.write=1) _should_ be a complete write
fault, and as such toggle your sw dirty/young tracking.

> - futex_lock_pi_atomic() --> futex_atomic_cmpxchg_inatomic(),
>      another write protection page fault
> - infinite loop

^ permalink raw reply	[flat|nested] 138+ messages in thread

* Re: [PATCH 0/1] Fixup write permission of TLB on powerpc e500 core
  2011-07-15  9:50           ` Peter Zijlstra
@ 2011-07-15 10:06             ` Shan Hai
  -1 siblings, 0 replies; 138+ messages in thread
From: Shan Hai @ 2011-07-15 10:06 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: benh, paulus, tglx, walken, dhowells, cmetcalf, tony.luck, akpm,
	linuxppc-dev, linux-kernel

On 07/15/2011 05:50 PM, Peter Zijlstra wrote:
> On Fri, 2011-07-15 at 17:08 +0800, Shan Hai wrote:
>> The whole scenario should be,
>> - the child process triggers a page fault at the first time access to
>>       the lock, and it got its own writable page, but its *clean* for
>>       the reason just for checking the status of the lock.
>>       I am sorry for above "unbreakable COW".
>> - the futex_lock_pi() is invoked because of the lock contention,
>>       and the futex_atomic_cmpxchg_inatomic() tries to get the lock,
>>       it found out the lock is free so tries to write to the lock for
>>       reservation, a page fault occurs, because the page is read only
>>       for kernel(e500 specific), and returns -EFAULT to the caller
>> - the fault_in_user_writeable() tries to fix the fault,
>>       but from the get_user_pages() view everything is ok, because
>>       the COW was already broken, retry futex_lock_pi_atomic()
> but that's a bug right there, gup(.write=1) _should_ be a complete write
> fault, and as such toggle your sw dirty/young tracking.
>

The fault causing futex_atomic_cmpxchg_inatomic() is
protected by pagefault_disable(), so the page fault handler has
no chance to toggle the SW dirty/young tracking.

Thanks
Shan Hai

>> - futex_lock_pi_atomic() -->  futex_atomic_cmpxchg_inatomic(),
>>       another write protection page fault
>> - infinite loop


^ permalink raw reply	[flat|nested] 138+ messages in thread

* Re: [PATCH 0/1] Fixup write permission of TLB on powerpc e500 core
@ 2011-07-15 10:06             ` Shan Hai
  0 siblings, 0 replies; 138+ messages in thread
From: Shan Hai @ 2011-07-15 10:06 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: tony.luck, linux-kernel, cmetcalf, dhowells, paulus, tglx,
	walken, linuxppc-dev, akpm

On 07/15/2011 05:50 PM, Peter Zijlstra wrote:
> On Fri, 2011-07-15 at 17:08 +0800, Shan Hai wrote:
>> The whole scenario should be,
>> - the child process triggers a page fault at the first time access to
>>       the lock, and it got its own writable page, but its *clean* for
>>       the reason just for checking the status of the lock.
>>       I am sorry for above "unbreakable COW".
>> - the futex_lock_pi() is invoked because of the lock contention,
>>       and the futex_atomic_cmpxchg_inatomic() tries to get the lock,
>>       it found out the lock is free so tries to write to the lock for
>>       reservation, a page fault occurs, because the page is read only
>>       for kernel(e500 specific), and returns -EFAULT to the caller
>> - the fault_in_user_writeable() tries to fix the fault,
>>       but from the get_user_pages() view everything is ok, because
>>       the COW was already broken, retry futex_lock_pi_atomic()
> but that's a bug right there, gup(.write=1) _should_ be a complete write
> fault, and as such toggle your sw dirty/young tracking.
>

The fault causing futex_atomic_cmpxchg_inatomic() is
protected by pagefault_disable(), so the page fault handler has
no chance to toggle the SW dirty/young tracking.

Thanks
Shan Hai

>> - futex_lock_pi_atomic() -->  futex_atomic_cmpxchg_inatomic(),
>>       another write protection page fault
>> - infinite loop

^ permalink raw reply	[flat|nested] 138+ messages in thread

* Re: [PATCH 1/1] Fixup write permission of TLB on powerpc e500 core
  2011-07-15  8:07   ` Shan Hai
@ 2011-07-15 10:23     ` Peter Zijlstra
  -1 siblings, 0 replies; 138+ messages in thread
From: Peter Zijlstra @ 2011-07-15 10:23 UTC (permalink / raw)
  To: Shan Hai
  Cc: benh, paulus, tglx, walken, dhowells, cmetcalf, tony.luck, akpm,
	linuxppc-dev, linux-kernel

On Fri, 2011-07-15 at 16:07 +0800, Shan Hai wrote:
> The kernel has no write permission on COW pages by default on e500 core, this
> will cause endless loop in futex_lock_pi, because futex code assumes the kernel
> has write permission on COW pages. Grant write permission to the kernel on COW
> pages when access violation page fault occurs.
> 
> Signed-off-by: Shan Hai <haishan.bai@gmail.com>
> ---
>  arch/powerpc/include/asm/futex.h |   11 ++++++++++-
>  arch/powerpc/include/asm/tlb.h   |   25 +++++++++++++++++++++++++
>  2 files changed, 35 insertions(+), 1 deletions(-)
> 
> diff --git a/arch/powerpc/include/asm/futex.h b/arch/powerpc/include/asm/futex.h
> index c94e4a3..54c3e74 100644
> --- a/arch/powerpc/include/asm/futex.h
> +++ b/arch/powerpc/include/asm/futex.h
> @@ -8,6 +8,7 @@
>  #include <asm/errno.h>
>  #include <asm/synch.h>
>  #include <asm/asm-compat.h>
> +#include <asm/tlb.h>
>  
>  #define __futex_atomic_op(insn, ret, oldval, uaddr, oparg) \
>    __asm__ __volatile ( \
> @@ -113,7 +114,15 @@ futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr,
>          : "cc", "memory");
>  
>  	*uval = prev;
> -        return ret;
> +
> +	/* Futex assumes the kernel has permission to write to
> +	 * COW pages, grant the kernel write permission on COW
> +	 * pages because it has none by default.
> +	 */
> +	if (ret == -EFAULT)
> +		__tlb_fixup_write_permission(current->mm, (unsigned long)uaddr);
> +
> +	return ret;
>  }
>  
>  #endif /* __KERNEL__ */
> diff --git a/arch/powerpc/include/asm/tlb.h b/arch/powerpc/include/asm/tlb.h
> index e2b428b..3863c6a 100644
> --- a/arch/powerpc/include/asm/tlb.h
> +++ b/arch/powerpc/include/asm/tlb.h
> @@ -45,5 +45,30 @@ static inline void __tlb_remove_tlb_entry(struct mmu_gather *tlb, pte_t *ptep,
>  #endif
>  }
>  
> +/* Grant write permission to the kernel on a page. */
> +static inline void __tlb_fixup_write_permission(struct mm_struct *mm,
> +						unsigned long address)
> +{
> +#if defined(CONFIG_FSL_BOOKE)
> +	/* Grant write permission to the kernel on a page by setting TLB.SW
> +	 * bit, the bit setting operation is tricky here, calling
> +	 * handle_mm_fault with FAULT_FLAG_WRITE causes _PAGE_DIRTY bit of
> +	 * the pte to be set, the _PAGE_DIRTY of the pte is translated into
> +	 * TLB.SW on Powerpc e500 core.
> +	 */
> +
> +	struct vm_area_struct *vma;
> +
> +	vma = find_vma(mm, address);

Uhm, find_vma() needs mmap_sem, and futex_atomic_cmpxchg_inatomic() is
most certainly not called with that lock held.

> +	if (likely(vma)) {
> +		/* only fixup present page */
> +		if (follow_page(vma, address, FOLL_WRITE)) {
> +			handle_mm_fault(mm, vma, address, FAULT_FLAG_WRITE);

So how can this toggle your sw dirty/young tracking, that's pretty much
what gup(.write=1) does too!

> +			flush_tlb_page(vma, address);
> +		}
> +	}
> +#endif
> +}
> +
>  #endif /* __KERNEL__ */
>  #endif /* __ASM_POWERPC_TLB_H */


^ permalink raw reply	[flat|nested] 138+ messages in thread

* Re: [PATCH 1/1] Fixup write permission of TLB on powerpc e500 core
@ 2011-07-15 10:23     ` Peter Zijlstra
  0 siblings, 0 replies; 138+ messages in thread
From: Peter Zijlstra @ 2011-07-15 10:23 UTC (permalink / raw)
  To: Shan Hai
  Cc: tony.luck, linux-kernel, cmetcalf, dhowells, paulus, tglx,
	walken, linuxppc-dev, akpm

On Fri, 2011-07-15 at 16:07 +0800, Shan Hai wrote:
> The kernel has no write permission on COW pages by default on e500 core, this
> will cause endless loop in futex_lock_pi, because futex code assumes the kernel
> has write permission on COW pages. Grant write permission to the kernel on COW
> pages when access violation page fault occurs.
> 
> Signed-off-by: Shan Hai <haishan.bai@gmail.com>
> ---
>  arch/powerpc/include/asm/futex.h |   11 ++++++++++-
>  arch/powerpc/include/asm/tlb.h   |   25 +++++++++++++++++++++++++
>  2 files changed, 35 insertions(+), 1 deletions(-)
> 
> diff --git a/arch/powerpc/include/asm/futex.h b/arch/powerpc/include/asm/futex.h
> index c94e4a3..54c3e74 100644
> --- a/arch/powerpc/include/asm/futex.h
> +++ b/arch/powerpc/include/asm/futex.h
> @@ -8,6 +8,7 @@
>  #include <asm/errno.h>
>  #include <asm/synch.h>
>  #include <asm/asm-compat.h>
> +#include <asm/tlb.h>
>  
>  #define __futex_atomic_op(insn, ret, oldval, uaddr, oparg) \
>    __asm__ __volatile ( \
> @@ -113,7 +114,15 @@ futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr,
>          : "cc", "memory");
>  
>  	*uval = prev;
> -        return ret;
> +
> +	/* Futex assumes the kernel has permission to write to
> +	 * COW pages, grant the kernel write permission on COW
> +	 * pages because it has none by default.
> +	 */
> +	if (ret == -EFAULT)
> +		__tlb_fixup_write_permission(current->mm, (unsigned long)uaddr);
> +
> +	return ret;
>  }
>  
>  #endif /* __KERNEL__ */
> diff --git a/arch/powerpc/include/asm/tlb.h b/arch/powerpc/include/asm/tlb.h
> index e2b428b..3863c6a 100644
> --- a/arch/powerpc/include/asm/tlb.h
> +++ b/arch/powerpc/include/asm/tlb.h
> @@ -45,5 +45,30 @@ static inline void __tlb_remove_tlb_entry(struct mmu_gather *tlb, pte_t *ptep,
>  #endif
>  }
>  
> +/* Grant write permission to the kernel on a page. */
> +static inline void __tlb_fixup_write_permission(struct mm_struct *mm,
> +						unsigned long address)
> +{
> +#if defined(CONFIG_FSL_BOOKE)
> +	/* Grant write permission to the kernel on a page by setting TLB.SW
> +	 * bit, the bit setting operation is tricky here, calling
> +	 * handle_mm_fault with FAULT_FLAG_WRITE causes _PAGE_DIRTY bit of
> +	 * the pte to be set, the _PAGE_DIRTY of the pte is translated into
> +	 * TLB.SW on Powerpc e500 core.
> +	 */
> +
> +	struct vm_area_struct *vma;
> +
> +	vma = find_vma(mm, address);

Uhm, find_vma() needs mmap_sem, and futex_atomic_cmpxchg_inatomic() is
most certainly not called with that lock held.

> +	if (likely(vma)) {
> +		/* only fixup present page */
> +		if (follow_page(vma, address, FOLL_WRITE)) {
> +			handle_mm_fault(mm, vma, address, FAULT_FLAG_WRITE);

So how can this toggle your sw dirty/young tracking, that's pretty much
what gup(.write=1) does too!

> +			flush_tlb_page(vma, address);
> +		}
> +	}
> +#endif
> +}
> +
>  #endif /* __KERNEL__ */
>  #endif /* __ASM_POWERPC_TLB_H */

^ permalink raw reply	[flat|nested] 138+ messages in thread

* RE: [PATCH 0/1] Fixup write permission of TLB on powerpc e500 core
  2011-07-15 10:06             ` Shan Hai
@ 2011-07-15 10:32               ` David Laight
  -1 siblings, 0 replies; 138+ messages in thread
From: David Laight @ 2011-07-15 10:32 UTC (permalink / raw)
  To: Shan Hai, Peter Zijlstra
  Cc: tony.luck, linux-kernel, cmetcalf, dhowells, paulus, tglx,
	walken, linuxppc-dev, akpm

 
> The fault causing futex_atomic_cmpxchg_inatomic() is
> protected by pagefault_disable(), so the page fault handler has
> no chance to toggle the SW dirty/young tracking.

Perhaps that is the bug!
Whatever pagefault_disable() does, it shouldn't disable the
SW dirty/young tracking - which should only needs bits moving
in the page table itself (and TLB update??) rather than any
operations on the rest of the data areas.

It looks to me as though this could happen any time a page
is marked inaccessible by the dirty/young tracking.
Not just as a result of COW.

	David



^ permalink raw reply	[flat|nested] 138+ messages in thread

* RE: [PATCH 0/1] Fixup write permission of TLB on powerpc e500 core
@ 2011-07-15 10:32               ` David Laight
  0 siblings, 0 replies; 138+ messages in thread
From: David Laight @ 2011-07-15 10:32 UTC (permalink / raw)
  To: Shan Hai, Peter Zijlstra
  Cc: tony.luck, linux-kernel, cmetcalf, dhowells, paulus, tglx,
	walken, linuxppc-dev, akpm

 
> The fault causing futex_atomic_cmpxchg_inatomic() is
> protected by pagefault_disable(), so the page fault handler has
> no chance to toggle the SW dirty/young tracking.

Perhaps that is the bug!
Whatever pagefault_disable() does, it shouldn't disable the
SW dirty/young tracking - which should only needs bits moving
in the page table itself (and TLB update??) rather than any
operations on the rest of the data areas.

It looks to me as though this could happen any time a page
is marked inaccessible by the dirty/young tracking.
Not just as a result of COW.

	David

^ permalink raw reply	[flat|nested] 138+ messages in thread

* RE: [PATCH 0/1] Fixup write permission of TLB on powerpc e500 core
  2011-07-15 10:32               ` David Laight
@ 2011-07-15 10:39                 ` Peter Zijlstra
  -1 siblings, 0 replies; 138+ messages in thread
From: Peter Zijlstra @ 2011-07-15 10:39 UTC (permalink / raw)
  To: David Laight
  Cc: Shan Hai, tony.luck, linux-kernel, cmetcalf, dhowells, paulus,
	tglx, walken, linuxppc-dev, akpm

On Fri, 2011-07-15 at 11:32 +0100, David Laight wrote:
> > The fault causing futex_atomic_cmpxchg_inatomic() is
> > protected by pagefault_disable(), so the page fault handler has
> > no chance to toggle the SW dirty/young tracking.
> 
> Perhaps that is the bug!
> Whatever pagefault_disable() does, it shouldn't disable the
> SW dirty/young tracking - which should only needs bits moving
> in the page table itself (and TLB update??) rather than any
> operations on the rest of the data areas.
> 
> It looks to me as though this could happen any time a page
> is marked inaccessible by the dirty/young tracking.
> Not just as a result of COW.

I've thought of that as well, but couldn't find the actual code in the
ppc fault bits. It could be it relies on vma information to know what it
should allow due to lack of bits in the pte.

If it requires the vma, it requires mmap_sem and it thus has to avoid
the whole thing when pagefault_disabled().

^ permalink raw reply	[flat|nested] 138+ messages in thread

* RE: [PATCH 0/1] Fixup write permission of TLB on powerpc e500 core
@ 2011-07-15 10:39                 ` Peter Zijlstra
  0 siblings, 0 replies; 138+ messages in thread
From: Peter Zijlstra @ 2011-07-15 10:39 UTC (permalink / raw)
  To: David Laight
  Cc: tony.luck, Shan Hai, linux-kernel, cmetcalf, dhowells, paulus,
	tglx, walken, linuxppc-dev, akpm

On Fri, 2011-07-15 at 11:32 +0100, David Laight wrote:
> > The fault causing futex_atomic_cmpxchg_inatomic() is
> > protected by pagefault_disable(), so the page fault handler has
> > no chance to toggle the SW dirty/young tracking.
> 
> Perhaps that is the bug!
> Whatever pagefault_disable() does, it shouldn't disable the
> SW dirty/young tracking - which should only needs bits moving
> in the page table itself (and TLB update??) rather than any
> operations on the rest of the data areas.
> 
> It looks to me as though this could happen any time a page
> is marked inaccessible by the dirty/young tracking.
> Not just as a result of COW.

I've thought of that as well, but couldn't find the actual code in the
ppc fault bits. It could be it relies on vma information to know what it
should allow due to lack of bits in the pte.

If it requires the vma, it requires mmap_sem and it thus has to avoid
the whole thing when pagefault_disabled().

^ permalink raw reply	[flat|nested] 138+ messages in thread

* Re: [PATCH 1/1] Fixup write permission of TLB on powerpc e500 core
  2011-07-15 10:23     ` Peter Zijlstra
@ 2011-07-15 15:18       ` Shan Hai
  -1 siblings, 0 replies; 138+ messages in thread
From: Shan Hai @ 2011-07-15 15:18 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: benh, paulus, tglx, walken, dhowells, cmetcalf, tony.luck, akpm,
	linuxppc-dev, linux-kernel

On 07/15/2011 06:23 AM, Peter Zijlstra wrote:
> On Fri, 2011-07-15 at 16:07 +0800, Shan Hai wrote:
>> The kernel has no write permission on COW pages by default on e500 core, this
>> will cause endless loop in futex_lock_pi, because futex code assumes the kernel
>> has write permission on COW pages. Grant write permission to the kernel on COW
>> pages when access violation page fault occurs.
>>
>> Signed-off-by: Shan Hai<haishan.bai@gmail.com>
>> ---
>>   arch/powerpc/include/asm/futex.h |   11 ++++++++++-
>>   arch/powerpc/include/asm/tlb.h   |   25 +++++++++++++++++++++++++
>>   2 files changed, 35 insertions(+), 1 deletions(-)
>>
>> diff --git a/arch/powerpc/include/asm/futex.h b/arch/powerpc/include/asm/futex.h
>> index c94e4a3..54c3e74 100644
>> --- a/arch/powerpc/include/asm/futex.h
>> +++ b/arch/powerpc/include/asm/futex.h
>> @@ -8,6 +8,7 @@
>>   #include<asm/errno.h>
>>   #include<asm/synch.h>
>>   #include<asm/asm-compat.h>
>> +#include<asm/tlb.h>
>>
>>   #define __futex_atomic_op(insn, ret, oldval, uaddr, oparg) \
>>     __asm__ __volatile ( \
>> @@ -113,7 +114,15 @@ futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr,
>>           : "cc", "memory");
>>
>>   	*uval = prev;
>> -        return ret;
>> +
>> +	/* Futex assumes the kernel has permission to write to
>> +	 * COW pages, grant the kernel write permission on COW
>> +	 * pages because it has none by default.
>> +	 */
>> +	if (ret == -EFAULT)
>> +		__tlb_fixup_write_permission(current->mm, (unsigned long)uaddr);
>> +
>> +	return ret;
>>   }
>>
>>   #endif /* __KERNEL__ */
>> diff --git a/arch/powerpc/include/asm/tlb.h b/arch/powerpc/include/asm/tlb.h
>> index e2b428b..3863c6a 100644
>> --- a/arch/powerpc/include/asm/tlb.h
>> +++ b/arch/powerpc/include/asm/tlb.h
>> @@ -45,5 +45,30 @@ static inline void __tlb_remove_tlb_entry(struct mmu_gather *tlb, pte_t *ptep,
>>   #endif
>>   }
>>
>> +/* Grant write permission to the kernel on a page. */
>> +static inline void __tlb_fixup_write_permission(struct mm_struct *mm,
>> +						unsigned long address)
>> +{
>> +#if defined(CONFIG_FSL_BOOKE)
>> +	/* Grant write permission to the kernel on a page by setting TLB.SW
>> +	 * bit, the bit setting operation is tricky here, calling
>> +	 * handle_mm_fault with FAULT_FLAG_WRITE causes _PAGE_DIRTY bit of
>> +	 * the pte to be set, the _PAGE_DIRTY of the pte is translated into
>> +	 * TLB.SW on Powerpc e500 core.
>> +	 */
>> +
>> +	struct vm_area_struct *vma;
>> +
>> +	vma = find_vma(mm, address);
> Uhm, find_vma() needs mmap_sem, and futex_atomic_cmpxchg_inatomic() is
> most certainly not called with that lock held.
>

My fault, that will be fixed in the V2 patch.

>> +	if (likely(vma)) {
>> +		/* only fixup present page */
>> +		if (follow_page(vma, address, FOLL_WRITE)) {
>> +			handle_mm_fault(mm, vma, address, FAULT_FLAG_WRITE);
> So how can this toggle your sw dirty/young tracking, that's pretty much
> what gup(.write=1) does too!
>

Because the kernel's read-only permission on the page is invisible
to follow_page(), handle_mm_fault() is never invoked from
__get_user_pages(), so gup(.write=1) cannot fix up the write
permission.


Thanks
Shan Hai

>> +			flush_tlb_page(vma, address);
>> +		}
>> +	}
>> +#endif
>> +}
>> +
>>   #endif /* __KERNEL__ */
>>   #endif /* __ASM_POWERPC_TLB_H */


^ permalink raw reply	[flat|nested] 138+ messages in thread

* Re: [PATCH 1/1] Fixup write permission of TLB on powerpc e500 core
@ 2011-07-15 15:18       ` Shan Hai
  0 siblings, 0 replies; 138+ messages in thread
From: Shan Hai @ 2011-07-15 15:18 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: tony.luck, linux-kernel, cmetcalf, dhowells, paulus, tglx,
	walken, linuxppc-dev, akpm

On 07/15/2011 06:23 AM, Peter Zijlstra wrote:
> On Fri, 2011-07-15 at 16:07 +0800, Shan Hai wrote:
>> The kernel has no write permission on COW pages by default on e500 core, this
>> will cause endless loop in futex_lock_pi, because futex code assumes the kernel
>> has write permission on COW pages. Grant write permission to the kernel on COW
>> pages when access violation page fault occurs.
>>
>> Signed-off-by: Shan Hai<haishan.bai@gmail.com>
>> ---
>>   arch/powerpc/include/asm/futex.h |   11 ++++++++++-
>>   arch/powerpc/include/asm/tlb.h   |   25 +++++++++++++++++++++++++
>>   2 files changed, 35 insertions(+), 1 deletions(-)
>>
>> diff --git a/arch/powerpc/include/asm/futex.h b/arch/powerpc/include/asm/futex.h
>> index c94e4a3..54c3e74 100644
>> --- a/arch/powerpc/include/asm/futex.h
>> +++ b/arch/powerpc/include/asm/futex.h
>> @@ -8,6 +8,7 @@
>>   #include<asm/errno.h>
>>   #include<asm/synch.h>
>>   #include<asm/asm-compat.h>
>> +#include<asm/tlb.h>
>>
>>   #define __futex_atomic_op(insn, ret, oldval, uaddr, oparg) \
>>     __asm__ __volatile ( \
>> @@ -113,7 +114,15 @@ futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr,
>>           : "cc", "memory");
>>
>>   	*uval = prev;
>> -        return ret;
>> +
>> +	/* Futex assumes the kernel has permission to write to
>> +	 * COW pages, grant the kernel write permission on COW
>> +	 * pages because it has none by default.
>> +	 */
>> +	if (ret == -EFAULT)
>> +		__tlb_fixup_write_permission(current->mm, (unsigned long)uaddr);
>> +
>> +	return ret;
>>   }
>>
>>   #endif /* __KERNEL__ */
>> diff --git a/arch/powerpc/include/asm/tlb.h b/arch/powerpc/include/asm/tlb.h
>> index e2b428b..3863c6a 100644
>> --- a/arch/powerpc/include/asm/tlb.h
>> +++ b/arch/powerpc/include/asm/tlb.h
>> @@ -45,5 +45,30 @@ static inline void __tlb_remove_tlb_entry(struct mmu_gather *tlb, pte_t *ptep,
>>   #endif
>>   }
>>
>> +/* Grant write permission to the kernel on a page. */
>> +static inline void __tlb_fixup_write_permission(struct mm_struct *mm,
>> +						unsigned long address)
>> +{
>> +#if defined(CONFIG_FSL_BOOKE)
>> +	/* Grant write permission to the kernel on a page by setting TLB.SW
>> +	 * bit, the bit setting operation is tricky here, calling
>> +	 * handle_mm_fault with FAULT_FLAG_WRITE causes _PAGE_DIRTY bit of
>> +	 * the pte to be set, the _PAGE_DIRTY of the pte is translated into
>> +	 * TLB.SW on Powerpc e500 core.
>> +	 */
>> +
>> +	struct vm_area_struct *vma;
>> +
>> +	vma = find_vma(mm, address);
> Uhm, find_vma() needs mmap_sem, and futex_atomic_cmpxchg_inatomic() is
> most certainly not called with that lock held.
>

My fault, that will be fixed in the V2 patch.

>> +	if (likely(vma)) {
>> +		/* only fixup present page */
>> +		if (follow_page(vma, address, FOLL_WRITE)) {
>> +			handle_mm_fault(mm, vma, address, FAULT_FLAG_WRITE);
> So how can this toggle your sw dirty/young tracking, that's pretty much
> what gup(.write=1) does too!
>

Because the kernel's read-only permission on the page is invisible
to follow_page(), handle_mm_fault() is never invoked from
__get_user_pages(), so gup(.write=1) cannot fix up the write
permission.


Thanks
Shan Hai

>> +			flush_tlb_page(vma, address);
>> +		}
>> +	}
>> +#endif
>> +}
>> +
>>   #endif /* __KERNEL__ */
>>   #endif /* __ASM_POWERPC_TLB_H */

^ permalink raw reply	[flat|nested] 138+ messages in thread

* Re: [PATCH 1/1] Fixup write permission of TLB on powerpc e500 core
  2011-07-15 15:18       ` Shan Hai
@ 2011-07-15 15:24         ` Peter Zijlstra
  -1 siblings, 0 replies; 138+ messages in thread
From: Peter Zijlstra @ 2011-07-15 15:24 UTC (permalink / raw)
  To: Shan Hai
  Cc: benh, paulus, tglx, walken, dhowells, cmetcalf, tony.luck, akpm,
	linuxppc-dev, linux-kernel

On Fri, 2011-07-15 at 11:18 -0400, Shan Hai wrote:

> >> +	vma = find_vma(mm, address);
> > Uhm, find_vma() needs mmap_sem, and futex_atomic_cmpxchg_inatomic() is
> > most certainly not called with that lock held.
> >
> 
> My fault, that will be fixed in the V2 patch.

But you cannot, the function isn't called _atomic_ just for kicks, its
used while holding spinlocks.

> >> +	if (likely(vma)) {
> >> +		/* only fixup present page */
> >> +		if (follow_page(vma, address, FOLL_WRITE)) {
> >> +			handle_mm_fault(mm, vma, address, FAULT_FLAG_WRITE);
> > So how can this toggle your sw dirty/young tracking, that's pretty much
> > what gup(.write=1) does too!
> >
> 
> because of the kernel read only permission of the page is transparent
> to the follow_page(),  the handle_mm_fault() is not to be activated
> in the __get_use_pages(), so the gup(.write=1) could not help to fixup
> the write permission.

So why do you need the vma? Is it like I wrote earlier that you don't
have spare PTE bits and need the vma flags to see if it may become
writable?

gup(.write=1) not triggering this is a serious problem though, not
something you can just paper over. I wouldn't be at all surprised to
find there's more things broken because of that.

^ permalink raw reply	[flat|nested] 138+ messages in thread

* Re: [PATCH 1/1] Fixup write permission of TLB on powerpc e500 core
@ 2011-07-15 15:24         ` Peter Zijlstra
  0 siblings, 0 replies; 138+ messages in thread
From: Peter Zijlstra @ 2011-07-15 15:24 UTC (permalink / raw)
  To: Shan Hai
  Cc: tony.luck, linux-kernel, cmetcalf, dhowells, paulus, tglx,
	walken, linuxppc-dev, akpm

On Fri, 2011-07-15 at 11:18 -0400, Shan Hai wrote:

> >> +	vma = find_vma(mm, address);
> > Uhm, find_vma() needs mmap_sem, and futex_atomic_cmpxchg_inatomic() is
> > most certainly not called with that lock held.
> >
> 
> My fault, that will be fixed in the V2 patch.

But you cannot, the function isn't called _atomic_ just for kicks, its
used while holding spinlocks.

> >> +	if (likely(vma)) {
> >> +		/* only fixup present page */
> >> +		if (follow_page(vma, address, FOLL_WRITE)) {
> >> +			handle_mm_fault(mm, vma, address, FAULT_FLAG_WRITE);
> > So how can this toggle your sw dirty/young tracking, that's pretty much
> > what gup(.write=1) does too!
> >
> 
> because of the kernel read only permission of the page is transparent
> to the follow_page(),  the handle_mm_fault() is not to be activated
> in the __get_use_pages(), so the gup(.write=1) could not help to fixup
> the write permission.

So why do you need the vma? Is it like I wrote earlier that you don't
have spare PTE bits and need the vma flags to see if it may become
writable?

gup(.write=1) not triggering this is a serious problem though, not
something you can just paper over. I wouldn't be at all surprised to
find there's more things broken because of that.

^ permalink raw reply	[flat|nested] 138+ messages in thread

* Re: [PATCH 0/1] Fixup write permission of TLB on powerpc e500 core
  2011-07-15 10:32               ` David Laight
@ 2011-07-15 15:32                 ` Shan Hai
  -1 siblings, 0 replies; 138+ messages in thread
From: Shan Hai @ 2011-07-15 15:32 UTC (permalink / raw)
  To: David Laight
  Cc: Peter Zijlstra, tony.luck, linux-kernel, cmetcalf, dhowells,
	paulus, tglx, walken, linuxppc-dev, akpm

On 07/15/2011 06:32 AM, David Laight wrote:
>
>> The fault causing futex_atomic_cmpxchg_inatomic() is
>> protected by pagefault_disable(), so the page fault handler has
>> no chance to toggle the SW dirty/young tracking.
> Perhaps that is the bug!
> Whatever pagefault_disable() does, it shouldn't disable the
> SW dirty/young tracking - which should only needs bits moving
> in the page table itself (and TLB update??) rather than any
> operations on the rest of the data areas.
>
> It looks to me as though this could happen any time a page
> is marked inaccessible by the dirty/young tracking.
> Not just as a result of COW.
>

I agree with you, the problem could be triggered by accessing
any user space page which has kernel read only permission
in the page fault disabled context, the problem also affects
architectures which depend on SW dirty/young tracking as
stated by Benjamin in this thread.

In the e500 case, commit 6cfd8990e27d3a491c1c605d6cbc18a46ae51fef
removed the write permission fixup from the TLB miss handlers and left it
to generic code, so it might be the right time to fix up the write
permission here in the generic code.


Thanks
Shan Hai

> 	David
>
>


^ permalink raw reply	[flat|nested] 138+ messages in thread

* Re: [PATCH 0/1] Fixup write permission of TLB on powerpc e500 core
@ 2011-07-15 15:32                 ` Shan Hai
  0 siblings, 0 replies; 138+ messages in thread
From: Shan Hai @ 2011-07-15 15:32 UTC (permalink / raw)
  To: David Laight
  Cc: tony.luck, Peter Zijlstra, linux-kernel, cmetcalf, dhowells,
	paulus, tglx, walken, linuxppc-dev, akpm

On 07/15/2011 06:32 AM, David Laight wrote:
>
>> The fault causing futex_atomic_cmpxchg_inatomic() is
>> protected by pagefault_disable(), so the page fault handler has
>> no chance to toggle the SW dirty/young tracking.
> Perhaps that is the bug!
> Whatever pagefault_disable() does, it shouldn't disable the
> SW dirty/young tracking - which should only needs bits moving
> in the page table itself (and TLB update??) rather than any
> operations on the rest of the data areas.
>
> It looks to me as though this could happen any time a page
> is marked inaccessible by the dirty/young tracking.
> Not just as a result of COW.
>

I agree with you, the problem could be triggered by accessing
any user space page which has kernel read only permission
in the page fault disabled context, the problem also affects
architectures which depend on SW dirty/young tracking as
stated by Benjamin in this thread.

In the e500 case, commit 6cfd8990e27d3a491c1c605d6cbc18a46ae51fef
removed the write permission fixup from the TLB miss handlers and left it
to generic code, so it might be the right time to fix up the write
permission here in the generic code.


Thanks
Shan Hai

> 	David
>
>

^ permalink raw reply	[flat|nested] 138+ messages in thread

* RE: [PATCH 0/1] Fixup write permission of TLB on powerpc e500 core
  2011-07-15 10:32               ` David Laight
@ 2011-07-15 23:47                 ` Benjamin Herrenschmidt
  -1 siblings, 0 replies; 138+ messages in thread
From: Benjamin Herrenschmidt @ 2011-07-15 23:47 UTC (permalink / raw)
  To: David Laight
  Cc: Shan Hai, Peter Zijlstra, tony.luck, linux-kernel, cmetcalf,
	dhowells, paulus, tglx, walken, linuxppc-dev, akpm

On Fri, 2011-07-15 at 11:32 +0100, David Laight wrote:
> > The fault causing futex_atomic_cmpxchg_inatomic() is
> > protected by pagefault_disable(), so the page fault handler has
> > no chance to toggle the SW dirty/young tracking.
> 
> Perhaps that is the bug!
> Whatever pagefault_disable() does, it shouldn't disable the
> SW dirty/young tracking - which should only needs bits moving
> in the page table itself (and TLB update??) rather than any
> operations on the rest of the data areas.
> 
> It looks to me as though this could happen any time a page
> is marked inaccessible by the dirty/young tracking.
> Not just as a result of COW.

Except that for many architectures, there's a hard wired assumption that
the state of the PTEs won't change at interrupt time.

If we allow the "atomic" user accesses, we'll break that rule (think
about perf backtraces for example), and so would have to at -least-
disable interrupts around all the PTE accessors, or use atomic ops,
which will slow things down all over the place.

Cheers,
Ben.



^ permalink raw reply	[flat|nested] 138+ messages in thread

* RE: [PATCH 0/1] Fixup write permission of TLB on powerpc e500 core
@ 2011-07-15 23:47                 ` Benjamin Herrenschmidt
  0 siblings, 0 replies; 138+ messages in thread
From: Benjamin Herrenschmidt @ 2011-07-15 23:47 UTC (permalink / raw)
  To: David Laight
  Cc: tony.luck, Peter Zijlstra, Shan Hai, linux-kernel, cmetcalf,
	dhowells, paulus, tglx, walken, linuxppc-dev, akpm

On Fri, 2011-07-15 at 11:32 +0100, David Laight wrote:
> > The fault causing futex_atomic_cmpxchg_inatomic() is
> > protected by pagefault_disable(), so the page fault handler has
> > no chance to toggle the SW dirty/young tracking.
> 
> Perhaps that is the bug!
> Whatever pagefault_disable() does, it shouldn't disable the
> SW dirty/young tracking - which should only needs bits moving
> in the page table itself (and TLB update??) rather than any
> operations on the rest of the data areas.
> 
> It looks to me as though this could happen any time a page
> is marked inaccessible by the dirty/young tracking.
> Not just as a result of COW.

Except that for many architectures, there's a hard wired assumption that
the state of the PTEs won't change at interrupt time.

If we allow the "atomic" user accesses, we'll break that rule (think
about perf backtraces for example), and so would have to at -least-
disable interrupts around all the PTE accessors, or use atomic ops,
which will slow things down all over the place.

Cheers,
Ben.

^ permalink raw reply	[flat|nested] 138+ messages in thread

* Re: [PATCH 0/1] Fixup write permission of TLB on powerpc e500 core
  2011-07-15 15:32                 ` Shan Hai
@ 2011-07-16  0:20                   ` Benjamin Herrenschmidt
  -1 siblings, 0 replies; 138+ messages in thread
From: Benjamin Herrenschmidt @ 2011-07-16  0:20 UTC (permalink / raw)
  To: Shan Hai
  Cc: David Laight, Peter Zijlstra, tony.luck, linux-kernel, cmetcalf,
	dhowells, paulus, tglx, walken, linuxppc-dev, akpm

On Fri, 2011-07-15 at 11:32 -0400, Shan Hai wrote:
> 
> I agree with you, the problem could be triggered by accessing
> any user space page which has kernel read only permission
> in the page fault disabled context, the problem also affects
> architectures which depend on SW dirty/young tracking as
> stated by Benjamin in this thread.
> 
> In the e500 case, the commit 6cfd8990e27d3a491c1c605d6cbc18a46ae51fef
> removed the write permission fixup from TLB miss handlers and left it to
> generic code, so it might be right time to fixup the write permission here
> in the generic code.

But we can't. We must not modify the PTE from an interrupt context, and
the "atomic" variants of user accesses can be called in such contexts.

I think the problem is that we try to actually do things other than just
"peek" at user memory (for backtraces etc...) but actually useful things
in page fault disabled contexts. That's bad and various archs mm were
designed with the assumption that this never happens.

If the futex case is seldom here, we could probably find a way to work
around in that specific case.

However, I -still- don't understand why gup didn't fixup the write
permission. gup doesn't set dirty ?

Cheers,
Ben.



^ permalink raw reply	[flat|nested] 138+ messages in thread

* Re: [PATCH 0/1] Fixup write permission of TLB on powerpc e500 core
@ 2011-07-16  0:20                   ` Benjamin Herrenschmidt
  0 siblings, 0 replies; 138+ messages in thread
From: Benjamin Herrenschmidt @ 2011-07-16  0:20 UTC (permalink / raw)
  To: Shan Hai
  Cc: tony.luck, Peter Zijlstra, linux-kernel, cmetcalf, dhowells,
	David Laight, paulus, tglx, walken, linuxppc-dev, akpm

On Fri, 2011-07-15 at 11:32 -0400, Shan Hai wrote:
> 
> I agree with you, the problem could be triggered by accessing
> any user space page which has kernel read only permission
> in the page fault disabled context, the problem also affects
> architectures which depend on SW dirty/young tracking as
> stated by Benjamin in this thread.
> 
> In the e500 case, the commit 6cfd8990e27d3a491c1c605d6cbc18a46ae51fef
> removed the write permission fixup from TLB miss handlers and left it to
> generic code, so it might be right time to fixup the write permission here
> in the generic code.

But we can't. We must not modify the PTE from an interrupt context, and
the "atomic" variants of user accesses can be called in such contexts.

I think the problem is that we try to actually do things other than just
"peek" at user memory (for backtraces etc...) but actually useful things
in page fault disabled contexts. That's bad and various archs mm were
designed with the assumption that this never happens.

If the futex case is seldom here, we could probably find a way to work
around in that specific case.

However, I -still- don't understand why gup didn't fixup the write
permission. gup doesn't set dirty ?

Cheers,
Ben.

^ permalink raw reply	[flat|nested] 138+ messages in thread

* Re: [PATCH 1/1] Fixup write permission of TLB on powerpc e500 core
  2011-07-15 10:23     ` Peter Zijlstra
@ 2011-07-16 14:50       ` Shan Hai
  -1 siblings, 0 replies; 138+ messages in thread
From: Shan Hai @ 2011-07-16 14:50 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: benh, paulus, tglx, walken, dhowells, cmetcalf, tony.luck, akpm,
	linuxppc-dev, linux-kernel

On 07/15/2011 06:23 AM, Peter Zijlstra wrote:
> On Fri, 2011-07-15 at 16:07 +0800, Shan Hai wrote:
>> The kernel has no write permission on COW pages by default on e500 core, this
>> will cause endless loop in futex_lock_pi, because futex code assumes the kernel
>> has write permission on COW pages. Grant write permission to the kernel on COW
>> pages when access violation page fault occurs.
>>
>> Signed-off-by: Shan Hai<haishan.bai@gmail.com>
>> ---
>>   arch/powerpc/include/asm/futex.h |   11 ++++++++++-
>>   arch/powerpc/include/asm/tlb.h   |   25 +++++++++++++++++++++++++
>>   2 files changed, 35 insertions(+), 1 deletions(-)
>>
>> diff --git a/arch/powerpc/include/asm/futex.h b/arch/powerpc/include/asm/futex.h
>> index c94e4a3..54c3e74 100644
>> --- a/arch/powerpc/include/asm/futex.h
>> +++ b/arch/powerpc/include/asm/futex.h
>> @@ -8,6 +8,7 @@
>>   #include<asm/errno.h>
>>   #include<asm/synch.h>
>>   #include<asm/asm-compat.h>
>> +#include<asm/tlb.h>
>>
>>   #define __futex_atomic_op(insn, ret, oldval, uaddr, oparg) \
>>     __asm__ __volatile ( \
>> @@ -113,7 +114,15 @@ futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr,
>>           : "cc", "memory");
>>
>>   	*uval = prev;
>> -        return ret;
>> +
>> +	/* Futex assumes the kernel has permission to write to
>> +	 * COW pages, grant the kernel write permission on COW
>> +	 * pages because it has none by default.
>> +	 */
>> +	if (ret == -EFAULT)
>> +		__tlb_fixup_write_permission(current->mm, (unsigned long)uaddr);
>> +
>> +	return ret;
>>   }
>>
>>   #endif /* __KERNEL__ */
>> diff --git a/arch/powerpc/include/asm/tlb.h b/arch/powerpc/include/asm/tlb.h
>> index e2b428b..3863c6a 100644
>> --- a/arch/powerpc/include/asm/tlb.h
>> +++ b/arch/powerpc/include/asm/tlb.h
>> @@ -45,5 +45,30 @@ static inline void __tlb_remove_tlb_entry(struct mmu_gather *tlb, pte_t *ptep,
>>   #endif
>>   }
>>
>> +/* Grant write permission to the kernel on a page. */
>> +static inline void __tlb_fixup_write_permission(struct mm_struct *mm,
>> +						unsigned long address)
>> +{
>> +#if defined(CONFIG_FSL_BOOKE)
>> +	/* Grant write permission to the kernel on a page by setting TLB.SW
>> +	 * bit, the bit setting operation is tricky here, calling
>> +	 * handle_mm_fault with FAULT_FLAG_WRITE causes _PAGE_DIRTY bit of
>> +	 * the pte to be set, the _PAGE_DIRTY of the pte is translated into
>> +	 * TLB.SW on Powerpc e500 core.
>> +	 */
>> +
>> +	struct vm_area_struct *vma;
>> +
>> +	vma = find_vma(mm, address);
> Uhm, find_vma() needs mmap_sem, and futex_atomic_cmpxchg_inatomic() is
> most certainly not called with that lock held.
>
>> +	if (likely(vma)) {
>> +		/* only fixup present page */
>> +		if (follow_page(vma, address, FOLL_WRITE)) {
>> +			handle_mm_fault(mm, vma, address, FAULT_FLAG_WRITE);
> So how can this toggle your sw dirty/young tracking, that's pretty much
> what gup(.write=1) does too!
>

That's right, gup(.write=1) wants to do the same thing as the
above code snippet, but it fails for the following reason:
get_user_pages() does not dirty the pte, because follow_page()
returns non-NULL for a *present* and *writable* page. The page
which holds the lock is present because it's a shared page, and
writable because demand paging set it up that way for a shared
writable page, so the handle_mm_fault() in __get_user_pages()
is never called.

The above code can do the task because calling handle_mm_fault()
directly sets the pte dirty via
[do_anonymous_page(), memory.c]
if (vma->vm_flags & VM_WRITE)
                 entry = pte_mkwrite(pte_mkdirty(entry));

Thanks
Shan Hai

>> +			flush_tlb_page(vma, address);
>> +		}
>> +	}
>> +#endif
>> +}
>> +
>>   #endif /* __KERNEL__ */
>>   #endif /* __ASM_POWERPC_TLB_H */


^ permalink raw reply	[flat|nested] 138+ messages in thread

* Re: [PATCH 1/1] Fixup write permission of TLB on powerpc e500 core
@ 2011-07-16 14:50       ` Shan Hai
  0 siblings, 0 replies; 138+ messages in thread
From: Shan Hai @ 2011-07-16 14:50 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: tony.luck, linux-kernel, cmetcalf, dhowells, paulus, tglx,
	walken, linuxppc-dev, akpm

On 07/15/2011 06:23 AM, Peter Zijlstra wrote:
> On Fri, 2011-07-15 at 16:07 +0800, Shan Hai wrote:
>> The kernel has no write permission on COW pages by default on e500 core, this
>> will cause endless loop in futex_lock_pi, because futex code assumes the kernel
>> has write permission on COW pages. Grant write permission to the kernel on COW
>> pages when access violation page fault occurs.
>>
>> Signed-off-by: Shan Hai<haishan.bai@gmail.com>
>> ---
>>   arch/powerpc/include/asm/futex.h |   11 ++++++++++-
>>   arch/powerpc/include/asm/tlb.h   |   25 +++++++++++++++++++++++++
>>   2 files changed, 35 insertions(+), 1 deletions(-)
>>
>> diff --git a/arch/powerpc/include/asm/futex.h b/arch/powerpc/include/asm/futex.h
>> index c94e4a3..54c3e74 100644
>> --- a/arch/powerpc/include/asm/futex.h
>> +++ b/arch/powerpc/include/asm/futex.h
>> @@ -8,6 +8,7 @@
>>   #include<asm/errno.h>
>>   #include<asm/synch.h>
>>   #include<asm/asm-compat.h>
>> +#include<asm/tlb.h>
>>
>>   #define __futex_atomic_op(insn, ret, oldval, uaddr, oparg) \
>>     __asm__ __volatile ( \
>> @@ -113,7 +114,15 @@ futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr,
>>           : "cc", "memory");
>>
>>   	*uval = prev;
>> -        return ret;
>> +
>> +	/* Futex assumes the kernel has permission to write to
>> +	 * COW pages, grant the kernel write permission on COW
>> +	 * pages because it has none by default.
>> +	 */
>> +	if (ret == -EFAULT)
>> +		__tlb_fixup_write_permission(current->mm, (unsigned long)uaddr);
>> +
>> +	return ret;
>>   }
>>
>>   #endif /* __KERNEL__ */
>> diff --git a/arch/powerpc/include/asm/tlb.h b/arch/powerpc/include/asm/tlb.h
>> index e2b428b..3863c6a 100644
>> --- a/arch/powerpc/include/asm/tlb.h
>> +++ b/arch/powerpc/include/asm/tlb.h
>> @@ -45,5 +45,30 @@ static inline void __tlb_remove_tlb_entry(struct mmu_gather *tlb, pte_t *ptep,
>>   #endif
>>   }
>>
>> +/* Grant write permission to the kernel on a page. */
>> +static inline void __tlb_fixup_write_permission(struct mm_struct *mm,
>> +						unsigned long address)
>> +{
>> +#if defined(CONFIG_FSL_BOOKE)
>> +	/* Grant write permission to the kernel on a page by setting TLB.SW
>> +	 * bit, the bit setting operation is tricky here, calling
>> +	 * handle_mm_fault with FAULT_FLAG_WRITE causes _PAGE_DIRTY bit of
>> +	 * the pte to be set, the _PAGE_DIRTY of the pte is translated into
>> +	 * TLB.SW on Powerpc e500 core.
>> +	 */
>> +
>> +	struct vm_area_struct *vma;
>> +
>> +	vma = find_vma(mm, address);
> Uhm, find_vma() needs mmap_sem, and futex_atomic_cmpxchg_inatomic() is
> most certainly not called with that lock held.
>
>> +	if (likely(vma)) {
>> +		/* only fixup present page */
>> +		if (follow_page(vma, address, FOLL_WRITE)) {
>> +			handle_mm_fault(mm, vma, address, FAULT_FLAG_WRITE);
> So how can this toggle your sw dirty/young tracking, that's pretty much
> what gup(.write=1) does too!
>

That's right, gup(.write=1) wants to do the same thing as the
above code snippet, but it fails for the following reason:
get_user_pages() does not dirty the pte, because follow_page()
returns non-NULL for a *present* and *writable* page. The page
which holds the lock is present because it's a shared page, and
writable because demand paging set it up that way for a shared
writable page, so the handle_mm_fault() in __get_user_pages()
is never called.

The above code can do the task because calling handle_mm_fault()
directly sets the pte dirty via
[do_anonymous_page(), memory.c]
if (vma->vm_flags & VM_WRITE)
                 entry = pte_mkwrite(pte_mkdirty(entry));

Thanks
Shan Hai

>> +			flush_tlb_page(vma, address);
>> +		}
>> +	}
>> +#endif
>> +}
>> +
>>   #endif /* __KERNEL__ */
>>   #endif /* __ASM_POWERPC_TLB_H */

^ permalink raw reply	[flat|nested] 138+ messages in thread

* Re: [PATCH 0/1] Fixup write permission of TLB on powerpc e500 core
  2011-07-16  0:20                   ` Benjamin Herrenschmidt
@ 2011-07-16 15:03                     ` Shan Hai
  -1 siblings, 0 replies; 138+ messages in thread
From: Shan Hai @ 2011-07-16 15:03 UTC (permalink / raw)
  To: Benjamin Herrenschmidt
  Cc: David Laight, Peter Zijlstra, tony.luck, linux-kernel, cmetcalf,
	dhowells, paulus, tglx, walken, linuxppc-dev, akpm

On 07/15/2011 08:20 PM, Benjamin Herrenschmidt wrote:
> On Fri, 2011-07-15 at 11:32 -0400, Shan Hai wrote:
>> I agree with you, the problem could be triggered by accessing
>> any user space page which has kernel read only permission
>> in the page fault disabled context, the problem also affects
>> architectures which depend on SW dirty/young tracking as
>> stated by Benjamin in this thread.
>>
>> In the e500 case, the commit 6cfd8990e27d3a491c1c605d6cbc18a46ae51fef
>> removed the write permission fixup from TLB miss handlers and left it to
>> generic code, so it might be right time to fixup the write permission here
>> in the generic code.
> But we can't. The must not modify the PTE from an interrupt context and
> the "atomic" variants of user accesses can be called in such contexts.
>
> I think the problem is that we try to actually do things other than just
> "peek" at user memory (for backtraces etc...) but actually useful things
> in page fault disabled contexts. That's bad and various archs mm were
> designed with the assumption that this never happens.
>

Yes I understood, the *here* above means 'generic code' like futex code,
I am sorry for my ambiguous description.

> If the futex case is seldom here, we could probably find a way to work
> around in that specific case.
>

That's what my patch wants to do.

> However, I -still- don't understand why gup didn't fixup the write
> permission. gup doesn't set dirty ?
>

Yep, gup doesn't set dirty, because when the page fault
occurs on the kernel accessing a user page which is
read-only to the kernel, the following conditions hold:
- the page is present, because it's a shared page
- the page is writable, because demand paging
     sets up the pte for the current process that way

The follow_page() called in __get_user_pages()
returns non-NULL to its caller for the above-mentioned
present and writable page, so gup(.write=1) has no
chance to set the pte dirty by calling handle_mm_fault().

Thanks
Shan Hai
> Cheers,
> Ben.
>
>


^ permalink raw reply	[flat|nested] 138+ messages in thread

* Re: [PATCH 0/1] Fixup write permission of TLB on powerpc e500 core
@ 2011-07-16 15:03                     ` Shan Hai
  0 siblings, 0 replies; 138+ messages in thread
From: Shan Hai @ 2011-07-16 15:03 UTC (permalink / raw)
  To: Benjamin Herrenschmidt
  Cc: tony.luck, Peter Zijlstra, linux-kernel, cmetcalf, dhowells,
	David Laight, paulus, tglx, walken, linuxppc-dev, akpm

On 07/15/2011 08:20 PM, Benjamin Herrenschmidt wrote:
> On Fri, 2011-07-15 at 11:32 -0400, Shan Hai wrote:
>> I agree with you, the problem could be triggered by accessing
>> any user space page which has kernel read only permission
>> in the page fault disabled context, the problem also affects
>> architectures which depend on SW dirty/young tracking as
>> stated by Benjamin in this thread.
>>
>> In the e500 case, the commit 6cfd8990e27d3a491c1c605d6cbc18a46ae51fef
>> removed the write permission fixup from TLB miss handlers and left it to
>> generic code, so it might be right time to fixup the write permission here
>> in the generic code.
> But we can't. We must not modify the PTE from an interrupt context and
> the "atomic" variants of user accesses can be called in such contexts.
>
> I think the problem is that we try to actually do things other than just
> "peek" at user memory (for backtraces etc...) but actually useful things
> in page fault disabled contexts. That's bad and various archs mm were
> designed with the assumption that this never happens.
>

Yes I understood, the *here* above means 'generic code' like futex code,
I am sorry for my ambiguous description.

> If the futex case is seldom here, we could probably find a way to work
> around in that specific case.
>

That's what my patch wants to do.

> However, I -still- don't understand why gup didn't fixup the write
> permission. gup doesn't set dirty ?
>

Yep, gup doesn't set dirty, because when the page fault
occurs on the kernel accessing a user page which is
read only to the kernel the following conditions hold,
- the page is present, because its a shared page
- the page is writable, because demand paging
     sets up the pte for the current process to be so

The follow_page() called in the __get_user_page()
returns non NULL to its caller on the above mentioned
present and writable page, so the gup(.write=1) has no
chance to set pte dirty by calling handle_mm_fault

Thanks
Shan Hai
s
> Cheers,
> Ben.
>
>

^ permalink raw reply	[flat|nested] 138+ messages in thread

* Re: [PATCH 1/1] Fixup write permission of TLB on powerpc e500 core
  2011-07-15 15:24         ` Peter Zijlstra
@ 2011-07-16 15:36           ` Shan Hai
  -1 siblings, 0 replies; 138+ messages in thread
From: Shan Hai @ 2011-07-16 15:36 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: benh, paulus, tglx, walken, dhowells, cmetcalf, tony.luck, akpm,
	linuxppc-dev, linux-kernel

On 07/15/2011 11:24 AM, Peter Zijlstra wrote:
> On Fri, 2011-07-15 at 11:18 -0400, Shan Hai wrote:
>
>>>> +	vma = find_vma(mm, address);
>>> Uhm, find_vma() needs mmap_sem, and futex_atomic_cmpxchg_inatomic() is
>>> most certainly not called with that lock held.
>>>
>> My fault, that will be fixed in the V2 patch.
> But you cannot, the function isn't called _atomic_ just for kicks, its
> used while holding spinlocks.
>

Yes we can do that, _atomic_ here is just atomic for cmpxchg
implemented by the combination of 'lwarx' and 'stwcx.' instructions
as done in the spin lock implementation, so even we hold the
mmap_sem that has no impact on the _atomic_ feature of the
futex_atomic_cmpxchg_inatomic().

>>>> +	if (likely(vma)) {
>>>> +		/* only fixup present page */
>>>> +		if (follow_page(vma, address, FOLL_WRITE)) {
>>>> +			handle_mm_fault(mm, vma, address, FAULT_FLAG_WRITE);
>>> So how can this toggle your sw dirty/young tracking, that's pretty much
>>> what gup(.write=1) does too!
>>>
>> because of the kernel read only permission of the page is transparent
>> to the follow_page(),  the handle_mm_fault() is not to be activated
>> in the __get_user_pages(), so the gup(.write=1) could not help to fixup
>> the write permission.
> So why do you need the vma? Is it like I wrote earlier that you don't
> have spare PTE bits and need the vma flags to see if it may become
> writable?
>

Need vma for the reason to call handle_mm_fault(), that's all.

> gup(.write=1) not triggering this is a serious problem though, not
> something you can just paper over. I wouldn't be at all surprised to
> find there's more things broken because of that.

In my opinion another solution might be check the read only for kernel
feature of a page in the follow_page() on gup(.write=1) to avoid this
problem on all architectures.

Thanks
Shan Hai


^ permalink raw reply	[flat|nested] 138+ messages in thread

* Re: [PATCH 1/1] Fixup write permission of TLB on powerpc e500 core
@ 2011-07-16 15:36           ` Shan Hai
  0 siblings, 0 replies; 138+ messages in thread
From: Shan Hai @ 2011-07-16 15:36 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: tony.luck, linux-kernel, cmetcalf, dhowells, paulus, tglx,
	walken, linuxppc-dev, akpm

On 07/15/2011 11:24 AM, Peter Zijlstra wrote:
> On Fri, 2011-07-15 at 11:18 -0400, Shan Hai wrote:
>
>>>> +	vma = find_vma(mm, address);
>>> Uhm, find_vma() needs mmap_sem, and futex_atomic_cmpxchg_inatomic() is
>>> most certainly not called with that lock held.
>>>
>> My fault, that will be fixed in the V2 patch.
> But you cannot, the function isn't called _atomic_ just for kicks, its
> used while holding spinlocks.
>

Yes we can do that, _atomic_ here is just atomic for cmpxchg
implemented by the combination of 'lwarx' and 'stwcx.' instructions
as done in the spin lock implementation, so even we hold the
mmap_sem that has no impact on the _atomic_ feature of the
futex_atomic_cmpxchg_inatomic().

>>>> +	if (likely(vma)) {
>>>> +		/* only fixup present page */
>>>> +		if (follow_page(vma, address, FOLL_WRITE)) {
>>>> +			handle_mm_fault(mm, vma, address, FAULT_FLAG_WRITE);
>>> So how can this toggle your sw dirty/young tracking, that's pretty much
>>> what gup(.write=1) does too!
>>>
>> because of the kernel read only permission of the page is transparent
>> to the follow_page(),  the handle_mm_fault() is not to be activated
>> in the __get_user_pages(), so the gup(.write=1) could not help to fixup
>> the write permission.
> So why do you need the vma? Is it like I wrote earlier that you don't
> have spare PTE bits and need the vma flags to see if it may become
> writable?
>

Need vma for the reason to call handle_mm_fault(), that's all.

> gup(.write=1) not triggering this is a serious problem though, not
> something you can just paper over. I wouldn't be at all surprised to
> find there's more things broken because of that.

In my opinion another solution might be check the read only for kernel
feature of a page in the follow_page() on gup(.write=1) to avoid this
problem on all architectures.

Thanks
Shan Hai

^ permalink raw reply	[flat|nested] 138+ messages in thread

* Re: [PATCH 1/1] Fixup write permission of TLB on powerpc e500 core
  2011-07-16 14:50       ` Shan Hai
@ 2011-07-16 23:49         ` Benjamin Herrenschmidt
  -1 siblings, 0 replies; 138+ messages in thread
From: Benjamin Herrenschmidt @ 2011-07-16 23:49 UTC (permalink / raw)
  To: Shan Hai
  Cc: Peter Zijlstra, paulus, tglx, walken, dhowells, cmetcalf,
	tony.luck, akpm, linuxppc-dev, linux-kernel

On Sat, 2011-07-16 at 10:50 -0400, Shan Hai wrote:
> 
> That's right the gup(.write=1) want to do the same thing as the
> above code snippet, but it failed for the following reason:
> because the get_user_pages() would not dirty pte for the reason
> the follow_page() returns not NULL on *present* and *writable*
> page, the page which holds the lock is present because its a shared
> page,
> writable because demand paging set that up so for shared
> writable page, so the handle_mm_fault() in the __get_user_page()
> could not be called.
> 
> Why the above code could do the same task, because by calling
> handle_mm_fault() will set pte dirty by
> [do_anonymous_page(), memory.c]
> if (vma->vm_flags & VM_WRITE)
>                  entry = pte_mkwrite(pte_mkdirty(entry));
> 

Right. gup won't set page_dirty, it expects the caller to do so (in case
it doesn't dirty all the gup'ed pages I suppose).

You could probably fix the problem here by setting dirty after gup in
the futex code if you know you're going to write. You must do that with
the PTE lock held though and -not- at interrupt time.

Note however that the exact same problem exist with normal "read"
accesses and page_young (_PAGE_ACCESSED on powerpc). The page will not
be accessible until that bit is set and it's set by SW.

As I wrote earlier, fixing that by making "atomic" page faults perform
the dirty/accessed tracking is not right, since such faults can happen
at interrupt time and the PTE lock cannot be taken at interrupt time.

IE. The implementation of those "SW" TLB archs heavily relies on the PTE
lock to serialize write access to the PTE and writing it outside of that
lock would do really bad things.

So there's a deeper problem here. The whole user access "in atomic"
concept is by itself a violation of some of the basic access rules of
user memory that have existed from day 1 of the kernel. That we allow it
for semi-harmless (and allowed to fail) things like snapshot of
backtraces for perf is one thing, but relying on it for the futex case
like that is not going to fly very well. I sincerely hope that this kind
of usage is not going to become a habit.

In the meantime, other than rewriting the futex code to not require
those in-atomic accesses (can't it just access the pages via the linear
mapping and/or kmap after the gup ?), all I see would be a way to force
dirty and young after gup, with appropriate locks, or a variant of gup
(via a flag ?) to require it to do so.

Cheers,
Ben.


^ permalink raw reply	[flat|nested] 138+ messages in thread

* Re: [PATCH 1/1] Fixup write permission of TLB on powerpc e500 core
@ 2011-07-16 23:49         ` Benjamin Herrenschmidt
  0 siblings, 0 replies; 138+ messages in thread
From: Benjamin Herrenschmidt @ 2011-07-16 23:49 UTC (permalink / raw)
  To: Shan Hai
  Cc: tony.luck, Peter Zijlstra, linux-kernel, cmetcalf, dhowells,
	paulus, tglx, walken, linuxppc-dev, akpm

On Sat, 2011-07-16 at 10:50 -0400, Shan Hai wrote:
> 
> That's right the gup(.write=1) want to do the same thing as the
> above code snippet, but it failed for the following reason:
> because the get_user_pages() would not dirty pte for the reason
> the follow_page() returns not NULL on *present* and *writable*
> page, the page which holds the lock is present because its a shared
> page,
> writable because demand paging set that up so for shared
> writable page, so the handle_mm_fault() in the __get_user_page()
> could not be called.
> 
> Why the above code could do the same task, because by calling
> handle_mm_fault() will set pte dirty by
> [do_anonymous_page(), memory.c]
> if (vma->vm_flags & VM_WRITE)
>                  entry = pte_mkwrite(pte_mkdirty(entry));
> 

Right. gup won't set page_dirty, it expects the caller to do so (in case
it doesn't dirty all the gup'ed pages I suppose).

You could probably fix the problem here by setting dirty after gup in
the futex code if you know you're going to write. You must do that with
the PTE lock held though and -not- at interrupt time.

Note however that the exact same problem exist with normal "read"
accesses and page_young (_PAGE_ACCESSED on powerpc). The page will not
be accessible until that bit is set and it's set by SW.

As I wrote earlier, fixing that by making "atomic" page faults perform
the dirty/accessed tracking is not right, since such faults can happen
at interrupt time and the PTE lock cannot be taken at interrupt time.

IE. The implementation of those "SW" TLB archs heavily relies on the PTE
lock to serialize write access to the PTE and writing it outside of that
lock would do really bad things.

So there's a deeper problem here. The whole user access "in atomic"
concept is by itself a violation of some of the basic access rules of
user memory that have existed from day 1 of the kernel. That we allow it
for semi-harmless (and allowed to fail) things like snapshot of
backtraces for perf is one thing, but relying on it for the futex case
like that is not going to fly very well. I sincerely hope that this kind
of usage is not going to become a habit.

In the meantime, other than rewriting the futex code to not require
those in-atomic accesses (can't it just access the pages via the linear
mapping and/or kmap after the gup ?), all I see would be a way to force
dirty and young after gup, with appropriate locks, or a variant of gup
(via a flag ?) to require it to do so.

Cheers,
Ben.

^ permalink raw reply	[flat|nested] 138+ messages in thread

* Re: [PATCH 1/1] Fixup write permission of TLB on powerpc e500 core
  2011-07-16 23:49         ` Benjamin Herrenschmidt
@ 2011-07-17  9:38           ` Peter Zijlstra
  -1 siblings, 0 replies; 138+ messages in thread
From: Peter Zijlstra @ 2011-07-17  9:38 UTC (permalink / raw)
  To: Benjamin Herrenschmidt, Shan Hai
  Cc: Peter Zijlstra, paulus, tglx, walken, dhowells, cmetcalf,
	tony.luck, akpm, linuxppc-dev, linux-kernel

Benjamin Herrenschmidt <benh@kernel.crashing.org> wrote:

>On Sat, 2011-07-16 at 10:50 -0400, Shan Hai wrote:
>> 
>> That's right the gup(.write=1) want to do the same thing as the
>> above code snippet, but it failed for the following reason:
>> because the get_user_pages() would not dirty pte for the reason
>> the follow_page() returns not NULL on *present* and *writable*
>> page, the page which holds the lock is present because its a shared
>> page,
>> writable because demand paging set that up so for shared
>> writable page, so the handle_mm_fault() in the __get_user_page()
>> could not be called.
>> 
>> Why the above code could do the same task, because by calling
>> handle_mm_fault() will set pte dirty by
>> [do_anonymous_page(), memory.c]
>> if (vma->vm_flags & VM_WRITE)
>>                  entry = pte_mkwrite(pte_mkdirty(entry));
>> 
>
>Right. gup won't set page_dirty, it expects the caller to do so (in
>case
>it doesn't dirty all the gup'ed pages I suppose).
>
>You could probably fix the problem here by setting dirty after gup in
>the futex code if you know you're going to write. You must do that with
>the PTE lock held though and -not- at interrupt time.
>
>Note however that the exact same problem exist with normal "read"
>accesses and page_young (_PAGE_ACCESSED on powerpc). The page will not
>be accessible until that bit is set and it's set by SW.
>
>As I wrote earlier, fixing that by making "atomic" page faults perform
>the dirty/accessed tracking is not right, since such faults can happen
>at interrupt time and the PTE lock cannot be taken at interrupt time.
>
>IE. The implementation of those "SW" TLB archs heavily relies on the
>PTE
>lock to serialize write access to the PTE and writing it outside of
>that
>lock would do really bad things.
>
>So there's a deeper problem here. The whole user access "in atomic"
>concept is by itself a violation of some of the basic access rules of
>user memory that have existed from day 1 of the kernel. That we allow
>it
>for semi-harmless (and allowed to fail) things like snapshot of
>backtraces for perf is one thing, but relying on it for the futex case
>like that is not going to fly very well. I sincerely hope that this
>kind
>of usage is not going to become a habit.
>
>In the meantime, other than rewriting the futex code to not require
>those in-atomic accesses (can't it just access the pages via the linear
>mapping and/or kmap after the gup ?), all I see would be a way to force
>dirty and young after gup, with appropriate locks, or a variant of gup
>(via a flag ?) to require it to do so.
>
>Cheers,
>Ben.

Whats this talk about interrupt context? There is none of that involved.

Furthermore, I still dont see the problem. The futex code is optimistically trying to poke at user memory while holding spinlocks.

We fully expect that to fail, hence the error path that drops all locks and does the gup(.write=1) to fix up the mapping after which we try again.

This has worked for years, its by no means new code. Nor do I see how its broken.
-- 
Sent from my Android phone with K-9 Mail. Please excuse my brevity.

^ permalink raw reply	[flat|nested] 138+ messages in thread

* Re: [PATCH 1/1] Fixup write permission of TLB on powerpc e500 core
@ 2011-07-17  9:38           ` Peter Zijlstra
  0 siblings, 0 replies; 138+ messages in thread
From: Peter Zijlstra @ 2011-07-17  9:38 UTC (permalink / raw)
  To: Benjamin Herrenschmidt, Shan Hai
  Cc: tony.luck, Peter Zijlstra, linux-kernel, cmetcalf, dhowells,
	paulus, tglx, walken, linuxppc-dev, akpm

Benjamin Herrenschmidt <benh@kernel.crashing.org> wrote:

>On Sat, 2011-07-16 at 10:50 -0400, Shan Hai wrote:
>> 
>> That's right the gup(.write=1) want to do the same thing as the
>> above code snippet, but it failed for the following reason:
>> because the get_user_pages() would not dirty pte for the reason
>> the follow_page() returns not NULL on *present* and *writable*
>> page, the page which holds the lock is present because its a shared
>> page,
>> writable because demand paging set that up so for shared
>> writable page, so the handle_mm_fault() in the __get_user_page()
>> could not be called.
>> 
>> Why the above code could do the same task, because by calling
>> handle_mm_fault() will set pte dirty by
>> [do_anonymous_page(), memory.c]
>> if (vma->vm_flags & VM_WRITE)
>>                  entry = pte_mkwrite(pte_mkdirty(entry));
>> 
>
>Right. gup won't set page_dirty, it expects the caller to do so (in
>case
>it doesn't dirty all the gup'ed pages I suppose).
>
>You could probably fix the problem here by setting dirty after gup in
>the futex code if you know you're going to write. You must do that with
>the PTE lock held though and -not- at interrupt time.
>
>Note however that the exact same problem exist with normal "read"
>accesses and page_young (_PAGE_ACCESSED on powerpc). The page will not
>be accessible until that bit is set and it's set by SW.
>
>As I wrote earlier, fixing that by making "atomic" page faults perform
>the dirty/accessed tracking is not right, since such faults can happen
>at interrupt time and the PTE lock cannot be taken at interrupt time.
>
>IE. The implementation of those "SW" TLB archs heavily relies on the
>PTE
>lock to serialize write access to the PTE and writing it outside of
>that
>lock would do really bad things.
>
>So there's a deeper problem here. The whole user access "in atomic"
>concept is by itself a violation of some of the basic access rules of
>user memory that have existed from day 1 of the kernel. That we allow
>it
>for semi-harmless (and allowed to fail) things like snapshot of
>backtraces for perf is one thing, but relying on it for the futex case
>like that is not going to fly very well. I sincerely hope that this
>kind
>of usage is not going to become a habit.
>
>In the meantime, other than rewriting the futex code to not require
>those in-atomic accesses (can't it just access the pages via the linear
>mapping and/or kmap after the gup ?), all I see would be a way to force
>dirty and young after gup, with appropriate locks, or a variant of gup
>(via a flag ?) to require it to do so.
>
>Cheers,
>Ben.

Whats this talk about interrupt context? There is none of that involved.

Furthermore, I still dont see the problem. The futex code is optimistically trying to poke at user memory while holding spinlocks.

We fully expect that to fail, hence the error path that drops all locks and does the gup(.write=1) to fix up the mapping after which we try again.

This has worked for years, its by no means new code. Nor do I see how its broken.
-- 
Sent from my Android phone with K-9 Mail. Please excuse my brevity.

^ permalink raw reply	[flat|nested] 138+ messages in thread

* Re: [PATCH 1/1] Fixup write permission of TLB on powerpc e500 core
  2011-07-16 23:49         ` Benjamin Herrenschmidt
@ 2011-07-17 11:02           ` Peter Zijlstra
  -1 siblings, 0 replies; 138+ messages in thread
From: Peter Zijlstra @ 2011-07-17 11:02 UTC (permalink / raw)
  To: Benjamin Herrenschmidt
  Cc: Shan Hai, paulus, tglx, walken, dhowells, cmetcalf, tony.luck,
	akpm, linuxppc-dev, linux-kernel

On Sun, 2011-07-17 at 09:49 +1000, Benjamin Herrenschmidt wrote:
> In the meantime, other than rewriting the futex code to not require
> those in-atomic accesses (can't it just access the pages via the linear
> mapping and/or kmap after the gup ?),

That'll wreck performance on things like ARM and SPARC that have to deal
with cache aliasing.

>  all I see would be a way to force
> dirty and young after gup, with appropriate locks, or a variant of gup
> (via a flag ?) to require it to do so. 

Again, _WHY_ isn't gup(.write=1) a complete write fault? Its supposed to
be, it needs to break COW, do dirty page tracking and call page_mkwrite.
I'm still thinking this e500 stuff is smoking crack.

ARM has no hardware dirty bit either, and yet it works for them. I can't
exactly tell how because I got lost in there, but it does, again,
suggest e500 is on crack.

^ permalink raw reply	[flat|nested] 138+ messages in thread

* Re: [PATCH 1/1] Fixup write permission of TLB on powerpc e500 core
@ 2011-07-17 11:02           ` Peter Zijlstra
  0 siblings, 0 replies; 138+ messages in thread
From: Peter Zijlstra @ 2011-07-17 11:02 UTC (permalink / raw)
  To: Benjamin Herrenschmidt
  Cc: tony.luck, Shan Hai, linux-kernel, cmetcalf, dhowells, paulus,
	tglx, walken, linuxppc-dev, akpm

On Sun, 2011-07-17 at 09:49 +1000, Benjamin Herrenschmidt wrote:
> In the meantime, other than rewriting the futex code to not require
> those in-atomic accesses (can't it just access the pages via the linear
> mapping and/or kmap after the gup ?),

That'll wreck performance on things like ARM and SPARC that have to deal
with cache aliasing.

>  all I see would be a way to force
> dirty and young after gup, with appropriate locks, or a variant of gup
> (via a flag ?) to require it to do so.

Again, _WHY_ isn't gup(.write=1) a complete write fault? Its supposed to
be, it needs to break COW, do dirty page tracking and call page_mkwrite.
I'm still thinking this e500 stuff is smoking crack.

ARM has no hardware dirty bit either, and yet it works for them. I can't
exactly tell how because I got lost in there, but it does, again,
suggest e500 is on crack.

^ permalink raw reply	[flat|nested] 138+ messages in thread

* Re: [PATCH 1/1] Fixup write permission of TLB on powerpc e500 core
  2011-07-17 11:02           ` Peter Zijlstra
@ 2011-07-17 13:33             ` Shan Hai
  -1 siblings, 0 replies; 138+ messages in thread
From: Shan Hai @ 2011-07-17 13:33 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Benjamin Herrenschmidt, paulus, tglx, walken, dhowells, cmetcalf,
	tony.luck, akpm, linuxppc-dev, linux-kernel

On 07/17/2011 07:02 PM, Peter Zijlstra wrote:
> On Sun, 2011-07-17 at 09:49 +1000, Benjamin Herrenschmidt wrote:
>> In the meantime, other than rewriting the futex code to not require
>> those in-atomic accesses (can't it just access the pages via the linear
>> mapping and/or kmap after the gup ?),
> That'll wreck performance on things like ARM and SPARC that have to deal
> with cache aliasing.
>
>>   all I see would be a way to force
>> dirty and young after gup, with appropriate locks, or a variant of gup
>> (via a flag ?) to require it to do so.
> Again, _WHY_ isn't gup(.write=1) a complete write fault? Its supposed to
> be, it needs to break COW, do dirty page tracking and call page_mkwrite.
> I'm still thinking this e500 stuff is smoking crack.
>
> ARM has no hardware dirty bit either, and yet it works for them. I can't
> exactly tell how because I got lost in there, but it does, again,
> suggest e500 is on crack.

Ok, the following feature of the architecture causes failure of
gup(.write=1) on dirtying pages,
- allows pages to be protected from supervisor-mode writes

On ARM you could not protect pages from supervisor-mode writes,
isn't it?  That means, all writable user pages are writable for
supervisor too, but its not hold for at least x86 and powerpc,
x86 and powerpc can be configured to protect pages from
supervisor-mode writes.

Think about the following situation,
a page fault occurs on the kernel trying to write to a writable shared
user page which is read only to the kernel, the following conditions hold,
- the page is *present*, because its a shared page
- the page is *writable*, because demand paging sets up the pte for
     the current process to so

The follow_page() called in the __get_user_page() returns non NULL
to its caller on the above mentioned *present* and *writable* page,
so the gup(.write=1) has no chance to set pte dirty by calling 
handle_mm_fault,
the follow_page() has no knowledge of supervisor-mode write protected pages,
that's the culprit in the bug discussed here.

Thanks
Shan Hai



^ permalink raw reply	[flat|nested] 138+ messages in thread

* Re: [PATCH 1/1] Fixup write permission of TLB on powerpc e500 core
@ 2011-07-17 13:33             ` Shan Hai
  0 siblings, 0 replies; 138+ messages in thread
From: Shan Hai @ 2011-07-17 13:33 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: tony.luck, linux-kernel, cmetcalf, dhowells, paulus, tglx,
	walken, linuxppc-dev, akpm

On 07/17/2011 07:02 PM, Peter Zijlstra wrote:
> On Sun, 2011-07-17 at 09:49 +1000, Benjamin Herrenschmidt wrote:
>> In the meantime, other than rewriting the futex code to not require
>> those in-atomic accesses (can't it just access the pages via the linear
>> mapping and/or kmap after the gup ?),
> That'll wreck performance on things like ARM and SPARC that have to deal
> with cache aliasing.
>
>>   all I see would be a way to force
>> dirty and young after gup, with appropriate locks, or a variant of gup
>> (via a flag ?) to require it to do so.
> Again, _WHY_ isn't gup(.write=1) a complete write fault? Its supposed to
> be, it needs to break COW, do dirty page tracking and call page_mkwrite.
> I'm still thinking this e500 stuff is smoking crack.
>
> ARM has no hardware dirty bit either, and yet it works for them. I can't
> exactly tell how because I got lost in there, but it does, again,
> suggest e500 is on crack.

Ok, the following feature of the architecture causes failure of
gup(.write=1) on dirtying pages,
- allows pages to be protected from supervisor-mode writes

On ARM you could not protect pages from supervisor-mode writes,
isn't it?  That means, all writable user pages are writable for
supervisor too, but its not hold for at least x86 and powerpc,
x86 and powerpc can be configured to protect pages from
supervisor-mode writes.

Think about the following situation,
a page fault occurs on the kernel trying to write to a writable shared
user page which is read only to the kernel, the following conditions hold,
- the page is *present*, because its a shared page
- the page is *writable*, because demand paging sets up the pte for
     the current process to so

The follow_page() called in the __get_user_page() returns non NULL
to its caller on the above mentioned *present* and *writable* page,
so the gup(.write=1) has no chance to set pte dirty by calling 
handle_mm_fault,
the follow_page() has no knowledge of supervisor-mode write protected pages,
that's the culprit in the bug discussed here.

Thanks
Shan Hai

^ permalink raw reply	[flat|nested] 138+ messages in thread

* Re: [PATCH 1/1] Fixup write permission of TLB on powerpc e500 core
  2011-07-17  9:38           ` Peter Zijlstra
@ 2011-07-17 14:29             ` Benjamin Herrenschmidt
  -1 siblings, 0 replies; 138+ messages in thread
From: Benjamin Herrenschmidt @ 2011-07-17 14:29 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Shan Hai, Peter Zijlstra, paulus, tglx, walken, dhowells,
	cmetcalf, tony.luck, akpm, linuxppc-dev, linux-kernel

On Sun, 2011-07-17 at 11:38 +0200, Peter Zijlstra wrote:

> Whats this talk about interrupt context? There is none of that involved.

Ok, let's not mix things here. I was going too fast and may have murkied
the waters. First let's get back to the futex code issue with gup:

> Furthermore, I still dont see the problem. The futex code is
> optimistically trying to poke at user memory while holding spinlocks.
> 
> We fully expect that to fail, hence the error path that drops all
> locks and does the gup(.write=1) to fix up the mapping after which we
> try again.

See below

> This has worked for years, its by no means new code. Nor do I see how
> its broken.

No it hasn't worked for years, it's been broken for years on some archs
but nobody noticed :-)

The problem I see is that gup doesn't set dirty (or young) itself. It
requires the caller to set dirty before releasing the pages basically
and there's no provision for young. Afaik, callers set it directly in
the struct page and not the PTE too (which means a spurious fault on
subsequent access for archs that do dirty tracking).

On archs where dirty and young are SW tracked, pages that don't have
them set in the PTE will fault on access. The lack of dirty means the
page is effectively going to be read-only and the lack of young means
the page will be inaccessible.

gup itself isn't meant to fix those, at least the way it's been used so
far, the caller of gup is.

Thus the only "proper" way to fix that is to have the futex code itself
perform dirty and accessed updates, which sucks (means going back down
the page tables, taking the PTL and doing the deed).

Having the actual fault handlers do the fixup even when in atomic isn't
a good option for us:

I don't know what other archs that rely on that SW tracking do, but in
the case of powerpc, that would be problematic due to the fact that
those archs have been written with the assumption that all changes to
PTEs are done under the PTL (which allows to simplify code and thus make
things faster).

Among others, that also means no changes at interrupt time. Enabling our
fault handlers to update the dirty & young bits even while in "atomic"
context would potentially open the door to things like interrupt-time
perf backtraces to cause PTE updates, etc... (in addition to generally
breaking the locking rules for PTE modifications).

Now, I suppose we -could- differentiate preempt disabled from real
interrupt time in the fault handlers, tho that's somewhat sucky. It
might require moving the dirty/young updates from generic code to arch
code, I suppose.

I'm also not sure how risky it would be to take the PTL in that case...
code doing user accesses within pagefault_disable() might be written
with the assumption that the PTL won't be taken (it might be already
held ? I don't know what all the users are at this point, too late to
grep, I can have a look tomorrow). It makes me a bit nervous too.

A better approach might be a flag to pass to gup (via the "write"
argument ? top bits ?) to tell it to immediately perform dirty/young
updates.

Cheers,
Ben.



^ permalink raw reply	[flat|nested] 138+ messages in thread

* Re: [PATCH 1/1] Fixup write permission of TLB on powerpc e500 core
@ 2011-07-17 14:29             ` Benjamin Herrenschmidt
  0 siblings, 0 replies; 138+ messages in thread
From: Benjamin Herrenschmidt @ 2011-07-17 14:29 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: tony.luck, Peter Zijlstra, Shan Hai, linux-kernel, cmetcalf,
	dhowells, paulus, tglx, walken, linuxppc-dev, akpm

On Sun, 2011-07-17 at 11:38 +0200, Peter Zijlstra wrote:

> Whats this talk about interrup context? There is non of that involved.

Ok, let's not mix things here. I was going too fast and may have muddied
the waters. First let's get back to the futex code issue with gup:

> Furthermore, I still dont see the problem. The futex code is
> optimistically trying to poke at user memory while holding spinlocks.
> 
> We fully expect that to fail, hence the error path that drops all
> locks and does the gup(.write=1) to fix up the mapping after which we
> try again.

See below

> This has worked for years, its by no means new code. Nor do I see how
> its broken.

No it hasn't worked for years, it's been broken for years on some archs
but nobody noticed :-)

The problem I see is that gup doesn't set dirty (or young) itself. It
requires the caller to set dirty before releasing the pages basically
and there's no provision for young. Afaik, callers set it directly in
the struct page and not the PTE too (which means a spurious fault on
subsequent access for archs that do dirty tracking).

On archs where dirty and young are SW tracked, pages that don't have
them set in the PTE will fault on access. The lack of dirty means the
page is effectively going to be read-only and the lack of young means
the page will be inaccessible.

gup itself isn't meant to fix those, at least the way it's been used so
far; the caller of gup is.

Thus the only "proper" way to fix that is to have the futex code itself
perform dirty and accessed updates, which sucks (means going back down
the page tables, taking the PTL and doing the deed).

Having the actual fault handlers do the fixup even when in atomic isn't
a good option for us:

I don't know what other archs that rely on that SW tracking do, but in
the case of powerpc, that would be problematic due to the fact that
those archs have been written with the assumption that all changes to
PTEs are done under the PTL (which allows to simplify code and thus make
things faster).

Among others, that also means no changes at interrupt time. Enabling our
fault handlers to update the dirty & young bits even while in "atomic"
context would potentially open the door to things like interrupt-time
perf backtraces to cause PTE updates, etc... (in addition to generally
breaking the locking rules for PTE modifications).

Now, I suppose we -could- differentiate preempt disabled from real
interrupt time in the fault handlers, tho that's somewhat sucky. It
might require moving the dirty/young updates from generic code to arch
code, I suppose.

I'm also not sure how risky it would be to take the PTL in that case...
code doing user accesses within pagefault_disable() might be written
with the assumption that the PTL won't be taken (it might be already
held ? I don't know what all the users are at this point, too late to
grep, I can have a look tomorrow). It makes me a bit nervous too.

A better approach might be a flag to pass to gup (via the "write"
argument ? top bits ?) to tell it to immediately perform dirty/young
updates.

Cheers,
Ben.

^ permalink raw reply	[flat|nested] 138+ messages in thread

* Re: [PATCH 1/1] Fixup write permission of TLB on powerpc e500 core
  2011-07-17 11:02           ` Peter Zijlstra
@ 2011-07-17 14:34             ` Benjamin Herrenschmidt
  -1 siblings, 0 replies; 138+ messages in thread
From: Benjamin Herrenschmidt @ 2011-07-17 14:34 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Shan Hai, paulus, tglx, walken, dhowells, cmetcalf, tony.luck,
	akpm, linuxppc-dev, linux-kernel

On Sun, 2011-07-17 at 13:02 +0200, Peter Zijlstra wrote:
> 
> Again, _WHY_ isn't gup(.write=1) a complete write fault? Its supposed to
> be, it needs to break COW, do dirty page tracking and call page_mkwrite.
> I'm still thinking this e500 stuff is smoking crack.
> 
> ARM has no hardware dirty bit either, and yet it works for them. I can't
> exactly tell how because I got lost in there, but it does, again,
> suggest e500 is on crack. 

Because gup won't set dirty for a page that is already writable but
whose dirty bit has been "harvested" by the VM already. Same with young.

Maybe nobody sees it on ARM because nobody ever swaps on it ? :-) Or
they have some different way of handling dirty/young updates.. I don't
know.

e500 isn't the only one who will be affected. All the non-hash powerpc
subarchs will (I wrote a lot of that mm code so it's all my fault :-)
That includes 4xx and 64-bit BookE.

Cheers,
Ben.



^ permalink raw reply	[flat|nested] 138+ messages in thread

* Re: [PATCH 1/1] Fixup write permission of TLB on powerpc e500 core
@ 2011-07-17 14:34             ` Benjamin Herrenschmidt
  0 siblings, 0 replies; 138+ messages in thread
From: Benjamin Herrenschmidt @ 2011-07-17 14:34 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: tony.luck, Shan Hai, linux-kernel, cmetcalf, dhowells, paulus,
	tglx, walken, linuxppc-dev, akpm

On Sun, 2011-07-17 at 13:02 +0200, Peter Zijlstra wrote:
> 
> Again, _WHY_ isn't gup(.write=1) a complete write fault? Its supposed to
> be, it needs to break COW, do dirty page tracking and call page_mkwrite.
> I'm still thinking this e500 stuff is smoking crack.
> 
> ARM has no hardware dirty bit either, and yet it works for them. I can't
> exactly tell how because I got lost in there, but it does, again,
> suggest e500 is on crack. 

Because gup won't set dirty for a page that is already writable but
whose dirty bit has been "harvested" by the VM already. Same with young.

Maybe nobody sees it on ARM because nobody ever swaps on it ? :-) Or
they have some different way of handling dirty/young updates.. I don't
know.

e500 isn't the only one who will be affected. All the non-hash powerpc
subarchs will (I wrote a lot of that mm code so it's all my fault :-)
That includes 4xx and 64-bit BookE.

Cheers,
Ben.

^ permalink raw reply	[flat|nested] 138+ messages in thread

* Re: [PATCH 1/1] Fixup write permission of TLB on powerpc e500 core
  2011-07-17 13:33             ` Shan Hai
@ 2011-07-17 14:48               ` Benjamin Herrenschmidt
  -1 siblings, 0 replies; 138+ messages in thread
From: Benjamin Herrenschmidt @ 2011-07-17 14:48 UTC (permalink / raw)
  To: Shan Hai
  Cc: Peter Zijlstra, paulus, tglx, walken, dhowells, cmetcalf,
	tony.luck, akpm, linuxppc-dev, linux-kernel

On Sun, 2011-07-17 at 21:33 +0800, Shan Hai wrote:
> 
> On ARM you could not protect pages from supervisor-mode writes,
> isn't it?  That means, all writable user pages are writable for
> supervisor too, but its not hold for at least x86 and powerpc,
> x86 and powerpc can be configured to protect pages from
> supervisor-mode writes.

That doesn't sound right... how would put_user() work properly then ? A
cursory glance at the ARM code doesn't show it doing anything "special",
just stores ... but I might be missing something.

> Think about the following situation,
> a page fault occurs on the kernel trying to write to a writable shared
> user page which is read only to the kernel, the following conditions
> hold,
> - the page is *present*, because its a shared page
> - the page is *writable*, because demand paging sets up the pte for
>      the current process to so
> 
> The follow_page() called in the __get_user_page() returns non NULL
> to its caller on the above mentioned *present* and *writable* page,
> so the gup(.write=1) has no chance to set pte dirty by calling 
> handle_mm_fault,
> the follow_page() has no knowledge of supervisor-mode write protected
> pages,
> that's the culprit in the bug discussed here. 

Right, the problem is with writable pages that have "lost" (or never had
but usually it's lost, due to swapping for example) their dirty bit, or
any page that has lost young.

From what I can tell, we need to either fix those bits from the caller
of gup (futex code), which sounds nasty, or more easily fix those from
gup itself, possibly under control of flags in the "write" argument to
avoid breaking code relying on the existing behaviour, especially vs.
dirty.

Cheers,
Ben.



^ permalink raw reply	[flat|nested] 138+ messages in thread

* Re: [PATCH 1/1] Fixup write permission of TLB on powerpc e500 core
@ 2011-07-17 14:48               ` Benjamin Herrenschmidt
  0 siblings, 0 replies; 138+ messages in thread
From: Benjamin Herrenschmidt @ 2011-07-17 14:48 UTC (permalink / raw)
  To: Shan Hai
  Cc: tony.luck, Peter Zijlstra, linux-kernel, cmetcalf, dhowells,
	paulus, tglx, walken, linuxppc-dev, akpm

On Sun, 2011-07-17 at 21:33 +0800, Shan Hai wrote:
> 
> On ARM you could not protect pages from supervisor-mode writes,
> isn't it?  That means, all writable user pages are writable for
> supervisor too, but its not hold for at least x86 and powerpc,
> x86 and powerpc can be configured to protect pages from
> supervisor-mode writes.

That doesn't sound right... how would put_user() work properly then ? A
cursory glance at the ARM code doesn't show it doing anything "special",
just stores ... but I might be missing something.

> Think about the following situation,
> a page fault occurs on the kernel trying to write to a writable shared
> user page which is read only to the kernel, the following conditions
> hold,
> - the page is *present*, because its a shared page
> - the page is *writable*, because demand paging sets up the pte for
>      the current process to so
> 
> The follow_page() called in the __get_user_page() returns non NULL
> to its caller on the above mentioned *present* and *writable* page,
> so the gup(.write=1) has no chance to set pte dirty by calling 
> handle_mm_fault,
> the follow_page() has no knowledge of supervisor-mode write protected
> pages,
> that's the culprit in the bug discussed here. 

Right, the problem is with writable pages that have "lost" (or never had
but usually it's lost, due to swapping for example) their dirty bit, or
any page that has lost young.

From what I can tell, we need to either fix those bits from the caller
of gup (futex code), which sounds nasty, or more easily fix those from
gup itself, possibly under control of flags in the "write" argument to
avoid breaking code relying on the existing behaviour, especially vs.
dirty.

Cheers,
Ben.

^ permalink raw reply	[flat|nested] 138+ messages in thread

* Re: [PATCH 1/1] Fixup write permission of TLB on powerpc e500 core
  2011-07-17 14:48               ` Benjamin Herrenschmidt
@ 2011-07-17 15:40                 ` Shan Hai
  -1 siblings, 0 replies; 138+ messages in thread
From: Shan Hai @ 2011-07-17 15:40 UTC (permalink / raw)
  To: Benjamin Herrenschmidt
  Cc: Peter Zijlstra, paulus, tglx, walken, dhowells, cmetcalf,
	tony.luck, akpm, linuxppc-dev, linux-kernel

On 07/17/2011 10:48 PM, Benjamin Herrenschmidt wrote:
> On Sun, 2011-07-17 at 21:33 +0800, Shan Hai wrote:
>> On ARM you could not protect pages from supervisor-mode writes,
>> isn't it?  That means, all writable user pages are writable for
>> supervisor too, but its not hold for at least x86 and powerpc,
>> x86 and powerpc can be configured to protect pages from
>> supervisor-mode writes.
> That doesn't sound right... how would put_user() work properly then ? A
> cursory glance at the ARM code doesn't show it doing anything "special",
> just stores ... but I might have missing something.
>

That's true for ARM. The reason put_user() works properly is that
the first access to the write-protected page triggers a page
fault, and handle_mm_fault() then fixes up the write permission
for the kernel, because at that point no one has disabled page faults
the way the futex code does.

>> Think about the following situation,
>> a page fault occurs on the kernel trying to write to a writable shared
>> user page which is read only to the kernel, the following conditions
>> hold,
>> - the page is *present*, because its a shared page
>> - the page is *writable*, because demand paging sets up the pte for
>>       the current process to so
>>
>> The follow_page() called in the __get_user_page() returns non NULL
>> to its caller on the above mentioned *present* and *writable* page,
>> so the gup(.write=1) has no chance to set pte dirty by calling
>> handle_mm_fault,
>> the follow_page() has no knowledge of supervisor-mode write protected
>> pages,
>> that's the culprit in the bug discussed here.
> Right, the problem is with writable pages that have "lost" (or never had
> but usually it's lost, due to swapping for example) their dirty bit, or
> any page that has lost young.
>
>  From what I can tell, we need to either fix those bits from the caller
> of gup (futex code), which sound nasty, or more easily fix those from
> gup itself, possibly under control of flags in the "write" argument to
> avoid breaking code relying on the existing behaviour, expecially vs.
> dirty.
>

So, because SW-tracked dirty/young bits and supervisor-protected
pages potentially affect not only *futex* but also other components
of the kernel which might access a non-dirty supervisor-protected page,
in my opinion it might be more sensible to fix this in gup instead of fixing
it in the futex code.

Thanks
Shan Hai

> Cheers,
> Ben.
>
>


^ permalink raw reply	[flat|nested] 138+ messages in thread

* Re: [PATCH 1/1] Fixup write permission of TLB on powerpc e500 core
@ 2011-07-17 15:40                 ` Shan Hai
  0 siblings, 0 replies; 138+ messages in thread
From: Shan Hai @ 2011-07-17 15:40 UTC (permalink / raw)
  To: Benjamin Herrenschmidt
  Cc: tony.luck, Peter Zijlstra, linux-kernel, cmetcalf, dhowells,
	paulus, tglx, walken, linuxppc-dev, akpm

On 07/17/2011 10:48 PM, Benjamin Herrenschmidt wrote:
> On Sun, 2011-07-17 at 21:33 +0800, Shan Hai wrote:
>> On ARM you could not protect pages from supervisor-mode writes,
>> isn't it?  That means, all writable user pages are writable for
>> supervisor too, but its not hold for at least x86 and powerpc,
>> x86 and powerpc can be configured to protect pages from
>> supervisor-mode writes.
> That doesn't sound right... how would put_user() work properly then ? A
> cursory glance at the ARM code doesn't show it doing anything "special",
> just stores ... but I might have missing something.
>

That's true for ARM. The reason put_user() works properly is that
the first access to the write-protected page triggers a page
fault, and handle_mm_fault() then fixes up the write permission
for the kernel, because at that point no one has disabled page faults
the way the futex code does.

>> Think about the following situation,
>> a page fault occurs on the kernel trying to write to a writable shared
>> user page which is read only to the kernel, the following conditions
>> hold,
>> - the page is *present*, because its a shared page
>> - the page is *writable*, because demand paging sets up the pte for
>>       the current process to so
>>
>> The follow_page() called in the __get_user_page() returns non NULL
>> to its caller on the above mentioned *present* and *writable* page,
>> so the gup(.write=1) has no chance to set pte dirty by calling
>> handle_mm_fault,
>> the follow_page() has no knowledge of supervisor-mode write protected
>> pages,
>> that's the culprit in the bug discussed here.
> Right, the problem is with writable pages that have "lost" (or never had
> but usually it's lost, due to swapping for example) their dirty bit, or
> any page that has lost young.
>
>  From what I can tell, we need to either fix those bits from the caller
> of gup (futex code), which sound nasty, or more easily fix those from
> gup itself, possibly under control of flags in the "write" argument to
> avoid breaking code relying on the existing behaviour, expecially vs.
> dirty.
>

So, because SW-tracked dirty/young bits and supervisor-protected
pages potentially affect not only *futex* but also other components
of the kernel which might access a non-dirty supervisor-protected page,
in my opinion it might be more sensible to fix this in gup instead of fixing
it in the futex code.

Thanks
Shan Hai

> Cheers,
> Ben.
>
>

^ permalink raw reply	[flat|nested] 138+ messages in thread

* Re: [PATCH 1/1] Fixup write permission of TLB on powerpc e500 core
  2011-07-17 15:40                 ` Shan Hai
@ 2011-07-17 22:34                   ` Benjamin Herrenschmidt
  -1 siblings, 0 replies; 138+ messages in thread
From: Benjamin Herrenschmidt @ 2011-07-17 22:34 UTC (permalink / raw)
  To: Shan Hai
  Cc: Peter Zijlstra, paulus, tglx, walken, dhowells, cmetcalf,
	tony.luck, akpm, linuxppc-dev, linux-kernel

On Sun, 2011-07-17 at 23:40 +0800, Shan Hai wrote:
> On 07/17/2011 10:48 PM, Benjamin Herrenschmidt wrote:
> > On Sun, 2011-07-17 at 21:33 +0800, Shan Hai wrote:
> >> On ARM you could not protect pages from supervisor-mode writes,
> >> isn't it?  That means, all writable user pages are writable for
> >> supervisor too, but its not hold for at least x86 and powerpc,
> >> x86 and powerpc can be configured to protect pages from
> >> supervisor-mode writes.
> > That doesn't sound right... how would put_user() work properly then ? A
> > cursory glance at the ARM code doesn't show it doing anything "special",
> > just stores ... but I might have missing something.
> >
> 
> That's real for ARM, for the reason put_user() work properly is that
> the first time access to the write protected page triggers a page
> fault, and the handle_mm_fault() will fix up the write permission
> for the kernel, because at this time no one disabled the page fault
> as done in the futex case.

Sorry, you don't make sense here, you first say that ARM cannot protect
pages from supervisor writes, and then you write that put_user() will
work because it triggers a page fault :-) Those are mutually exclusive.

If you have a read-only PTE present, put_user() will trigger a page
fault on writes because the supervisor sees the same "write" protection
as userspace, for user pages that is, at least that's how it works on
most archs and I didn't think ARM was any different.

Note that things are different for -kernel- pages (ie, linear mapping,
vmalloc, etc...) for some archs. For example, on hash-table based
powerpc MMUs, it's not always possible to create a kernel-only read-only
mapping. But user mappings (below PAGE_OFFSET) always honor userspace
protections regardless of whether the CPU is in supervisor or user mode.

Anyway, we are getting on a side track here. Let's sort out our original
futex problem first.

> So, for the reason the SW tracked dirty/young and supervisor protected
> pages has potential effects on not only *futex* but also on other components
> of the kernel which might access the non-dirty supervisor protected page,
> in my opinion it might be more sensible to fix it from gup instead of fixing
> it in the futex.

Well, again it depends. First let's not use "supervisor protected" here.
Those pages are user pages. The supervisor just honors the user
protection from what I can tell. Real "supervisor protected" (such as
read-only kernel text pages etc...) are a completely different beast.

Secondly, we don't need anything special for the "normal" user access
cases, which are get/put_user, copy_tofrom_user, or gup followed with
access to the pages directly via the linear mapping, kmap, etc...

The problem is specific to code paths doing user accesses within
pagefault_disable() sections -and- expecting to use gup to "fixup"
things when they fail.

Do we have many more than futex here ?

I -do- tend to prefer the fix inside gup approach for different reasons,
simply because gup already walks the page tables (well, follow_page()
does) and we already have usable "flags" arguments to both gup and
follow_page() that we can hijack to add our "update dirty & young now"
attribute.

So it should be a simple patch, provided Peter etc... agree in principle
with the approach.

Cheers,
Ben.

> Thanks
> Shan Hai
> 
> > Cheers,
> > Ben.
> >
> >
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at  http://www.tux.org/lkml/



^ permalink raw reply	[flat|nested] 138+ messages in thread

* Re: [PATCH 1/1] Fixup write permission of TLB on powerpc e500 core
@ 2011-07-17 22:34                   ` Benjamin Herrenschmidt
  0 siblings, 0 replies; 138+ messages in thread
From: Benjamin Herrenschmidt @ 2011-07-17 22:34 UTC (permalink / raw)
  To: Shan Hai
  Cc: tony.luck, Peter Zijlstra, linux-kernel, cmetcalf, dhowells,
	paulus, tglx, walken, linuxppc-dev, akpm

On Sun, 2011-07-17 at 23:40 +0800, Shan Hai wrote:
> On 07/17/2011 10:48 PM, Benjamin Herrenschmidt wrote:
> > On Sun, 2011-07-17 at 21:33 +0800, Shan Hai wrote:
> >> On ARM you could not protect pages from supervisor-mode writes,
> >> isn't it?  That means, all writable user pages are writable for
> >> supervisor too, but its not hold for at least x86 and powerpc,
> >> x86 and powerpc can be configured to protect pages from
> >> supervisor-mode writes.
> > That doesn't sound right... how would put_user() work properly then ? A
> > cursory glance at the ARM code doesn't show it doing anything "special",
> > just stores ... but I might have missing something.
> >
> 
> That's real for ARM, for the reason put_user() work properly is that
> the first time access to the write protected page triggers a page
> fault, and the handle_mm_fault() will fix up the write permission
> for the kernel, because at this time no one disabled the page fault
> as done in the futex case.

Sorry, you don't make sense here, you first say that ARM cannot protect
pages from supervisor writes, and then you write that put_user() will
work because it triggers a page fault :-) Those are mutually exclusive.

If you have a read-only PTE present, put_user() will trigger a page
fault on writes because the supervisor sees the same "write" protection
as userspace, for user pages that is, at least that's how it works on
most archs and I didn't think ARM was any different.

Note that things are different for -kernel- pages (ie, linear mapping,
vmalloc, etc...) for some archs. For example, on hash-table based
powerpc MMUs, it's not always possible to create a kernel-only read-only
mapping. But user mappings (below PAGE_OFFSET) always honor userspace
protections regardless of whether the CPU is in supervisor or user mode.

Anyway, we are getting on a side track here. Let's sort out our original
futex problem first.

> So, for the reason the SW tracked dirty/young and supervisor protected
> pages has potential effects on not only *futex* but also on other components
> of the kernel which might access the non-dirty supervisor protected page,
> in my opinion it might be more sensible to fix it from gup instead of fixing
> it in the futex.

Well, again it depends. First let's not use "supervisor protected" here.
Those pages are user pages. The supervisor just honors the user
protection from what I can tell. Real "supervisor protected" (such as
read-only kernel text pages etc...) are a completely different beast.

Secondly, we don't need anything special for the "normal" user access
cases, which are get/put_user, copy_tofrom_user, or gup followed with
access to the pages directly via the linear mapping, kmap, etc...

The problem is specific to code paths doing user accesses within
pagefault_disable() sections -and- expecting to use gup to "fixup"
things when they fail.

Do we have many more than futex here ?

I -do- tend to prefer the fix inside gup approach for different reasons,
simply because gup already walks the page tables (well, follow_page()
does) and we already have usable "flags" arguments to both gup and
follow_page() that we can hijack to add our "update dirty & young now"
attribute.

So it should be a simple patch, provided Peter etc... agree in principle
with the approach.

Cheers,
Ben.

> Thanks
> Shan Hai
> 
> > Cheers,
> > Ben.
> >
> >
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at  http://www.tux.org/lkml/

^ permalink raw reply	[flat|nested] 138+ messages in thread

* Re: [PATCH 1/1] Fixup write permission of TLB on powerpc e500 core
  2011-07-17 14:29             ` Benjamin Herrenschmidt
@ 2011-07-17 23:14               ` Benjamin Herrenschmidt
  -1 siblings, 0 replies; 138+ messages in thread
From: Benjamin Herrenschmidt @ 2011-07-17 23:14 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Shan Hai, Peter Zijlstra, paulus, tglx, walken, dhowells,
	cmetcalf, tony.luck, akpm, linuxppc-dev, linux-kernel

On Mon, 2011-07-18 at 00:29 +1000, Benjamin Herrenschmidt wrote:

> A better approach might be a flag to pass to gup (via the "write"
> argument ? top bits ?) to tell it to immediately perform dirty/young
> updates.

So I dug a bit now that it's not 1am anymore :-)

Looks like gup changed a lot since I last looked. In fact, it already
has a very similar logic to what I want, with FOLL_TOUCH (which is set
by gup always and passed to __gup):

	if (flags & FOLL_TOUCH) {
		if ((flags & FOLL_WRITE) &&
		    !pte_dirty(pte) && !PageDirty(page))
			set_page_dirty(page);
		/*
		 * pte_mkyoung() would be more correct here, but atomic care
		 * is needed to avoid losing the dirty bit: it is easier to use
		 * mark_page_accessed().
		 */
		mark_page_accessed(page);
	}

The problem here, is that we assume that having the struct page bits is
enough and we don't bother setting the PTE for either bits.

The problem with setting the PTE here is that while it would be
perfectly ok to do so under the PTL for archs that maintain dirty and
young in SW, for archs that do it in HW, this needs to be done in a way
that will be atomic vs. potential concurrent HW updates.

This could be done, I believe by using ptep_set_access_flags() but that
would be a waste on things like x86 or hash based powerpc who don't need
the PTE to be updated (x86 because of HW dirty/young updates, hash based
powerpc because our hash code does the updates and so looks to Linux
like it is HW updates).

At this point, I believe, we need to introduce a different behaviour
between architectures depending on how their mm works.

Peter, what do you reckon ? We could just have an
_ARCH_NEEDS_GUP_PTE_UPDATES and call ptep_set_access_flags() on those, I
believe that would be enough (i.e., it would mimic what
handle_pte_fault() does to do the updates).

Something (not even compile tested) like:

diff --git a/mm/memory.c b/mm/memory.c
index 40b7531..32024ac 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1515,6 +1515,17 @@ split_fallthrough:
 	if (flags & FOLL_GET)
 		get_page(page);
 	if (flags & FOLL_TOUCH) {
+#ifdef _ARCH_NEEDS_GUP_PTE_UPDATES
+		if (!pte_young(pte) ||
+		    ((flags & FOLL_WRITE) && !pte_dirty(pte))) {
+			pte_t new_pte = pte_mkyoung(pte);
+
+			if (flags & FOLL_WRITE)
+				new_pte = pte_mkdirty(pte);
+			ptep_set_access_flags(vma, address, ptep, pte,
+					      flags & FOLL_WRITE);
+		}
+#else
 		if ((flags & FOLL_WRITE) &&
 		    !pte_dirty(pte) && !PageDirty(page))
 			set_page_dirty(page);
@@ -1524,6 +1535,7 @@ split_fallthrough:
 		 * mark_page_accessed().
 		 */
 		mark_page_accessed(page);
+#endif
 	}
 	if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) {
 		/*

I thought we could try to factor the young/dirty update from handle_pte_fault
into a separate function and call it there, but I'm not sure whether we want
gup to go to the else case in there for flushing spurious mappings... actually,
thinking about it:

That leads to another potential issue with the way we use gup
here to "fixup" atomic user access (ie, fake fault)... this call to
flush_tlb_fix_spurious_fault(), I'm not entirely certain what it's
doing, ie, it shouldn't be necessary on powerpc and is #ifdef'ed out on
x86, but I suppose -some- archs at least may lazily fixup permissions in a way
that requires it.

That means that an arch that needs that fixup will potentially also break with
the way the futex code relies on gup to do the faulting, since here too, we are
in a situation where gup -will- find a valid struct page and valid write
permission, but some kind of fixup is still needed.

It looks like a more robust fix would be to indeed factor out that code from
handle_pte_fault() and call it from gup as well, at least if the arch requires
it (and we can make "safe" archs like x86 not require it).

We do want to avoid that spurious-mapping flush on common gup calls, however;
it's going to be a killer. That means we need to inform gup that it's been
called in order to fixup a previously EFAULT'ing atomic user access, and
thus that we require it to perform all the necessary fixups.

In fact, with such a flag, we could probably avoid the ifdef entirely, and
always go toward the PTE fixup path when called in such a fixup case, my gut
feeling is that this is going to be seldom enough not to hurt x86 measurably
but we'll have to try it out.

That leads to that even less tested patch:

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 9670f71..8a76694 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1546,6 +1546,7 @@ struct page *follow_page(struct vm_area_struct *, unsigned long address,
 #define FOLL_MLOCK	0x40	/* mark page as mlocked */
 #define FOLL_SPLIT	0x80	/* don't return transhuge pages, split them */
 #define FOLL_HWPOISON	0x100	/* check page is hwpoisoned */
+#define FOLL_FIXFAULT	0x200	/* fixup after a fault (PTE dirty/young upd) */
 
 typedef int (*pte_fn_t)(pte_t *pte, pgtable_t token, unsigned long addr,
 			void *data);
diff --git a/kernel/futex.c b/kernel/futex.c
index fe28dc2..7480a93 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -355,8 +355,8 @@ static int fault_in_user_writeable(u32 __user *uaddr)
 	int ret;
 
 	down_read(&mm->mmap_sem);
-	ret = get_user_pages(current, mm, (unsigned long)uaddr,
-			     1, 1, 0, NULL, NULL);
+	ret = __get_user_pages(current, mm, (unsigned long)uaddr,
+			       FOLL_WRITE | FOLL_FIXFAULT, NULL, NULL, NULL);
 	up_read(&mm->mmap_sem);
 
 	return ret < 0 ? ret : 0;
diff --git a/mm/memory.c b/mm/memory.c
index 40b7531..c61fddc 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1419,6 +1419,29 @@ int zap_vma_ptes(struct vm_area_struct *vma, unsigned long address,
 }
 EXPORT_SYMBOL_GPL(zap_vma_ptes);
 
+static void handle_pte_sw_young_dirty(struct vm_area_struct *vma,
+				      unsigned long address,
+				      pte_t *ptep, int write)
+{
+	pte_t entry = *ptep;
+
+	if (write)
+		pte_mkdirty(entry);
+	entry = pte_mkyoung(entry);
+	if (ptep_set_access_flags(vma, address, pte, entry, write)) {
+		update_mmu_cache(vma, address, pte);
+	} else if (fault) {
+		/*
+		 * This is needed only for protection faults but the arch code
+		 * is not yet telling us if this is a protection fault or not.
+		 * This still avoids useless tlb flushes for .text page faults
+		 * with threads.
+		 */
+		if (write)
+			flush_tlb_fix_spurious_fault(vma, address);
+	}
+}
+
 /**
  * follow_page - look up a page descriptor from a user-virtual address
  * @vma: vm_area_struct mapping @address
@@ -1514,16 +1537,22 @@ split_fallthrough:
 
 	if (flags & FOLL_GET)
 		get_page(page);
-	if (flags & FOLL_TOUCH) {
-		if ((flags & FOLL_WRITE) &&
-		    !pte_dirty(pte) && !PageDirty(page))
-			set_page_dirty(page);
-		/*
-		 * pte_mkyoung() would be more correct here, but atomic care
-		 * is needed to avoid losing the dirty bit: it is easier to use
-		 * mark_page_accessed().
-		 */
-		mark_page_accessed(page);
+
+	if (!pte_young(pte) || ((flags & FOLL_WRITE) && !pte_dirty(pte))) {
+		if (flags & FOLL_FIXFAULT)
+			handle_pte_sw_young_dirty(vma, address, ptep,
+						  flags & FOLL_WRITE);
+		else if (flags & FOLL_TOUCH) {
+			if ((flags & FOLL_WRITE) &&
+			    !pte_dirty(pte) && !PageDirty(page))
+				set_page_dirty(page);
+			/*
+			 * pte_mkyoung() would be more correct here, but atomic care
+			 * is needed to avoid losing the dirty bit: it is easier to use
+			 * mark_page_accessed().
+			 */
+			mark_page_accessed(page);
+		}
 	}
 	if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) {
 		/*
@@ -3358,21 +3387,8 @@ int handle_pte_fault(struct mm_struct *mm,
 		if (!pte_write(entry))
 			return do_wp_page(mm, vma, address,
 					pte, pmd, ptl, entry);
-		entry = pte_mkdirty(entry);
-	}
-	entry = pte_mkyoung(entry);
-	if (ptep_set_access_flags(vma, address, pte, entry, flags & FAULT_FLAG_WRITE)) {
-		update_mmu_cache(vma, address, pte);
-	} else {
-		/*
-		 * This is needed only for protection faults but the arch code
-		 * is not yet telling us if this is a protection fault or not.
-		 * This still avoids useless tlb flushes for .text page faults
-		 * with threads.
-		 */
-		if (flags & FAULT_FLAG_WRITE)
-			flush_tlb_fix_spurious_fault(vma, address);
 	}
+	handle_pte_sw_young_dirty(vma, address, pte, flags & FAULT_FLAG_WRITE);
 unlock:
 	pte_unmap_unlock(pte, ptl);
 	return 0;

Any comment ?

Cheers,
Ben.



^ permalink raw reply	[flat|nested] 138+ messages in thread

* Re: [PATCH 1/1] Fixup write permission of TLB on powerpc e500 core
@ 2011-07-17 23:14               ` Benjamin Herrenschmidt
  0 siblings, 0 replies; 138+ messages in thread
From: Benjamin Herrenschmidt @ 2011-07-17 23:14 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: tony.luck, Peter Zijlstra, Shan Hai, linux-kernel, cmetcalf,
	dhowells, paulus, tglx, walken, linuxppc-dev, akpm

On Mon, 2011-07-18 at 00:29 +1000, Benjamin Herrenschmidt wrote:

> A better approach might be a flag to pass to gup (via the "write"
> argument ? top bits ?) to tell it to immediately perform dirty/young
> updates.

So I dug a bit now that it's not 1am anymore :-)

Looks like gup changed a lot since I last looked. In fact, it already
has a very similar logic to what I want, with FOLL_TOUCH (which is set
by gup always and passed to __gup):

	if (flags & FOLL_TOUCH) {
		if ((flags & FOLL_WRITE) &&
		    !pte_dirty(pte) && !PageDirty(page))
			set_page_dirty(page);
		/*
		 * pte_mkyoung() would be more correct here, but atomic care
		 * is needed to avoid losing the dirty bit: it is easier to use
		 * mark_page_accessed().
		 */
		mark_page_accessed(page);
	}

The problem here, is that we assume that having the struct page bits is
enough and we don't bother setting the PTE for either bits.

The problem with setting the PTE here is that while it would be
perfectly ok to do so under the PTL for archs that maintain dirty and
young in SW, for archs that do it in HW, this needs to be done in a way
that will be atomic vs. potential concurrent HW updates.

This could be done, I believe by using ptep_set_access_flags() but that
would be a waste on things like x86 or hash based powerpc who don't need
the PTE to be updated (x86 because of HW dirty/young updates, hash based
powerpc because our hash code does the updates and so looks to Linux
like it is HW updates).

At this point, I believe, we need to introduce a different behaviour
between architectures depending on how their mm works.

Peter, what do you reckon ? We could just have an
_ARCH_NEEDS_GUP_PTE_UPDATES and call ptep_set_access_flags() on those, I
believe that would be enough (i.e., it would mimic what
handle_pte_fault() does to do the updates).

Something (not even compile tested) like:

diff --git a/mm/memory.c b/mm/memory.c
index 40b7531..32024ac 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1515,6 +1515,17 @@ split_fallthrough:
 	if (flags & FOLL_GET)
 		get_page(page);
 	if (flags & FOLL_TOUCH) {
+#ifdef _ARCH_NEEDS_GUP_PTE_UPDATES
+		if (!pte_young(pte) ||
+		    ((flags & FOLL_WRITE) && !pte_dirty(pte))) {
+			pte_t new_pte = pte_mkyoung(pte);
+
+			if (flags & FOLL_WRITE)
+				new_pte = pte_mkdirty(pte);
+			ptep_set_access_flags(vma, address, ptep, pte,
+					      flags & FOLL_WRITE);
+		}
+#else
 		if ((flags & FOLL_WRITE) &&
 		    !pte_dirty(pte) && !PageDirty(page))
 			set_page_dirty(page);
@@ -1524,6 +1535,7 @@ split_fallthrough:
 		 * mark_page_accessed().
 		 */
 		mark_page_accessed(page);
+#endif
 	}
 	if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) {
 		/*

I thought we could try to factor the young/dirty update from handle_pte_fault
into a separate function and call it there, but I'm not sure whether we want
gup to go to the else case in there for flushing spurious mappings.... actually,
thinking about it:

That leads to another potential issue with the way we use gup
here to "fixup" atomic user access (ie, fake fault)... this call to
flush_tlb_fix_spurious_fault(), I'm not entirely certain what it's
doing, ie, it shouldn't be necessary on powerpc and is #ifdef'ed out on
x86, but I suppose -some- archs at least may lazily fixup permissions in a way
that requires it.

That means that an arch that needs that fixup will potentially also break with
the way the futex code relies on gup to do the faulting, since here too, we are
in a situation where gup -will- find a valid struct page and valid write
permission, but some kind of fixup is still needed.

It looks like a more robust fix would be to indeed factor out that code from
handle_pte_fault() and call it from gup as well, at least if the arch requires
it (and we can make "safe" archs like x86 not require it).

We do want to avoid that flush spurious mapping on common gup's however,
it's going to be a killer. That means we need to inform gup that it's been
called in order to fixup a previously EFAULT'ing atomic user access, and
thus that we require it to perform all the necessary fixups.

In fact, with such a flag, we could probably avoid the ifdef entirely, and
always go toward the PTE fixup path when called in such a fixup case, my gut
feeling is that this is going to be seldom enough not to hurt x86 measurably
but we'll have to try it out.

That leads to that even less tested patch:

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 9670f71..8a76694 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1546,6 +1546,7 @@ struct page *follow_page(struct vm_area_struct *, unsigned long address,
 #define FOLL_MLOCK	0x40	/* mark page as mlocked */
 #define FOLL_SPLIT	0x80	/* don't return transhuge pages, split them */
 #define FOLL_HWPOISON	0x100	/* check page is hwpoisoned */
+#define FOLL_FIXFAULT	0x200	/* fixup after a fault (PTE dirty/young upd) */
 
 typedef int (*pte_fn_t)(pte_t *pte, pgtable_t token, unsigned long addr,
 			void *data);
diff --git a/kernel/futex.c b/kernel/futex.c
index fe28dc2..7480a93 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -355,8 +355,8 @@ static int fault_in_user_writeable(u32 __user *uaddr)
 	int ret;
 
 	down_read(&mm->mmap_sem);
-	ret = get_user_pages(current, mm, (unsigned long)uaddr,
-			     1, 1, 0, NULL, NULL);
+	ret = __get_user_pages(current, mm, (unsigned long)uaddr,
+			       FOLL_WRITE | FOLL_FIXFAULT, NULL, NULL, NULL);
 	up_read(&mm->mmap_sem);
 
 	return ret < 0 ? ret : 0;
diff --git a/mm/memory.c b/mm/memory.c
index 40b7531..c61fddc 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1419,6 +1419,29 @@ int zap_vma_ptes(struct vm_area_struct *vma, unsigned long address,
 }
 EXPORT_SYMBOL_GPL(zap_vma_ptes);
 
+static void handle_pte_sw_young_dirty(struct vm_area_struct *vma,
+				      unsigned long address,
+				      pte_t *ptep, int write)
+{
+	pte_t entry = *ptep;
+
+	if (write)
+		pte_mkdirty(entry);
+	entry = pte_mkyoung(entry);
+	if (ptep_set_access_flags(vma, address, pte, entry, write)) {
+		update_mmu_cache(vma, address, pte);
+	} else if (fault) {
+		/*
+		 * This is needed only for protection faults but the arch code
+		 * is not yet telling us if this is a protection fault or not.
+		 * This still avoids useless tlb flushes for .text page faults
+		 * with threads.
+		 */
+		if (write)
+			flush_tlb_fix_spurious_fault(vma, address);
+	}
+}
+
 /**
  * follow_page - look up a page descriptor from a user-virtual address
  * @vma: vm_area_struct mapping @address
@@ -1514,16 +1537,22 @@ split_fallthrough:
 
 	if (flags & FOLL_GET)
 		get_page(page);
-	if (flags & FOLL_TOUCH) {
-		if ((flags & FOLL_WRITE) &&
-		    !pte_dirty(pte) && !PageDirty(page))
-			set_page_dirty(page);
-		/*
-		 * pte_mkyoung() would be more correct here, but atomic care
-		 * is needed to avoid losing the dirty bit: it is easier to use
-		 * mark_page_accessed().
-		 */
-		mark_page_accessed(page);
+
+	if (!pte_young(pte) || ((flags & FOLL_WRITE) && !pte_dirty(pte))) {
+		if (flags & FOLL_FIXFAULT)
+			handle_pte_sw_young_dirty(vma, address, ptep,
+						  flags & FOLL_WRITE);
+		else if (flags & FOLL_TOUCH) {
+			if ((flags & FOLL_WRITE) &&
+			    !pte_dirty(pte) && !PageDirty(page))
+				set_page_dirty(page);
+			/*
+			 * pte_mkyoung() would be more correct here, but atomic care
+			 * is needed to avoid losing the dirty bit: it is easier to use
+			 * mark_page_accessed().
+			 */
+			mark_page_accessed(page);
+		}
 	}
 	if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) {
 		/*
@@ -3358,21 +3387,8 @@ int handle_pte_fault(struct mm_struct *mm,
 		if (!pte_write(entry))
 			return do_wp_page(mm, vma, address,
 					pte, pmd, ptl, entry);
-		entry = pte_mkdirty(entry);
-	}
-	entry = pte_mkyoung(entry);
-	if (ptep_set_access_flags(vma, address, pte, entry, flags & FAULT_FLAG_WRITE)) {
-		update_mmu_cache(vma, address, pte);
-	} else {
-		/*
-		 * This is needed only for protection faults but the arch code
-		 * is not yet telling us if this is a protection fault or not.
-		 * This still avoids useless tlb flushes for .text page faults
-		 * with threads.
-		 */
-		if (flags & FAULT_FLAG_WRITE)
-			flush_tlb_fix_spurious_fault(vma, address);
 	}
+	handle_pte_sw_young_dirty(vma, address, pte, flags & FAULT_FLAG_WRITE);
 unlock:
 	pte_unmap_unlock(pte, ptl);
 	return 0;

Any comment ?

Cheers,
Ben.

^ permalink raw reply	[flat|nested] 138+ messages in thread

* Re: [PATCH 1/1] Fixup write permission of TLB on powerpc e500 core
  2011-07-17 23:14               ` Benjamin Herrenschmidt
@ 2011-07-18  3:53                 ` Benjamin Herrenschmidt
  -1 siblings, 0 replies; 138+ messages in thread
From: Benjamin Herrenschmidt @ 2011-07-18  3:53 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Shan Hai, Peter Zijlstra, paulus, tglx, walken, dhowells,
	cmetcalf, tony.luck, akpm, linuxppc-dev, linux-kernel

On Mon, 2011-07-18 at 09:14 +1000, Benjamin Herrenschmidt wrote:

> In fact, with such a flag, we could probably avoid the ifdef entirely, and
> always go toward the PTE fixup path when called in such a fixup case, my gut
> feeling is that this is going to be seldom enough not to hurt x86 measurably
> but we'll have to try it out.
> 
> That leads to that even less tested patch:

 And here's a version that builds (still not tested :-)

Shan, can you verify whether that fixes the problem for you ?

I also had a cursory glance at the ARM code and it seems to rely on the
same stuff as embedded powerpc does for dirty/young updates, so in
theory it should exhibit the same problem.

I suspect the scenario is rare enough in practice in embedded workloads
that nobody noticed until now.

Cheers,
Ben.

diff --git a/drivers/ata/sata_sil24.c b/drivers/ata/sata_sil24.c
index 06c564e..e8ef0e6 100644
--- a/drivers/ata/sata_sil24.c
+++ b/drivers/ata/sata_sil24.c
@@ -1296,7 +1296,7 @@ static int sil24_init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
 				DRV_NAME);
 	if (rc)
 		return rc;
-	iomap = pcim_iomap_table(pdev);
+	iomap = pcim_iomap_table~(pdev);
 
 	/* apply workaround for completion IRQ loss on PCI-X errata */
 	if (pi.flags & SIL24_FLAG_PCIX_IRQ_WOC) {
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 9670f71..8a76694 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1546,6 +1546,7 @@ struct page *follow_page(struct vm_area_struct *, unsigned long address,
 #define FOLL_MLOCK	0x40	/* mark page as mlocked */
 #define FOLL_SPLIT	0x80	/* don't return transhuge pages, split them */
 #define FOLL_HWPOISON	0x100	/* check page is hwpoisoned */
+#define FOLL_FIXFAULT	0x200	/* fixup after a fault (PTE dirty/young upd) */
 
 typedef int (*pte_fn_t)(pte_t *pte, pgtable_t token, unsigned long addr,
 			void *data);
diff --git a/kernel/futex.c b/kernel/futex.c
index fe28dc2..02adff7 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -355,8 +355,8 @@ static int fault_in_user_writeable(u32 __user *uaddr)
 	int ret;
 
 	down_read(&mm->mmap_sem);
-	ret = get_user_pages(current, mm, (unsigned long)uaddr,
-			     1, 1, 0, NULL, NULL);
+	ret = __get_user_pages(current, mm, (unsigned long)uaddr, 1,
+			       FOLL_WRITE | FOLL_FIXFAULT, NULL, NULL, NULL);
 	up_read(&mm->mmap_sem);
 
 	return ret < 0 ? ret : 0;
diff --git a/mm/memory.c b/mm/memory.c
index 40b7531..94b1d3f 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1419,6 +1419,29 @@ int zap_vma_ptes(struct vm_area_struct *vma, unsigned long address,
 }
 EXPORT_SYMBOL_GPL(zap_vma_ptes);
 
+static void handle_pte_sw_young_dirty(struct vm_area_struct *vma,
+				      unsigned long address,
+				      pte_t *ptep, int write)
+{
+	pte_t entry = *ptep;
+
+	if (write)
+		pte_mkdirty(entry);
+	entry = pte_mkyoung(entry);
+	if (ptep_set_access_flags(vma, address, ptep, entry, write)) {
+		update_mmu_cache(vma, address, ptep);
+	} else {
+		/*
+		 * This is needed only for protection faults but the arch code
+		 * is not yet telling us if this is a protection fault or not.
+		 * This still avoids useless tlb flushes for .text page faults
+		 * with threads.
+		 */
+		if (write)
+			flush_tlb_fix_spurious_fault(vma, address);
+	}
+}
+
 /**
  * follow_page - look up a page descriptor from a user-virtual address
  * @vma: vm_area_struct mapping @address
@@ -1514,16 +1537,22 @@ split_fallthrough:
 
 	if (flags & FOLL_GET)
 		get_page(page);
-	if (flags & FOLL_TOUCH) {
-		if ((flags & FOLL_WRITE) &&
-		    !pte_dirty(pte) && !PageDirty(page))
-			set_page_dirty(page);
-		/*
-		 * pte_mkyoung() would be more correct here, but atomic care
-		 * is needed to avoid losing the dirty bit: it is easier to use
-		 * mark_page_accessed().
-		 */
-		mark_page_accessed(page);
+
+	if (!pte_young(pte) || ((flags & FOLL_WRITE) && !pte_dirty(pte))) {
+		if (flags & FOLL_FIXFAULT)
+			handle_pte_sw_young_dirty(vma, address, ptep,
+						  flags & FOLL_WRITE);
+		else if (flags & FOLL_TOUCH) {
+			if ((flags & FOLL_WRITE) &&
+			    !pte_dirty(pte) && !PageDirty(page))
+				set_page_dirty(page);
+			/*
+			 * pte_mkyoung() would be more correct here, but atomic care
+			 * is needed to avoid losing the dirty bit: it is easier to use
+			 * mark_page_accessed().
+			 */
+			mark_page_accessed(page);
+		}
 	}
 	if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) {
 		/*
@@ -3358,21 +3387,8 @@ int handle_pte_fault(struct mm_struct *mm,
 		if (!pte_write(entry))
 			return do_wp_page(mm, vma, address,
 					pte, pmd, ptl, entry);
-		entry = pte_mkdirty(entry);
-	}
-	entry = pte_mkyoung(entry);
-	if (ptep_set_access_flags(vma, address, pte, entry, flags & FAULT_FLAG_WRITE)) {
-		update_mmu_cache(vma, address, pte);
-	} else {
-		/*
-		 * This is needed only for protection faults but the arch code
-		 * is not yet telling us if this is a protection fault or not.
-		 * This still avoids useless tlb flushes for .text page faults
-		 * with threads.
-		 */
-		if (flags & FAULT_FLAG_WRITE)
-			flush_tlb_fix_spurious_fault(vma, address);
 	}
+	handle_pte_sw_young_dirty(vma, address, pte, flags & FAULT_FLAG_WRITE);
 unlock:
 	pte_unmap_unlock(pte, ptl);
 	return 0;



^ permalink raw reply	[flat|nested] 138+ messages in thread

* Re: [PATCH 1/1] Fixup write permission of TLB on powerpc e500 core
@ 2011-07-18  3:53                 ` Benjamin Herrenschmidt
  0 siblings, 0 replies; 138+ messages in thread
From: Benjamin Herrenschmidt @ 2011-07-18  3:53 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: tony.luck, Peter Zijlstra, Shan Hai, linux-kernel, cmetcalf,
	dhowells, paulus, tglx, walken, linuxppc-dev, akpm

On Mon, 2011-07-18 at 09:14 +1000, Benjamin Herrenschmidt wrote:

> In fact, with such a flag, we could probably avoid the ifdef entirely, and
> always go toward the PTE fixup path when called in such a fixup case, my gut
> feeling is that this is going to be seldom enough not to hurt x86 measurably
> but we'll have to try it out.
> 
> That leads to that even less tested patch:

 And here's a version that builds (still not tested :-)

Shan, can you verify whether that fixes the problem for you ?

I also had a cursory glance at the ARM code and it seems to rely on the
same stuff as embedded powerpc does for dirty/young updates, so in
theory it should exhibit the same problem.

I suspect the scenario is rare enough in practice in embedded workloads
that nobody noticed until now.

Cheers,
Ben.

diff --git a/drivers/ata/sata_sil24.c b/drivers/ata/sata_sil24.c
index 06c564e..e8ef0e6 100644
--- a/drivers/ata/sata_sil24.c
+++ b/drivers/ata/sata_sil24.c
@@ -1296,7 +1296,7 @@ static int sil24_init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
 				DRV_NAME);
 	if (rc)
 		return rc;
-	iomap = pcim_iomap_table(pdev);
+	iomap = pcim_iomap_table~(pdev);
 
 	/* apply workaround for completion IRQ loss on PCI-X errata */
 	if (pi.flags & SIL24_FLAG_PCIX_IRQ_WOC) {
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 9670f71..8a76694 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1546,6 +1546,7 @@ struct page *follow_page(struct vm_area_struct *, unsigned long address,
 #define FOLL_MLOCK	0x40	/* mark page as mlocked */
 #define FOLL_SPLIT	0x80	/* don't return transhuge pages, split them */
 #define FOLL_HWPOISON	0x100	/* check page is hwpoisoned */
+#define FOLL_FIXFAULT	0x200	/* fixup after a fault (PTE dirty/young upd) */
 
 typedef int (*pte_fn_t)(pte_t *pte, pgtable_t token, unsigned long addr,
 			void *data);
diff --git a/kernel/futex.c b/kernel/futex.c
index fe28dc2..02adff7 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -355,8 +355,8 @@ static int fault_in_user_writeable(u32 __user *uaddr)
 	int ret;
 
 	down_read(&mm->mmap_sem);
-	ret = get_user_pages(current, mm, (unsigned long)uaddr,
-			     1, 1, 0, NULL, NULL);
+	ret = __get_user_pages(current, mm, (unsigned long)uaddr, 1,
+			       FOLL_WRITE | FOLL_FIXFAULT, NULL, NULL, NULL);
 	up_read(&mm->mmap_sem);
 
 	return ret < 0 ? ret : 0;
diff --git a/mm/memory.c b/mm/memory.c
index 40b7531..94b1d3f 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1419,6 +1419,29 @@ int zap_vma_ptes(struct vm_area_struct *vma, unsigned long address,
 }
 EXPORT_SYMBOL_GPL(zap_vma_ptes);
 
+static void handle_pte_sw_young_dirty(struct vm_area_struct *vma,
+				      unsigned long address,
+				      pte_t *ptep, int write)
+{
+	pte_t entry = *ptep;
+
+	if (write)
+		pte_mkdirty(entry);
+	entry = pte_mkyoung(entry);
+	if (ptep_set_access_flags(vma, address, ptep, entry, write)) {
+		update_mmu_cache(vma, address, ptep);
+	} else {
+		/*
+		 * This is needed only for protection faults but the arch code
+		 * is not yet telling us if this is a protection fault or not.
+		 * This still avoids useless tlb flushes for .text page faults
+		 * with threads.
+		 */
+		if (write)
+			flush_tlb_fix_spurious_fault(vma, address);
+	}
+}
+
 /**
  * follow_page - look up a page descriptor from a user-virtual address
  * @vma: vm_area_struct mapping @address
@@ -1514,16 +1537,22 @@ split_fallthrough:
 
 	if (flags & FOLL_GET)
 		get_page(page);
-	if (flags & FOLL_TOUCH) {
-		if ((flags & FOLL_WRITE) &&
-		    !pte_dirty(pte) && !PageDirty(page))
-			set_page_dirty(page);
-		/*
-		 * pte_mkyoung() would be more correct here, but atomic care
-		 * is needed to avoid losing the dirty bit: it is easier to use
-		 * mark_page_accessed().
-		 */
-		mark_page_accessed(page);
+
+	if (!pte_young(pte) || ((flags & FOLL_WRITE) && !pte_dirty(pte))) {
+		if (flags & FOLL_FIXFAULT)
+			handle_pte_sw_young_dirty(vma, address, ptep,
+						  flags & FOLL_WRITE);
+		else if (flags & FOLL_TOUCH) {
+			if ((flags & FOLL_WRITE) &&
+			    !pte_dirty(pte) && !PageDirty(page))
+				set_page_dirty(page);
+			/*
+			 * pte_mkyoung() would be more correct here, but atomic care
+			 * is needed to avoid losing the dirty bit: it is easier to use
+			 * mark_page_accessed().
+			 */
+			mark_page_accessed(page);
+		}
 	}
 	if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) {
 		/*
@@ -3358,21 +3387,8 @@ int handle_pte_fault(struct mm_struct *mm,
 		if (!pte_write(entry))
 			return do_wp_page(mm, vma, address,
 					pte, pmd, ptl, entry);
-		entry = pte_mkdirty(entry);
-	}
-	entry = pte_mkyoung(entry);
-	if (ptep_set_access_flags(vma, address, pte, entry, flags & FAULT_FLAG_WRITE)) {
-		update_mmu_cache(vma, address, pte);
-	} else {
-		/*
-		 * This is needed only for protection faults but the arch code
-		 * is not yet telling us if this is a protection fault or not.
-		 * This still avoids useless tlb flushes for .text page faults
-		 * with threads.
-		 */
-		if (flags & FAULT_FLAG_WRITE)
-			flush_tlb_fix_spurious_fault(vma, address);
 	}
+	handle_pte_sw_young_dirty(vma, address, pte, flags & FAULT_FLAG_WRITE);
 unlock:
 	pte_unmap_unlock(pte, ptl);
 	return 0;

^ permalink raw reply	[flat|nested] 138+ messages in thread

* Re: [PATCH 1/1] Fixup write permission of TLB on powerpc e500 core
  2011-07-17 23:14               ` Benjamin Herrenschmidt
@ 2011-07-18  4:01                 ` Benjamin Herrenschmidt
  -1 siblings, 0 replies; 138+ messages in thread
From: Benjamin Herrenschmidt @ 2011-07-18  4:01 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Shan Hai, Peter Zijlstra, paulus, tglx, walken, dhowells,
	cmetcalf, tony.luck, akpm, linuxppc-dev, linux-kernel

On Mon, 2011-07-18 at 09:14 +1000, Benjamin Herrenschmidt wrote:

> In fact, with such a flag, we could probably avoid the ifdef entirely, and
> always go toward the PTE fixup path when called in such a fixup case, my gut
> feeling is that this is going to be seldom enough not to hurt x86 measurably
> but we'll have to try it out.
> 
> That leads to that even less tested patch:

And here's a version that builds and fixes a bug or two
(still not tested :-)

Shan, can you verify whether that fixes the problem for you ?

I also had a cursory glance at the ARM code and it seems to rely on the
same stuff as embedded powerpc does for dirty/young updates, so in
theory it should exhibit the same problem.

I suspect the scenario is rare enough in practice in embedded workloads
that nobody noticed until now.

Cheers,
Ben.

mm/futex: Fix use of gup() to "fixup" failing atomic user accesses

The futex code uses atomic (page fault disabled) accesses to user space,
and when they fail, uses get_user_pages() to "fixup" the PTE and try again.

However, on arch with SW tracking of the dirty and young bits, this will
not work properly as neither of the above will perform the necessary fixup
of those bits.

There are also possible corner cases with archs that rely on
handle_pte_fault() to invalidate the TLB for "spurious" faults (though
I don't know which arch actually needs that). Those would break the
same way.

This fixes it by factoring out the fixup code from handle_pte_fault() into
a separate function, and use it from within gup as well, whenever the
FOLL_FIXFAULT flag has been passed to it. The futex code is modified to
pass that flag.

This doesn't change the "normal" gup case (and thus avoids the overhead
of doing that tracking)

Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
---

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 9670f71..8a76694 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1546,6 +1546,7 @@ struct page *follow_page(struct vm_area_struct *, unsigned long address,
 #define FOLL_MLOCK	0x40	/* mark page as mlocked */
 #define FOLL_SPLIT	0x80	/* don't return transhuge pages, split them */
 #define FOLL_HWPOISON	0x100	/* check page is hwpoisoned */
+#define FOLL_FIXFAULT	0x200	/* fixup after a fault (PTE dirty/young upd) */
 
 typedef int (*pte_fn_t)(pte_t *pte, pgtable_t token, unsigned long addr,
 			void *data);
diff --git a/kernel/futex.c b/kernel/futex.c
index fe28dc2..02adff7 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -355,8 +355,8 @@ static int fault_in_user_writeable(u32 __user *uaddr)
 	int ret;
 
 	down_read(&mm->mmap_sem);
-	ret = get_user_pages(current, mm, (unsigned long)uaddr,
-			     1, 1, 0, NULL, NULL);
+	ret = __get_user_pages(current, mm, (unsigned long)uaddr, 1,
+			       FOLL_WRITE | FOLL_FIXFAULT, NULL, NULL, NULL);
 	up_read(&mm->mmap_sem);
 
 	return ret < 0 ? ret : 0;
diff --git a/mm/memory.c b/mm/memory.c
index 40b7531..3c4d502 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1419,6 +1419,29 @@ int zap_vma_ptes(struct vm_area_struct *vma, unsigned long address,
 }
 EXPORT_SYMBOL_GPL(zap_vma_ptes);
 
+static void handle_pte_sw_young_dirty(struct vm_area_struct *vma,
+				      unsigned long address,
+				      pte_t *ptep, int write)
+{
+	pte_t entry = *ptep;
+
+	if (write)
+		pte_mkdirty(entry);
+	entry = pte_mkyoung(entry);
+	if (ptep_set_access_flags(vma, address, ptep, entry, write)) {
+		update_mmu_cache(vma, address, ptep);
+	} else {
+		/*
+		 * This is needed only for protection faults but the arch code
+		 * is not yet telling us if this is a protection fault or not.
+		 * This still avoids useless tlb flushes for .text page faults
+		 * with threads.
+		 */
+		if (write)
+			flush_tlb_fix_spurious_fault(vma, address);
+	}
+}
+
 /**
  * follow_page - look up a page descriptor from a user-virtual address
  * @vma: vm_area_struct mapping @address
@@ -1514,6 +1537,10 @@ split_fallthrough:
 
 	if (flags & FOLL_GET)
 		get_page(page);
+
+	if (flags & FOLL_FIXFAULT)
+		handle_pte_sw_young_dirty(vma, address, ptep,
+					  flags & FOLL_WRITE);
 	if (flags & FOLL_TOUCH) {
 		if ((flags & FOLL_WRITE) &&
 		    !pte_dirty(pte) && !PageDirty(page))
@@ -1525,6 +1552,7 @@ split_fallthrough:
 		 */
 		mark_page_accessed(page);
 	}
+
 	if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) {
 		/*
 		 * The preliminary mapping check is mainly to avoid the
@@ -3358,21 +3386,8 @@ int handle_pte_fault(struct mm_struct *mm,
 		if (!pte_write(entry))
 			return do_wp_page(mm, vma, address,
 					pte, pmd, ptl, entry);
-		entry = pte_mkdirty(entry);
-	}
-	entry = pte_mkyoung(entry);
-	if (ptep_set_access_flags(vma, address, pte, entry, flags & FAULT_FLAG_WRITE)) {
-		update_mmu_cache(vma, address, pte);
-	} else {
-		/*
-		 * This is needed only for protection faults but the arch code
-		 * is not yet telling us if this is a protection fault or not.
-		 * This still avoids useless tlb flushes for .text page faults
-		 * with threads.
-		 */
-		if (flags & FAULT_FLAG_WRITE)
-			flush_tlb_fix_spurious_fault(vma, address);
 	}
+	handle_pte_sw_young_dirty(vma, address, pte, flags & FAULT_FLAG_WRITE);
 unlock:
 	pte_unmap_unlock(pte, ptl);
 	return 0;



^ permalink raw reply	[flat|nested] 138+ messages in thread

* Re: [PATCH 1/1] Fixup write permission of TLB on powerpc e500 core
@ 2011-07-18  4:01                 ` Benjamin Herrenschmidt
  0 siblings, 0 replies; 138+ messages in thread
From: Benjamin Herrenschmidt @ 2011-07-18  4:01 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: tony.luck, Peter Zijlstra, Shan Hai, linux-kernel, cmetcalf,
	dhowells, paulus, tglx, walken, linuxppc-dev, akpm

On Mon, 2011-07-18 at 09:14 +1000, Benjamin Herrenschmidt wrote:

> In fact, with such a flag, we could probably avoid the ifdef entirely, and
> always go toward the PTE fixup path when called in such a fixup case, my gut
> feeling is that this is going to be seldom enough not to hurt x86 measurably
> but we'll have to try it out.
> 
> That leads to that even less tested patch:

And here's a version that builds and fixes a bug or two
(still not tested :-)

Shan, can you verify whether that fixes the problem for you ?

I also had a cursory glance at the ARM code and it seems to rely on the
same stuff as embedded powerpc does for dirty/young updates, so in
theory it should exhibit the same problem.

I suspect the scenario is rare enough in practice in embedded workloads
that nobody noticed until now.

Cheers,
Ben.

mm/futex: Fix use of gup() to "fixup" failing atomic user accesses

The futex code uses atomic (page fault disabled) accesses to user space,
and when they fail, uses get_user_pages() to "fixup" the PTE and try again.

However, on arch with SW tracking of the dirty and young bits, this will
not work properly as neither of the above will perform the necessary fixup
of those bits.

There are also possible corner cases with archs that rely on
handle_pte_fault() to invalidate the TLB for "spurious" faults (though
I don't know which arch actually needs that). Those would break the
same way.

This fixes it by factoring out the fixup code from handle_pte_fault() into
a separate function, and use it from within gup as well, whenever the
FOLL_FIXFAULT flag has been passed to it. The futex code is modified to
pass that flag.

This doesn't change the "normal" gup case (and thus avoids the overhead
of doing that tracking)

Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
---

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 9670f71..8a76694 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1546,6 +1546,7 @@ struct page *follow_page(struct vm_area_struct *, unsigned long address,
 #define FOLL_MLOCK	0x40	/* mark page as mlocked */
 #define FOLL_SPLIT	0x80	/* don't return transhuge pages, split them */
 #define FOLL_HWPOISON	0x100	/* check page is hwpoisoned */
+#define FOLL_FIXFAULT	0x200	/* fixup after a fault (PTE dirty/young upd) */
 
 typedef int (*pte_fn_t)(pte_t *pte, pgtable_t token, unsigned long addr,
 			void *data);
diff --git a/kernel/futex.c b/kernel/futex.c
index fe28dc2..02adff7 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -355,8 +355,8 @@ static int fault_in_user_writeable(u32 __user *uaddr)
 	int ret;
 
 	down_read(&mm->mmap_sem);
-	ret = get_user_pages(current, mm, (unsigned long)uaddr,
-			     1, 1, 0, NULL, NULL);
+	ret = __get_user_pages(current, mm, (unsigned long)uaddr, 1,
+			       FOLL_WRITE | FOLL_FIXFAULT, NULL, NULL, NULL);
 	up_read(&mm->mmap_sem);
 
 	return ret < 0 ? ret : 0;
diff --git a/mm/memory.c b/mm/memory.c
index 40b7531..3c4d502 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1419,6 +1419,29 @@ int zap_vma_ptes(struct vm_area_struct *vma, unsigned long address,
 }
 EXPORT_SYMBOL_GPL(zap_vma_ptes);
 
+static void handle_pte_sw_young_dirty(struct vm_area_struct *vma,
+				      unsigned long address,
+				      pte_t *ptep, int write)
+{
+	pte_t entry = *ptep;
+
+	if (write)
+		pte_mkdirty(entry);
+	entry = pte_mkyoung(entry);
+	if (ptep_set_access_flags(vma, address, ptep, entry, write)) {
+		update_mmu_cache(vma, address, ptep);
+	} else {
+		/*
+		 * This is needed only for protection faults but the arch code
+		 * is not yet telling us if this is a protection fault or not.
+		 * This still avoids useless tlb flushes for .text page faults
+		 * with threads.
+		 */
+		if (write)
+			flush_tlb_fix_spurious_fault(vma, address);
+	}
+}
+
 /**
  * follow_page - look up a page descriptor from a user-virtual address
  * @vma: vm_area_struct mapping @address
@@ -1514,6 +1537,10 @@ split_fallthrough:
 
 	if (flags & FOLL_GET)
 		get_page(page);
+
+	if (flags & FOLL_FIXFAULT)
+		handle_pte_sw_young_dirty(vma, address, ptep,
+					  flags & FOLL_WRITE);
 	if (flags & FOLL_TOUCH) {
 		if ((flags & FOLL_WRITE) &&
 		    !pte_dirty(pte) && !PageDirty(page))
@@ -1525,6 +1552,7 @@ split_fallthrough:
 		 */
 		mark_page_accessed(page);
 	}
+
 	if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) {
 		/*
 		 * The preliminary mapping check is mainly to avoid the
@@ -3358,21 +3386,8 @@ int handle_pte_fault(struct mm_struct *mm,
 		if (!pte_write(entry))
 			return do_wp_page(mm, vma, address,
 					pte, pmd, ptl, entry);
-		entry = pte_mkdirty(entry);
-	}
-	entry = pte_mkyoung(entry);
-	if (ptep_set_access_flags(vma, address, pte, entry, flags & FAULT_FLAG_WRITE)) {
-		update_mmu_cache(vma, address, pte);
-	} else {
-		/*
-		 * This is needed only for protection faults but the arch code
-		 * is not yet telling us if this is a protection fault or not.
-		 * This still avoids useless tlb flushes for .text page faults
-		 * with threads.
-		 */
-		if (flags & FAULT_FLAG_WRITE)
-			flush_tlb_fix_spurious_fault(vma, address);
 	}
+	handle_pte_sw_young_dirty(vma, address, pte, flags & FAULT_FLAG_WRITE);
 unlock:
 	pte_unmap_unlock(pte, ptl);
 	return 0;

^ permalink raw reply	[flat|nested] 138+ messages in thread

* Re: [PATCH 1/1] Fixup write permission of TLB on powerpc e500 core
  2011-07-18  3:53                 ` Benjamin Herrenschmidt
@ 2011-07-18  4:02                   ` Benjamin Herrenschmidt
  -1 siblings, 0 replies; 138+ messages in thread
From: Benjamin Herrenschmidt @ 2011-07-18  4:02 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Shan Hai, Peter Zijlstra, paulus, tglx, walken, dhowells,
	cmetcalf, tony.luck, akpm, linuxppc-dev, linux-kernel

On Mon, 2011-07-18 at 13:53 +1000, Benjamin Herrenschmidt wrote:
> On Mon, 2011-07-18 at 09:14 +1000, Benjamin Herrenschmidt wrote:
> 
> > In fact, with such a flag, we could probably avoid the ifdef entirely, and
> > always go toward the PTE fixup path when called in such a fixup case, my gut
> > feeling is that this is going to be seldom enough not to hurt x86 measurably
> > but we'll have to try it out.
> > 
> > That leads to that even less tested patch:
> 
>  And here's a version that builds (still not tested :-)
> 
> Shan, can you verify whether that fixes the problem for you ?
> 
> I also had a cursory glance at the ARM code and it seems to rely on the
> same stuff as embedded powerpc does for dirty/young updates, so in
> theory it should exhibit the same problem.
> 
> I suspect the scenario is rare enough in practice in embedded workloads
> that nobody noticed until now.

Ignore that bogus send, I sent a proper one immediately after (evolution
FAIL, sorry about that)

Cheers,
Ben.



^ permalink raw reply	[flat|nested] 138+ messages in thread

* Re: [PATCH 1/1] Fixup write permission of TLB on powerpc e500 core
@ 2011-07-18  4:02                   ` Benjamin Herrenschmidt
  0 siblings, 0 replies; 138+ messages in thread
From: Benjamin Herrenschmidt @ 2011-07-18  4:02 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: tony.luck, Peter Zijlstra, Shan Hai, linux-kernel, cmetcalf,
	dhowells, paulus, tglx, walken, linuxppc-dev, akpm

On Mon, 2011-07-18 at 13:53 +1000, Benjamin Herrenschmidt wrote:
> On Mon, 2011-07-18 at 09:14 +1000, Benjamin Herrenschmidt wrote:
> 
> > In fact, with such a flag, we could probably avoid the ifdef entirely, and
> > always go toward the PTE fixup path when called in such a fixup case, my gut
> > feeling is that this is going to be seldom enough not to hurt x86 measurably
> > but we'll have to try it out.
> > 
> > That leads to that even less tested patch:
> 
>  And here's a version that builds (still not tested :-)
> 
> Shan, can you verify whether that fixes the problem for you ?
> 
> I also had a cursory glance at the ARM code and it seems to rely on the
> same stuff as embedded powerpc does for dirty/young updates, so in
> theory it should exhibit the same problem.
> 
> I suspect the scenario is rare enough in practice in embedded workloads
> that nobody noticed until now.

Ignore that bogus send, I sent a proper one immediately after (evolution
FAIL, sorry about that)

Cheers,
Ben.

^ permalink raw reply	[flat|nested] 138+ messages in thread

* Re: [PATCH 1/1] Fixup write permission of TLB on powerpc e500 core
  2011-07-18  4:01                 ` Benjamin Herrenschmidt
@ 2011-07-18  6:48                   ` Shan Hai
  -1 siblings, 0 replies; 138+ messages in thread
From: Shan Hai @ 2011-07-18  6:48 UTC (permalink / raw)
  To: Benjamin Herrenschmidt
  Cc: Peter Zijlstra, Peter Zijlstra, paulus, tglx, walken, dhowells,
	cmetcalf, tony.luck, akpm, linuxppc-dev, linux-kernel

On 07/18/2011 12:01 PM, Benjamin Herrenschmidt wrote:
> On Mon, 2011-07-18 at 09:14 +1000, Benjamin Herrenschmidt wrote:
>
>> In fact, with such a flag, we could probably avoid the ifdef entirely, and
>> always go toward the PTE fixup path when called in such a fixup case, my gut
>> feeling is that this is going to be seldom enough not to hurt x86 measurably
>> but we'll have to try it out.
>>
>> That leads to that even less tested patch:
> And here's a version that builds and fixes a bug or two
> (still not tested :-)
>
> Shan, can you verify whether that fixes the problem for you ?
>

It did not fix the problem; see my inline replies below for
the reasons.

> I also had a cursory glance at the ARM code and it seems to rely on the
> same stuff as embedded powerpc does for dirty/young updates, so in
> theory it should exhibit the same problem.
>
> I suspect the scenario is rare enough in practice in embedded workloads
> that nobody noticed until now.
>
> Cheers,
> Ben.
>
> mm/futex: Fix use of gup() to "fixup" failing atomic user accesses
>
> The futex code uses atomic (page fault disabled) accesses to user space,
> and when they fail, uses get_user_pages() to "fixup" the PTE and try again.
>
> However, on arch with SW tracking of the dirty and young bits, this will
> not work properly as neither of the above will perform the necessary fixup
> of those bits.
>
> There's also a possible corner cases with archs who rely on
> handle_pte_fault() to invalidate the TLB for "spurrious" faults (though
> I don't know which arch actually needs that). Those would break the
> same way.
>
> This fixes it by factoring out the fixup code from handle_pte_fault() into
> a separate function, and use it from within gup as well, whenever the
> FOLL_FIXFAULT flag has been passed to it. The futex code is modified to
> pass that flag.
>
> This doesn't change the "normal" gup case (and thus avoids the overhead
> of doing that tracking)
>
> Signed-off-by: Benjamin Herrenschmidt<benh@kernel.crashing.org>
> ---
>
> diff --git a/include/linux/mm.h b/include/linux/mm.h
> index 9670f71..8a76694 100644
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -1546,6 +1546,7 @@ struct page *follow_page(struct vm_area_struct *, unsigned long address,
>   #define FOLL_MLOCK	0x40	/* mark page as mlocked */
>   #define FOLL_SPLIT	0x80	/* don't return transhuge pages, split them */
>   #define FOLL_HWPOISON	0x100	/* check page is hwpoisoned */
> +#define FOLL_FIXFAULT	0x200	/* fixup after a fault (PTE dirty/young upd) */
>
>   typedef int (*pte_fn_t)(pte_t *pte, pgtable_t token, unsigned long addr,
>   			void *data);
> diff --git a/kernel/futex.c b/kernel/futex.c
> index fe28dc2..02adff7 100644
> --- a/kernel/futex.c
> +++ b/kernel/futex.c
> @@ -355,8 +355,8 @@ static int fault_in_user_writeable(u32 __user *uaddr)
>   	int ret;
>
>   	down_read(&mm->mmap_sem);
> -	ret = get_user_pages(current, mm, (unsigned long)uaddr,
> -			     1, 1, 0, NULL, NULL);
> +	ret = __get_user_pages(current, mm, (unsigned long)uaddr, 1,
> +			       FOLL_WRITE | FOLL_FIXFAULT, NULL, NULL, NULL);

the FOLL_FIXFAULT is filtered out at the following code
get_user_pages()
     if (write)
                 flags |= FOLL_WRITE;

>   	up_read(&mm->mmap_sem);
>
>   	return ret<  0 ? ret : 0;
> diff --git a/mm/memory.c b/mm/memory.c
> index 40b7531..3c4d502 100644
> --- a/mm/memory.c
> +++ b/mm/memory.c
> @@ -1419,6 +1419,29 @@ int zap_vma_ptes(struct vm_area_struct *vma, unsigned long address,
>   }
>   EXPORT_SYMBOL_GPL(zap_vma_ptes);
>
> +static void handle_pte_sw_young_dirty(struct vm_area_struct *vma,
> +				      unsigned long address,
> +				      pte_t *ptep, int write)
> +{
> +	pte_t entry = *ptep;
> +
> +	if (write)
> +		pte_mkdirty(entry);
> +	entry = pte_mkyoung(entry);
> +	if (ptep_set_access_flags(vma, address, ptep, entry, write)) {
> +		update_mmu_cache(vma, address, ptep);
> +	} else {
> +		/*
> +		 * This is needed only for protection faults but the arch code
> +		 * is not yet telling us if this is a protection fault or not.
> +		 * This still avoids useless tlb flushes for .text page faults
> +		 * with threads.
> +		 */
> +		if (write)
> +			flush_tlb_fix_spurious_fault(vma, address);
> +	}
> +}
> +
>   /**
>    * follow_page - look up a page descriptor from a user-virtual address
>    * @vma: vm_area_struct mapping @address
> @@ -1514,6 +1537,10 @@ split_fallthrough:
>
>   	if (flags&  FOLL_GET)
>   		get_page(page);
> +
> +	if (flags&  FOLL_FIXFAULT)
> +		handle_pte_sw_young_dirty(vma, address, ptep,
> +					  flags&  FOLL_WRITE);
>   	if (flags&  FOLL_TOUCH) {
>   		if ((flags&  FOLL_WRITE)&&
>   		!pte_dirty(pte)&&  !PageDirty(page))

Calling handle_pte_sw_young_dirty() before the !pte_dirty(pte) check
might cause problems.

> @@ -1525,6 +1552,7 @@ split_fallthrough:
>   		 */
>   		mark_page_accessed(page);
>   	}
> +
>   	if ((flags&  FOLL_MLOCK)&&  (vma->vm_flags&  VM_LOCKED)) {
>   		/*
>   		 * The preliminary mapping check is mainly to avoid the
> @@ -3358,21 +3386,8 @@ int handle_pte_fault(struct mm_struct *mm,
>   		if (!pte_write(entry))
>   			return do_wp_page(mm, vma, address,
>   					pte, pmd, ptl, entry);
> -		entry = pte_mkdirty(entry);
> -	}
> -	entry = pte_mkyoung(entry);
> -	if (ptep_set_access_flags(vma, address, pte, entry, flags&  FAULT_FLAG_WRITE)) {
> -		update_mmu_cache(vma, address, pte);
> -	} else {
> -		/*
> -		 * This is needed only for protection faults but the arch code
> -		 * is not yet telling us if this is a protection fault or not.
> -		 * This still avoids useless tlb flushes for .text page faults
> -		 * with threads.
> -		 */
> -		if (flags&  FAULT_FLAG_WRITE)
> -			flush_tlb_fix_spurious_fault(vma, address);
>   	}
> +	handle_pte_sw_young_dirty(vma, address, pte, flags&  FAULT_FLAG_WRITE);
>   unlock:
>   	pte_unmap_unlock(pte, ptl);
>   	return 0;
>
>

So what about the following?
diff --git a/mm/memory.c b/mm/memory.c
index 9b8a01d..fb48122 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1442,6 +1442,7 @@ struct page *follow_page(struct vm_area_struct 
*vma, unsig
         spinlock_t *ptl;
         struct page *page;
         struct mm_struct *mm = vma->vm_mm;
+       int fix_write_permission = false;

         page = follow_huge_addr(mm, address, flags & FOLL_WRITE);
         if (!IS_ERR(page)) {
@@ -1519,6 +1520,11 @@ split_fallthrough:
                 if ((flags & FOLL_WRITE) &&
                     !pte_dirty(pte) && !PageDirty(page))
                         set_page_dirty(page);
+
+#ifdef CONFIG_FIXUP_WRITE_PERMISSION
+               if ((flags & FOLL_WRITE) && !pte_dirty(pte))
+                       fix_write_permission = true;
+#endif
                 /*
                  * pte_mkyoung() would be more correct here, but atomic 
care
                  * is needed to avoid losing the dirty bit: it is 
easier to use
@@ -1551,7 +1557,7 @@ split_fallthrough:
  unlock:
         pte_unmap_unlock(ptep, ptl);
  out:
-       return page;
+       return (fix_write_permission == true) ? NULL: page;

  bad_page:
         pte_unmap_unlock(ptep, ptl);



Given CONFIG_FIXUP_WRITE_PERMISSION and the condition
(flags & FOLL_WRITE) && !pte_dirty(pte), follow_page()
can tell that the caller wants to write to a
(present && writable && non-dirty) pte, and that the architecture
has asked for this fixup by selecting CONFIG_FIXUP_WRITE_PERMISSION.
So let follow_page() return NULL to __get_user_pages(), and
let handle_mm_fault() fix up the dirty/young tracking.

Looking at the following code, we can see that handle_mm_fault() is
already there to handle faults, and a write permission violation is
a kind of fault, so why don't we let handle_mm_fault() handle
it?

__get_user_pages()
      while (!(page = follow_page(vma, start, foll_flags))) {
         ...
         ret = handle_mm_fault(mm, vma, start,
                                                         fault_flags);
         ...
     }

Thanks
Shan Hai

^ permalink raw reply	[flat|nested] 138+ messages in thread

* Re: [PATCH 1/1] Fixup write permission of TLB on powerpc e500 core
@ 2011-07-18  6:48                   ` Shan Hai
  0 siblings, 0 replies; 138+ messages in thread
From: Shan Hai @ 2011-07-18  6:48 UTC (permalink / raw)
  To: Benjamin Herrenschmidt
  Cc: tony.luck, Peter Zijlstra, Peter Zijlstra, linux-kernel,
	cmetcalf, dhowells, paulus, tglx, walken, linuxppc-dev, akpm

On 07/18/2011 12:01 PM, Benjamin Herrenschmidt wrote:
> On Mon, 2011-07-18 at 09:14 +1000, Benjamin Herrenschmidt wrote:
>
>> In fact, with such a flag, we could probably avoid the ifdef entirely, and
>> always go toward the PTE fixup path when called in such a fixup case, my gut
>> feeling is that this is going to be seldom enough not to hurt x86 measurably
>> but we'll have to try it out.
>>
>> That leads to that even less tested patch:
> And here's a version that builds and fixes a bug or two
> (still not tested :-)
>
> Shan, can you verify whether that fixes the problem for you ?
>

It did not fix the problem; see my inline replies below for
the reasons.

> I also had a cursory glance at the ARM code and it seems to rely on the
> same stuff as embedded powerpc does for dirty/young updates, so in
> theory it should exhibit the same problem.
>
> I suspect the scenario is rare enough in practice in embedded workloads
> that nobody noticed until now.
>
> Cheers,
> Ben.
>
> mm/futex: Fix use of gup() to "fixup" failing atomic user accesses
>
> The futex code uses atomic (page fault disabled) accesses to user space,
> and when they fail, uses get_user_pages() to "fixup" the PTE and try again.
>
> However, on arch with SW tracking of the dirty and young bits, this will
> not work properly as neither of the above will perform the necessary fixup
> of those bits.
>
> There's also a possible corner cases with archs who rely on
> handle_pte_fault() to invalidate the TLB for "spurrious" faults (though
> I don't know which arch actually needs that). Those would break the
> same way.
>
> This fixes it by factoring out the fixup code from handle_pte_fault() into
> a separate function, and use it from within gup as well, whenever the
> FOLL_FIXFAULT flag has been passed to it. The futex code is modified to
> pass that flag.
>
> This doesn't change the "normal" gup case (and thus avoids the overhead
> of doing that tracking)
>
> Signed-off-by: Benjamin Herrenschmidt<benh@kernel.crashing.org>
> ---
>
> diff --git a/include/linux/mm.h b/include/linux/mm.h
> index 9670f71..8a76694 100644
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -1546,6 +1546,7 @@ struct page *follow_page(struct vm_area_struct *, unsigned long address,
>   #define FOLL_MLOCK	0x40	/* mark page as mlocked */
>   #define FOLL_SPLIT	0x80	/* don't return transhuge pages, split them */
>   #define FOLL_HWPOISON	0x100	/* check page is hwpoisoned */
> +#define FOLL_FIXFAULT	0x200	/* fixup after a fault (PTE dirty/young upd) */
>
>   typedef int (*pte_fn_t)(pte_t *pte, pgtable_t token, unsigned long addr,
>   			void *data);
> diff --git a/kernel/futex.c b/kernel/futex.c
> index fe28dc2..02adff7 100644
> --- a/kernel/futex.c
> +++ b/kernel/futex.c
> @@ -355,8 +355,8 @@ static int fault_in_user_writeable(u32 __user *uaddr)
>   	int ret;
>
>   	down_read(&mm->mmap_sem);
> -	ret = get_user_pages(current, mm, (unsigned long)uaddr,
> -			     1, 1, 0, NULL, NULL);
> +	ret = __get_user_pages(current, mm, (unsigned long)uaddr, 1,
> +			       FOLL_WRITE | FOLL_FIXFAULT, NULL, NULL, NULL);

the FOLL_FIXFAULT is filtered out at the following code
get_user_pages()
     if (write)
                 flags |= FOLL_WRITE;

>   	up_read(&mm->mmap_sem);
>
>   	return ret<  0 ? ret : 0;
> diff --git a/mm/memory.c b/mm/memory.c
> index 40b7531..3c4d502 100644
> --- a/mm/memory.c
> +++ b/mm/memory.c
> @@ -1419,6 +1419,29 @@ int zap_vma_ptes(struct vm_area_struct *vma, unsigned long address,
>   }
>   EXPORT_SYMBOL_GPL(zap_vma_ptes);
>
> +static void handle_pte_sw_young_dirty(struct vm_area_struct *vma,
> +				      unsigned long address,
> +				      pte_t *ptep, int write)
> +{
> +	pte_t entry = *ptep;
> +
> +	if (write)
> +		pte_mkdirty(entry);
> +	entry = pte_mkyoung(entry);
> +	if (ptep_set_access_flags(vma, address, ptep, entry, write)) {
> +		update_mmu_cache(vma, address, ptep);
> +	} else {
> +		/*
> +		 * This is needed only for protection faults but the arch code
> +		 * is not yet telling us if this is a protection fault or not.
> +		 * This still avoids useless tlb flushes for .text page faults
> +		 * with threads.
> +		 */
> +		if (write)
> +			flush_tlb_fix_spurious_fault(vma, address);
> +	}
> +}
> +
>   /**
>    * follow_page - look up a page descriptor from a user-virtual address
>    * @vma: vm_area_struct mapping @address
> @@ -1514,6 +1537,10 @@ split_fallthrough:
>
>   	if (flags&  FOLL_GET)
>   		get_page(page);
> +
> +	if (flags&  FOLL_FIXFAULT)
> +		handle_pte_sw_young_dirty(vma, address, ptep,
> +					  flags&  FOLL_WRITE);
>   	if (flags&  FOLL_TOUCH) {
>   		if ((flags&  FOLL_WRITE)&&
>   		!pte_dirty(pte)&&  !PageDirty(page))

Calling handle_pte_sw_young_dirty() before the !pte_dirty(pte) check
might cause problems.

> @@ -1525,6 +1552,7 @@ split_fallthrough:
>   		 */
>   		mark_page_accessed(page);
>   	}
> +
>   	if ((flags&  FOLL_MLOCK)&&  (vma->vm_flags&  VM_LOCKED)) {
>   		/*
>   		 * The preliminary mapping check is mainly to avoid the
> @@ -3358,21 +3386,8 @@ int handle_pte_fault(struct mm_struct *mm,
>   		if (!pte_write(entry))
>   			return do_wp_page(mm, vma, address,
>   					pte, pmd, ptl, entry);
> -		entry = pte_mkdirty(entry);
> -	}
> -	entry = pte_mkyoung(entry);
> -	if (ptep_set_access_flags(vma, address, pte, entry, flags&  FAULT_FLAG_WRITE)) {
> -		update_mmu_cache(vma, address, pte);
> -	} else {
> -		/*
> -		 * This is needed only for protection faults but the arch code
> -		 * is not yet telling us if this is a protection fault or not.
> -		 * This still avoids useless tlb flushes for .text page faults
> -		 * with threads.
> -		 */
> -		if (flags&  FAULT_FLAG_WRITE)
> -			flush_tlb_fix_spurious_fault(vma, address);
>   	}
> +	handle_pte_sw_young_dirty(vma, address, pte, flags&  FAULT_FLAG_WRITE);
>   unlock:
>   	pte_unmap_unlock(pte, ptl);
>   	return 0;
>
>

So what about the following?
diff --git a/mm/memory.c b/mm/memory.c
index 9b8a01d..fb48122 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1442,6 +1442,7 @@ struct page *follow_page(struct vm_area_struct 
*vma, unsig
         spinlock_t *ptl;
         struct page *page;
         struct mm_struct *mm = vma->vm_mm;
+       int fix_write_permission = false;

         page = follow_huge_addr(mm, address, flags & FOLL_WRITE);
         if (!IS_ERR(page)) {
@@ -1519,6 +1520,11 @@ split_fallthrough:
                 if ((flags & FOLL_WRITE) &&
                     !pte_dirty(pte) && !PageDirty(page))
                         set_page_dirty(page);
+
+#ifdef CONFIG_FIXUP_WRITE_PERMISSION
+               if ((flags & FOLL_WRITE) && !pte_dirty(pte))
+                       fix_write_permission = true;
+#endif
                 /*
                  * pte_mkyoung() would be more correct here, but atomic 
care
                  * is needed to avoid losing the dirty bit: it is 
easier to use
@@ -1551,7 +1557,7 @@ split_fallthrough:
  unlock:
         pte_unmap_unlock(ptep, ptl);
  out:
-       return page;
+       return (fix_write_permission == true) ? NULL: page;

  bad_page:
         pte_unmap_unlock(ptep, ptl);



Given CONFIG_FIXUP_WRITE_PERMISSION and the condition
(flags & FOLL_WRITE) && !pte_dirty(pte), follow_page()
can tell that the caller wants to write to a
(present && writable && non-dirty) pte, and that the architecture
has asked for this fixup by selecting CONFIG_FIXUP_WRITE_PERMISSION.
So let follow_page() return NULL to __get_user_pages(), and
let handle_mm_fault() fix up the dirty/young tracking.

Looking at the following code, we can see that handle_mm_fault() is
already there to handle faults, and a write permission violation is
a kind of fault, so why don't we let handle_mm_fault() handle
it?

__get_user_pages()
      while (!(page = follow_page(vma, start, foll_flags))) {
         ...
         ret = handle_mm_fault(mm, vma, start,
                                                         fault_flags);
         ...
     }

Thanks
Shan Hai

^ permalink raw reply	[flat|nested] 138+ messages in thread

* Re: [PATCH 1/1] Fixup write permission of TLB on powerpc e500 core
  2011-07-18  6:48                   ` Shan Hai
@ 2011-07-18  7:01                     ` Benjamin Herrenschmidt
  -1 siblings, 0 replies; 138+ messages in thread
From: Benjamin Herrenschmidt @ 2011-07-18  7:01 UTC (permalink / raw)
  To: Shan Hai
  Cc: Peter Zijlstra, Peter Zijlstra, paulus, tglx, walken, dhowells,
	cmetcalf, tony.luck, akpm, linuxppc-dev, linux-kernel

On Mon, 2011-07-18 at 14:48 +0800, Shan Hai wrote:

> It could not fix the problem, refer the following reply for
> the reasons.

 .../...

> > diff --git a/kernel/futex.c b/kernel/futex.c
> > index fe28dc2..02adff7 100644
> > --- a/kernel/futex.c
> > +++ b/kernel/futex.c
> > @@ -355,8 +355,8 @@ static int fault_in_user_writeable(u32 __user *uaddr)
> >   	int ret;
> >
> >   	down_read(&mm->mmap_sem);
> > -	ret = get_user_pages(current, mm, (unsigned long)uaddr,
> > -			     1, 1, 0, NULL, NULL);
> > +	ret = __get_user_pages(current, mm, (unsigned long)uaddr, 1,
> > +			       FOLL_WRITE | FOLL_FIXFAULT, NULL, NULL, NULL);
> 
> the FOLL_FIXFAULT is filtered out at the following code
> get_user_pages()
>      if (write)
>                  flags |= FOLL_WRITE;
> 

I'm not sure what you're talking about here, you may notice that I'm
calling __get_user_pages() not get_user_pages(). Make sure you get my
-second- post of the patch (the one with a proper description & s-o-b)
since the first one was a mis-send of a WIP version.

> > +
> > +	if (flags&  FOLL_FIXFAULT)
> > +		handle_pte_sw_young_dirty(vma, address, ptep,
> > +					  flags&  FOLL_WRITE);
> >   	if (flags&  FOLL_TOUCH) {
> >   		if ((flags&  FOLL_WRITE)&&
> >   		!pte_dirty(pte)&&  !PageDirty(page))
> 
> call handle_pte_sw_young_dirty before !pte_dirty(pte)
> might has problems.

No this is on purpose. 

My initial patch was only calling it under the same condition as the
FOLL_TOUCH case, but I got concerned by this whole
flush_tlb_fix_spurious_fault() business.

Basically, our generic code is designed so that relaxing write
protection on a PTE can be done without flushing the TLB on all CPUs, so
that a "spurious" fault on a secondary CPU will flush the TLB at that
point.

I don't know which arch relies on this feature (ARM maybe ?) but if we
are going to be semantically equivalent to a real fault, we must also do
that, so the right thing to do here is to always call in there if
FOLL_FIXFAULT is set.

It's up to the caller to only set FOLL_FIXFAULT when really trying to
deal with an -EFAULT, to avoid possible unnecessary overhead, but in
this case I think we are fine, this is all a fallback slow path.

 .../...

> So what about the following?
> diff --git a/mm/memory.c b/mm/memory.c
> index 9b8a01d..fb48122 100644
> --- a/mm/memory.c
> +++ b/mm/memory.c
> @@ -1442,6 +1442,7 @@ struct page *follow_page(struct vm_area_struct 
> *vma, unsig
>          spinlock_t *ptl;
>          struct page *page;
>          struct mm_struct *mm = vma->vm_mm;
> +       int fix_write_permission = false;
> 
>          page = follow_huge_addr(mm, address, flags & FOLL_WRITE);
>          if (!IS_ERR(page)) {
> @@ -1519,6 +1520,11 @@ split_fallthrough:
>                  if ((flags & FOLL_WRITE) &&
>                      !pte_dirty(pte) && !PageDirty(page))
>                          set_page_dirty(page);
> +
> +#ifdef CONFIG_FIXUP_WRITE_PERMISSION
> +               if ((flags & FOLL_WRITE) && !pte_dirty(pte))
> +                       fix_write_permission = true;
> +#endif
>                  /*
>                   * pte_mkyoung() would be more correct here, but atomic 
> care
>                   * is needed to avoid losing the dirty bit: it is 
> easier to use
> @@ -1551,7 +1557,7 @@ split_fallthrough:
>   unlock:
>          pte_unmap_unlock(ptep, ptl);
>   out:
> -       return page;
> +       return (fix_write_permission == true) ? NULL: page;
> 
>   bad_page:
>          pte_unmap_unlock(ptep, ptl);

Your patch is not only uglier (more ifdefs) but also incomplete, since it
doesn't handle the young case and it doesn't handle the spurious fault
case either.

What the futex code is trying to do is use gup() as a way to fixup from
a fault which means essentially to have the -exact- same semantics as a
normal fault would have.

Thus by factoring the common fault fixup code and using that exact same
code in gup(), we get a much more robust guarantee that this will work
in the long run.

I don't expect gup to be that commonly used to fix up an access after an
attempt at doing a user access with page faults disabled; only those
cases will need to be modified to use the new flag.

>  From the CONFIG_FIXUP_WRITE_PERMISSION and
> (flags & FOLL_WRITE) && !pte_dirty(pte) the follow_page()
> could figure out that the caller want to write to the
> (present && writable && non-dirty) pte, and the architecture
> want to fixup the problem by indicating CONFIG_FIXUP_WRITE_PERMISSION,
> so let the follow_page() return NULL to the __get_user_pages, and
> let the handle_mm_fault to fixup dirty/young tracking.
> 
> Checking the following code we can conclude that the handle_mm_fault
> is ready to handle the faults and the write permission violation is
> a kind of fault, so why don't we let the handle_mm_fault to
> handle that fault?
> 
> __get_user_pages()
>       while (!(page = follow_page(vma, start, foll_flags))) {
>          ...
>          ret = handle_mm_fault(mm, vma, start,
>                                                          fault_flags);
>          ...
>      }

Cheers,
Ben.



^ permalink raw reply	[flat|nested] 138+ messages in thread

* Re: [PATCH 1/1] Fixup write permission of TLB on powerpc e500 core
@ 2011-07-18  7:01                     ` Benjamin Herrenschmidt
  0 siblings, 0 replies; 138+ messages in thread
From: Benjamin Herrenschmidt @ 2011-07-18  7:01 UTC (permalink / raw)
  To: Shan Hai
  Cc: tony.luck, Peter Zijlstra, Peter Zijlstra, linux-kernel,
	cmetcalf, dhowells, paulus, tglx, walken, linuxppc-dev, akpm

On Mon, 2011-07-18 at 14:48 +0800, Shan Hai wrote:

> It could not fix the problem, refer the following reply for
> the reasons.

 .../...

> > diff --git a/kernel/futex.c b/kernel/futex.c
> > index fe28dc2..02adff7 100644
> > --- a/kernel/futex.c
> > +++ b/kernel/futex.c
> > @@ -355,8 +355,8 @@ static int fault_in_user_writeable(u32 __user *uaddr)
> >   	int ret;
> >
> >   	down_read(&mm->mmap_sem);
> > -	ret = get_user_pages(current, mm, (unsigned long)uaddr,
> > -			     1, 1, 0, NULL, NULL);
> > +	ret = __get_user_pages(current, mm, (unsigned long)uaddr, 1,
> > +			       FOLL_WRITE | FOLL_FIXFAULT, NULL, NULL, NULL);
> 
> the FOLL_FIXFAULT is filtered out at the following code
> get_user_pages()
>      if (write)
>                  flags |= FOLL_WRITE;
> 

I'm not sure what you're talking about here, you may notice that I'm
calling __get_user_pages() not get_user_pages(). Make sure you get my
-second- post of the patch (the one with a proper description & s-o-b)
since the first one was a mis-send of a WIP version.

> > +
> > +	if (flags&  FOLL_FIXFAULT)
> > +		handle_pte_sw_young_dirty(vma, address, ptep,
> > +					  flags&  FOLL_WRITE);
> >   	if (flags&  FOLL_TOUCH) {
> >   		if ((flags&  FOLL_WRITE)&&
> >   		!pte_dirty(pte)&&  !PageDirty(page))
> 
> call handle_pte_sw_young_dirty before !pte_dirty(pte)
> might has problems.

No this is on purpose. 

My initial patch was only calling it under the same condition as the
FOLL_TOUCH case, but I got concerned by this whole
flush_tlb_fix_spurious_fault() business.

Basically, our generic code is designed so that relaxing write
protection on a PTE can be done without flushing the TLB on all CPUs, so
that a "spurious" fault on a secondary CPU will flush the TLB at that
point.

I don't know which arch relies on this feature (ARM maybe ?) but if we
are going to be semantically equivalent to a real fault, we must also do
that, so the right thing to do here is to always call in there if
FOLL_FIXFAULT is set.

It's up to the caller to only set FOLL_FIXFAULT when really trying to
deal with an -EFAULT, to avoid possible unnecessary overhead, but in
this case I think we are fine, this is all a fallback slow path.

 .../...

> So what about the following?
> diff --git a/mm/memory.c b/mm/memory.c
> index 9b8a01d..fb48122 100644
> --- a/mm/memory.c
> +++ b/mm/memory.c
> @@ -1442,6 +1442,7 @@ struct page *follow_page(struct vm_area_struct 
> *vma, unsig
>          spinlock_t *ptl;
>          struct page *page;
>          struct mm_struct *mm = vma->vm_mm;
> +       int fix_write_permission = false;
> 
>          page = follow_huge_addr(mm, address, flags & FOLL_WRITE);
>          if (!IS_ERR(page)) {
> @@ -1519,6 +1520,11 @@ split_fallthrough:
>                  if ((flags & FOLL_WRITE) &&
>                      !pte_dirty(pte) && !PageDirty(page))
>                          set_page_dirty(page);
> +
> +#ifdef CONFIG_FIXUP_WRITE_PERMISSION
> +               if ((flags & FOLL_WRITE) && !pte_dirty(pte))
> +                       fix_write_permission = true;
> +#endif
>                  /*
>                   * pte_mkyoung() would be more correct here, but atomic 
> care
>                   * is needed to avoid losing the dirty bit: it is 
> easier to use
> @@ -1551,7 +1557,7 @@ split_fallthrough:
>   unlock:
>          pte_unmap_unlock(ptep, ptl);
>   out:
> -       return page;
> +       return (fix_write_permission == true) ? NULL: page;
> 
>   bad_page:
>          pte_unmap_unlock(ptep, ptl);

Your patch is not only uglier (more ifdefs) but also incomplete, since it
doesn't handle the young case and it doesn't handle the spurious fault
case either.

What the futex code is trying to do is use gup() as a way to fixup from
a fault which means essentially to have the -exact- same semantics as a
normal fault would have.

Thus by factoring the common fault fixup code and using that exact same
code in gup(), we get a much more robust guarantee that this will work
in the long run.

I don't expect gup to be that commonly used to fix up an access after an
attempt at doing a user access with page faults disabled; only those
cases will need to be modified to use the new flag.

>  From the CONFIG_FIXUP_WRITE_PERMISSION and
> (flags & FOLL_WRITE) && !pte_dirty(pte) the follow_page()
> could figure out that the caller want to write to the
> (present && writable && non-dirty) pte, and the architecture
> want to fixup the problem by indicating CONFIG_FIXUP_WRITE_PERMISSION,
> so let the follow_page() return NULL to the __get_user_pages, and
> let the handle_mm_fault to fixup dirty/young tracking.
> 
> Checking the following code we can conclude that the handle_mm_fault
> is ready to handle the faults and the write permission violation is
> a kind of fault, so why don't we let the handle_mm_fault to
> handle that fault?
> 
> __get_user_pages()
>       while (!(page = follow_page(vma, start, foll_flags))) {
>          ...
>          ret = handle_mm_fault(mm, vma, start,
>                                                          fault_flags);
>          ...
>      }

Cheers,
Ben.

^ permalink raw reply	[flat|nested] 138+ messages in thread

* Re: [PATCH 1/1] Fixup write permission of TLB on powerpc e500 core
  2011-07-18  7:01                     ` Benjamin Herrenschmidt
@ 2011-07-18  7:26                       ` Shan Hai
  -1 siblings, 0 replies; 138+ messages in thread
From: Shan Hai @ 2011-07-18  7:26 UTC (permalink / raw)
  To: Benjamin Herrenschmidt
  Cc: Peter Zijlstra, Peter Zijlstra, paulus, tglx, walken, dhowells,
	cmetcalf, tony.luck, akpm, linuxppc-dev, linux-kernel

On 07/18/2011 03:01 PM, Benjamin Herrenschmidt wrote:
> On Mon, 2011-07-18 at 14:48 +0800, Shan Hai wrote:
>
>> It could not fix the problem, refer the following reply for
>> the reasons.
>   .../...
>
>>> diff --git a/kernel/futex.c b/kernel/futex.c
>>> index fe28dc2..02adff7 100644
>>> --- a/kernel/futex.c
>>> +++ b/kernel/futex.c
>>> @@ -355,8 +355,8 @@ static int fault_in_user_writeable(u32 __user *uaddr)
>>>    	int ret;
>>>
>>>    	down_read(&mm->mmap_sem);
>>> -	ret = get_user_pages(current, mm, (unsigned long)uaddr,
>>> -			     1, 1, 0, NULL, NULL);
>>> +	ret = __get_user_pages(current, mm, (unsigned long)uaddr, 1,
>>> +			       FOLL_WRITE | FOLL_FIXFAULT, NULL, NULL, NULL);
>> the FOLL_FIXFAULT is filtered out at the following code
>> get_user_pages()
>>       if (write)
>>                   flags |= FOLL_WRITE;
>>
> I'm not sure what you're talking about here, you may notice that I'm
> calling __get_user_pages() not get_user_pages(). Make sure you get my
> -second- post of the patch (the one with a proper description&  s-o-b)
> since the first one was a mis-send of an wip version.
>

I am sorry, I hadn't tried your newer patch. I have now tried it, but it
still does not work in my test environment. I will dig into it and tell
you later why it failed.

>>> +
>>> +	if (flags&   FOLL_FIXFAULT)
>>> +		handle_pte_sw_young_dirty(vma, address, ptep,
>>> +					  flags&   FOLL_WRITE);
>>>    	if (flags&   FOLL_TOUCH) {
>>>    		if ((flags&   FOLL_WRITE)&&
>>>    		!pte_dirty(pte)&&   !PageDirty(page))
>> call handle_pte_sw_young_dirty before !pte_dirty(pte)
>> might has problems.
> No this is on purpose.
>
> My initial patch was only calling it under the same condition as the
> FOLL_TOUCH case, but I got concerned by this whole
> flush_tlb_fix_spurious_fault() business.
>
> Basically, our generic code is designed so that relaxing write
> protection on a PTE can be done without flushing the TLB on all CPUs, so
> that a "spurrious" fault on a secondary CPU will flush the TLB at that
> point.
>
> I don't know which arch relies on this feature (ARM maybe ?) but if we
> are going to be semantically equivalent to a real fault, we must also do
> that, so the right thing to do here is to always call in there if
> FOLL_FIXFAULT is set.
>
> It's up to the caller to only set FOLL_FIXFAULT when really trying to
> deal with an -EFAULT, to avoid possible unnecessary overhead, but in
> this case I think we are fine, this is all a fallback slow path.
>
>   .../...
>
>> So what about the following?
>> diff --git a/mm/memory.c b/mm/memory.c
>> index 9b8a01d..fb48122 100644
>> --- a/mm/memory.c
>> +++ b/mm/memory.c
>> @@ -1442,6 +1442,7 @@ struct page *follow_page(struct vm_area_struct
>> *vma, unsig
>>           spinlock_t *ptl;
>>           struct page *page;
>>           struct mm_struct *mm = vma->vm_mm;
>> +       int fix_write_permission = false;
>>
>>           page = follow_huge_addr(mm, address, flags&  FOLL_WRITE);
>>           if (!IS_ERR(page)) {
>> @@ -1519,6 +1520,11 @@ split_fallthrough:
>>                   if ((flags&  FOLL_WRITE)&&
>>                       !pte_dirty(pte)&&  !PageDirty(page))
>>                           set_page_dirty(page);
>> +
>> +#ifdef CONFIG_FIXUP_WRITE_PERMISSION
>> +               if ((flags&  FOLL_WRITE)&&  !pte_dirty(pte))
>> +                       fix_write_permission = true;
>> +#endif
>>                   /*
>>                    * pte_mkyoung() would be more correct here, but atomic
>> care
>>                    * is needed to avoid losing the dirty bit: it is
>> easier to use
>> @@ -1551,7 +1557,7 @@ split_fallthrough:
>>    unlock:
>>           pte_unmap_unlock(ptep, ptl);
>>    out:
>> -       return page;
>> +       return (fix_write_permission == true) ? NULL: page;
>>
>>    bad_page:
>>           pte_unmap_unlock(ptep, ptl);
> You patch not only is uglier (more ifdef's) but also incomplete since it
> doesn't handle the young case and it doesn't handle the spurious fault
> case either.
>

Yep, I know having lots of ifdefs everywhere is not so good,
but if we had some other way (I don't know of one so far) to
figure out that the arch needs to fix up the write permission,
we could eradicate the ugly ifdefs here.

I think handle_mm_fault could do all the dirty/young tracking,
because the purpose of making follow_page return NULL to
its caller is to get handle_mm_fault called
on a write permission protection fault.

Thanks
Shan Hai

> What the futex code is trying to do is use gup() as a way to fixup from
> a fault which means essentially to have the -exact- same semantics as a
> normal fault would have.
>
> Thus by factoring the common fault fixup code and using that exact same
> code in gup(), we get a much more robust guarantee that this will work
> in the long run.
>
> I don't expect gup to be that commonly used to fixup access after an
> attempt at doing a user access with page faults disabled, only those
> case will need to be modified to use the new flag.
>
>>    From the CONFIG_FIXUP_WRITE_PERMISSION and
>> (flags&  FOLL_WRITE)&&  !pte_dirty(pte) the follow_page()
>> could figure out that the caller want to write to the
>> (present&&  writable&&  non-dirty) pte, and the architecture
>> want to fixup the problem by indicating CONFIG_FIXUP_WRITE_PERMISSION,
>> so let the follow_page() return NULL to the __get_user_pages, and
>> let the handle_mm_fault to fixup dirty/young tracking.
>>
>> Checking the following code we can conclude that the handle_mm_fault
>> is ready to handle the faults and the write permission violation is
>> a kind of fault, so why don't we let the handle_mm_fault to
>> handle that fault?
>>
>> __get_user_pages()
>>        while (!(page = follow_page(vma, start, foll_flags))) {
>>           ...
>>           ret = handle_mm_fault(mm, vma, start,
>>                                                           fault_flags);
>>           ...
>>       }
> Cheers,
> Ben.
>
>


^ permalink raw reply	[flat|nested] 138+ messages in thread

* Re: [PATCH 1/1] Fixup write permission of TLB on powerpc e500 core
@ 2011-07-18  7:26                       ` Shan Hai
  0 siblings, 0 replies; 138+ messages in thread
From: Shan Hai @ 2011-07-18  7:26 UTC (permalink / raw)
  To: Benjamin Herrenschmidt
  Cc: tony.luck, Peter Zijlstra, Peter Zijlstra, linux-kernel,
	cmetcalf, dhowells, paulus, tglx, walken, linuxppc-dev, akpm

On 07/18/2011 03:01 PM, Benjamin Herrenschmidt wrote:
> On Mon, 2011-07-18 at 14:48 +0800, Shan Hai wrote:
>
>> It could not fix the problem, refer the following reply for
>> the reasons.
>   .../...
>
>>> diff --git a/kernel/futex.c b/kernel/futex.c
>>> index fe28dc2..02adff7 100644
>>> --- a/kernel/futex.c
>>> +++ b/kernel/futex.c
>>> @@ -355,8 +355,8 @@ static int fault_in_user_writeable(u32 __user *uaddr)
>>>    	int ret;
>>>
>>>    	down_read(&mm->mmap_sem);
>>> -	ret = get_user_pages(current, mm, (unsigned long)uaddr,
>>> -			     1, 1, 0, NULL, NULL);
>>> +	ret = __get_user_pages(current, mm, (unsigned long)uaddr, 1,
>>> +			       FOLL_WRITE | FOLL_FIXFAULT, NULL, NULL, NULL);
>> the FOLL_FIXFAULT is filtered out at the following code
>> get_user_pages()
>>       if (write)
>>                   flags |= FOLL_WRITE;
>>
> I'm not sure what you're talking about here, you may notice that I'm
> calling __get_user_pages() not get_user_pages(). Make sure you get my
> -second- post of the patch (the one with a proper description&  s-o-b)
> since the first one was a mis-send of an wip version.
>

I am sorry, I hadn't tried your newer patch. I have now tried it, but it
still does not work in my test environment. I will dig into it and tell
you later why it failed.

>>> +
>>> +	if (flags&   FOLL_FIXFAULT)
>>> +		handle_pte_sw_young_dirty(vma, address, ptep,
>>> +					  flags&   FOLL_WRITE);
>>>    	if (flags&   FOLL_TOUCH) {
>>>    		if ((flags&   FOLL_WRITE)&&
>>>    		!pte_dirty(pte)&&   !PageDirty(page))
>> call handle_pte_sw_young_dirty before !pte_dirty(pte)
>> might has problems.
> No this is on purpose.
>
> My initial patch was only calling it under the same condition as the
> FOLL_TOUCH case, but I got concerned by this whole
> flush_tlb_fix_spurious_fault() business.
>
> Basically, our generic code is designed so that relaxing write
> protection on a PTE can be done without flushing the TLB on all CPUs, so
> that a "spurrious" fault on a secondary CPU will flush the TLB at that
> point.
>
> I don't know which arch relies on this feature (ARM maybe ?) but if we
> are going to be semantically equivalent to a real fault, we must also do
> that, so the right thing to do here is to always call in there if
> FOLL_FIXFAULT is set.
>
> It's up to the caller to only set FOLL_FIXFAULT when really trying to
> deal with an -EFAULT, to avoid possible unnecessary overhead, but in
> this case I think we are fine, this is all a fallback slow path.
>
>   .../...
>
>> So what about the following?
>> diff --git a/mm/memory.c b/mm/memory.c
>> index 9b8a01d..fb48122 100644
>> --- a/mm/memory.c
>> +++ b/mm/memory.c
>> @@ -1442,6 +1442,7 @@ struct page *follow_page(struct vm_area_struct
>> *vma, unsig
>>           spinlock_t *ptl;
>>           struct page *page;
>>           struct mm_struct *mm = vma->vm_mm;
>> +       int fix_write_permission = false;
>>
>>           page = follow_huge_addr(mm, address, flags&  FOLL_WRITE);
>>           if (!IS_ERR(page)) {
>> @@ -1519,6 +1520,11 @@ split_fallthrough:
>>                   if ((flags&  FOLL_WRITE)&&
>>                       !pte_dirty(pte)&&  !PageDirty(page))
>>                           set_page_dirty(page);
>> +
>> +#ifdef CONFIG_FIXUP_WRITE_PERMISSION
>> +               if ((flags&  FOLL_WRITE)&&  !pte_dirty(pte))
>> +                       fix_write_permission = true;
>> +#endif
>>                   /*
>>                    * pte_mkyoung() would be more correct here, but atomic
>> care
>>                    * is needed to avoid losing the dirty bit: it is
>> easier to use
>> @@ -1551,7 +1557,7 @@ split_fallthrough:
>>    unlock:
>>           pte_unmap_unlock(ptep, ptl);
>>    out:
>> -       return page;
>> +       return (fix_write_permission == true) ? NULL: page;
>>
>>    bad_page:
>>           pte_unmap_unlock(ptep, ptl);
> You patch not only is uglier (more ifdef's) but also incomplete since it
> doesn't handle the young case and it doesn't handle the spurious fault
> case either.
>

Yep, I know having lots of ifdefs everywhere is not so good,
but if we had some other way (I don't know of one so far) to
figure out that the arch needs to fix up the write permission,
we could eradicate the ugly ifdefs here.

I think handle_mm_fault could do all the dirty/young tracking,
because the purpose of making follow_page return NULL to
its caller is to get handle_mm_fault called
on a write permission protection fault.

Thanks
Shan Hai

> What the futex code is trying to do is use gup() as a way to fixup from
> a fault which means essentially to have the -exact- same semantics as a
> normal fault would have.
>
> Thus by factoring the common fault fixup code and using that exact same
> code in gup(), we get a much more robust guarantee that this will work
> in the long run.
>
> I don't expect gup to be that commonly used to fixup access after an
> attempt at doing a user access with page faults disabled, only those
> case will need to be modified to use the new flag.
>
>>    From the CONFIG_FIXUP_WRITE_PERMISSION and
>> (flags&  FOLL_WRITE)&&  !pte_dirty(pte) the follow_page()
>> could figure out that the caller want to write to the
>> (present&&  writable&&  non-dirty) pte, and the architecture
>> want to fixup the problem by indicating CONFIG_FIXUP_WRITE_PERMISSION,
>> so let the follow_page() return NULL to the __get_user_pages, and
>> let the handle_mm_fault to fixup dirty/young tracking.
>>
>> Checking the following code we can conclude that the handle_mm_fault
>> is ready to handle the faults and the write permission violation is
>> a kind of fault, so why don't we let the handle_mm_fault to
>> handle that fault?
>>
>> __get_user_pages()
>>        while (!(page = follow_page(vma, start, foll_flags))) {
>>           ...
>>           ret = handle_mm_fault(mm, vma, start,
>>                                                           fault_flags);
>>           ...
>>       }
> Cheers,
> Ben.
>
>

^ permalink raw reply	[flat|nested] 138+ messages in thread

* Re: [PATCH 1/1] Fixup write permission of TLB on powerpc e500 core
  2011-07-18  7:26                       ` Shan Hai
@ 2011-07-18  7:36                         ` Benjamin Herrenschmidt
  -1 siblings, 0 replies; 138+ messages in thread
From: Benjamin Herrenschmidt @ 2011-07-18  7:36 UTC (permalink / raw)
  To: Shan Hai
  Cc: Peter Zijlstra, Peter Zijlstra, paulus, tglx, walken, dhowells,
	cmetcalf, tony.luck, akpm, linuxppc-dev, linux-kernel

On Mon, 2011-07-18 at 15:26 +0800, Shan Hai wrote:
> 
> I am sorry I hadn't tried your newer patch, I tried it but it still 
> could not work in my test environment, I will dig into and tell you
> why that failed later.

Ok, please let me know what you find !

> Yep, I know holding lots of ifdef's everywhere is not so good,
> but if we have some other way(I don't know how till now) to
> figure out the arch has the need to fixup up the write permission
> we could eradicate the ugly ifdef's here.
> 
> I think the handle_mm_fault could do all dirty/young tracking,
> because the purpose of making follow_page return NULL to
> its caller is that want to the handle_mm_fault to be called
> on write permission protection fault.

I see your point. Rather than factoring the fixup code out, we could
force gup to call handle_mm_fault()... that makes sense.

However, I don't think we should special case archs. There are plenty of
cases where we don't care about this fixup even on archs that do SW
tracking of dirty and young. For example, when gup is used for
subsequent DMA.

Only the (rare ?) cases where it's used as a means to fix up a failing
"atomic" user access are relevant.

So I believe we should still pass an explicit flag to __get_user_pages(),
as I proposed, to activate that behaviour.

At this point, since we have isolated the special case callers, I think
we are pretty much in a situation where there's no point trying to
optimize the x86 case more, it's a fairly slow path anyway, and so no
ifdef should be needed (and x86 already #define out the TLB flush for
spurious faults in handle_pte_fault today).

We don't even need to change follow_page()... we just don't call it the
first time around.

I'll cook up another patch later but first we need to find out why the
one you have doesn't work. There might be another problem lurking (or I
just made a stupid mistake).

BTW. Can you give me some details about how you reproduce the problem ?
I should setup something on a booke machine here to verify things.

Cheers,
Ben.


^ permalink raw reply	[flat|nested] 138+ messages in thread

* Re: [PATCH 1/1] Fixup write permission of TLB on powerpc e500 core
@ 2011-07-18  7:36                         ` Benjamin Herrenschmidt
  0 siblings, 0 replies; 138+ messages in thread
From: Benjamin Herrenschmidt @ 2011-07-18  7:36 UTC (permalink / raw)
  To: Shan Hai
  Cc: tony.luck, Peter Zijlstra, Peter Zijlstra, linux-kernel,
	cmetcalf, dhowells, paulus, tglx, walken, linuxppc-dev, akpm

On Mon, 2011-07-18 at 15:26 +0800, Shan Hai wrote:
> 
> I am sorry I hadn't tried your newer patch, I tried it but it still 
> could not work in my test environment, I will dig into and tell you
> why that failed later.

Ok, please let me know what you find !

> Yep, I know holding lots of ifdef's everywhere is not so good,
> but if we have some other way(I don't know how till now) to
> figure out the arch has the need to fixup up the write permission
> we could eradicate the ugly ifdef's here.
> 
> I think the handle_mm_fault could do all dirty/young tracking,
> because the purpose of making follow_page return NULL to
> its caller is that want to the handle_mm_fault to be called
> on write permission protection fault.

I see your point. Rather than factoring the fixup code out, we could
force gup to call handle_mm_fault()... that makes sense.

However, I don't think we should special case archs. There are plenty of
cases where we don't care about this fixup even on archs that do SW
tracking of dirty and young. For example, when gup is used for
subsequent DMA.

Only the (rare ?) cases where it's used as a means to fix up a failing
"atomic" user access are relevant.

So I believe we should still pass an explicit flag to __get_user_pages(),
as I proposed, to activate that behaviour.

At this point, since we have isolated the special case callers, I think
we are pretty much in a situation where there's no point trying to
optimize the x86 case more, it's a fairly slow path anyway, and so no
ifdef should be needed (and x86 already #define out the TLB flush for
spurious faults in handle_pte_fault today).

We don't even need to change follow_page()... we just don't call it the
first time around.

I'll cook up another patch later but first we need to find out why the
one you have doesn't work. There might be another problem lurking (or I
just made a stupid mistake).

BTW. Can you give me some details about how you reproduce the problem ?
I should setup something on a booke machine here to verify things.

Cheers,
Ben.

^ permalink raw reply	[flat|nested] 138+ messages in thread

* Re: [PATCH 1/1] Fixup write permission of TLB on powerpc e500 core
  2011-07-18  7:36                         ` Benjamin Herrenschmidt
@ 2011-07-18  7:50                           ` Shan Hai
  -1 siblings, 0 replies; 138+ messages in thread
From: Shan Hai @ 2011-07-18  7:50 UTC (permalink / raw)
  To: Benjamin Herrenschmidt
  Cc: Peter Zijlstra, Peter Zijlstra, paulus, tglx, walken, dhowells,
	cmetcalf, tony.luck, akpm, linuxppc-dev, linux-kernel

[-- Attachment #1: Type: text/plain, Size: 2514 bytes --]

On 07/18/2011 03:36 PM, Benjamin Herrenschmidt wrote:
> On Mon, 2011-07-18 at 15:26 +0800, Shan Hai wrote:
>> I am sorry I hadn't tried your newer patch, I tried it but it still
>> could not work in my test environment, I will dig into and tell you
>> why that failed later.
> Ok, please let me know what you find !
>
>> Yep, I know holding lots of ifdef's everywhere is not so good,
>> but if we have some other way(I don't know how till now) to
>> figure out the arch has the need to fixup up the write permission
>> we could eradicate the ugly ifdef's here.
>>
>> I think the handle_mm_fault could do all dirty/young tracking,
>> because the purpose of making follow_page return NULL to
>> its caller is that want to the handle_mm_fault to be called
>> on write permission protection fault.
> I see your point. Rather than factoring the fixup code out, we could
> force gup to call handle_mm_fault()... that makes sense.
>
> However, I don't think we should special case archs. There's plenty of
> cases where we don't care about this fixup even on archs that do SW
> tracking of dirty and young. For example when gup is using for
> subsequent DMA.
>
> Only the (rare ?) cases where it's used as a mean to fixup a failing
> "atomic" user access are relevant.
>
> So I believe we should still pass an explicit flag to __get_user_pages()
> as I propose to activate that behaviour.
>
> At this point, since we have isolated the special case callers, I think
> we are pretty much in a situation where there's no point trying to
> optimize the x86 case more, it's a fairly slow path anyway, and so no
> ifdef should be needed (and x86 already #define out the TLB flush for
> spurious faults in handle_pte_fault today).
>
> We don't even need to change follow_page()... we just don't call it the
> first time around.
>
> I'll cook up another patch later but first we need to find out why the
> one you have doesn't work. There might be another problem lurking (or I
> just made a stupid mistake).
>
> BTW. Can you give me some details about how you reproduce the problem ?
> I should setup something on a booke machine here to verify things.
>

Please get the test case code from the thread
"[PATCH 0/1] Fixup write permission of TLB on powerpc e500 core"'s
attachment, simply compile it and do the following,
- run the test case on the board
- run 'top' on the other terminal, you should observe almost
     100% CPU system usage

I also attached the kernel config file.

Best regards
Shan Hai


> Cheers,
> Ben.
>


[-- Attachment #2: config --]
[-- Type: text/plain, Size: 76255 bytes --]

#
# Automatically generated make config: don't edit
# Linux/powerpc 3.0.0-rc6 Kernel Configuration
#
# CONFIG_PPC64 is not set

#
# Processor support
#
# CONFIG_PPC_BOOK3S_32 is not set
CONFIG_PPC_85xx=y
# CONFIG_PPC_8xx is not set
# CONFIG_40x is not set
# CONFIG_44x is not set
# CONFIG_E200 is not set
CONFIG_E500=y
# CONFIG_PPC_E500MC is not set
CONFIG_FSL_EMB_PERFMON=y
CONFIG_FSL_EMB_PERF_EVENT=y
CONFIG_FSL_EMB_PERF_EVENT_E500=y
CONFIG_BOOKE=y
CONFIG_FSL_BOOKE=y
CONFIG_PPC_FSL_BOOK3E=y
# CONFIG_PHYS_64BIT is not set
CONFIG_SPE=y
CONFIG_PPC_MMU_NOHASH=y
CONFIG_PPC_MMU_NOHASH_32=y
CONFIG_PPC_BOOK3E_MMU=y
# CONFIG_PPC_MM_SLICES is not set
CONFIG_SMP=y
CONFIG_NR_CPUS=255
CONFIG_PPC32=y
CONFIG_32BIT=y
CONFIG_WORD_SIZE=32
# CONFIG_ARCH_PHYS_ADDR_T_64BIT is not set
# CONFIG_ARCH_DMA_ADDR_T_64BIT is not set
CONFIG_MMU=y
CONFIG_GENERIC_CMOS_UPDATE=y
CONFIG_GENERIC_TIME_VSYSCALL=y
CONFIG_GENERIC_CLOCKEVENTS=y
# CONFIG_HAVE_SETUP_PER_CPU_AREA is not set
# CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK is not set
CONFIG_NR_IRQS=512
CONFIG_STACKTRACE_SUPPORT=y
CONFIG_HAVE_LATENCYTOP_SUPPORT=y
CONFIG_TRACE_IRQFLAGS_SUPPORT=y
CONFIG_LOCKDEP_SUPPORT=y
CONFIG_RWSEM_XCHGADD_ALGORITHM=y
CONFIG_ARCH_HAS_ILOG2_U32=y
CONFIG_GENERIC_HWEIGHT=y
# CONFIG_ARCH_NO_VIRT_TO_BUS is not set
CONFIG_PPC=y
CONFIG_EARLY_PRINTK=y
CONFIG_GENERIC_NVRAM=y
CONFIG_SCHED_OMIT_FRAME_POINTER=y
CONFIG_ARCH_MAY_HAVE_PC_FDC=y
CONFIG_PPC_OF=y
CONFIG_PPC_UDBG_16550=y
CONFIG_GENERIC_TBSYNC=y
CONFIG_AUDIT_ARCH=y
CONFIG_GENERIC_BUG=y
# CONFIG_EPAPR_BOOT is not set
CONFIG_DEFAULT_UIMAGE=y
CONFIG_ARCH_HIBERNATION_POSSIBLE=y
# CONFIG_PPC_DCR_NATIVE is not set
# CONFIG_PPC_DCR_MMIO is not set
CONFIG_ARCH_SUPPORTS_DEBUG_PAGEALLOC=y
CONFIG_PPC_ADV_DEBUG_REGS=y
CONFIG_PPC_ADV_DEBUG_IACS=2
CONFIG_PPC_ADV_DEBUG_DACS=2
CONFIG_PPC_ADV_DEBUG_DVCS=0
CONFIG_DEFCONFIG_LIST="/lib/modules/$UNAME_RELEASE/.config"
CONFIG_HAVE_IRQ_WORK=y
CONFIG_IRQ_WORK=y

#
# General setup
#
CONFIG_EXPERIMENTAL=y
CONFIG_INIT_ENV_ARG_LIMIT=32
CONFIG_CROSS_COMPILE=""
CONFIG_LOCALVERSION=""
# CONFIG_LOCALVERSION_AUTO is not set
CONFIG_DEFAULT_HOSTNAME="(none)"
CONFIG_SWAP=y
CONFIG_SYSVIPC=y
CONFIG_SYSVIPC_SYSCTL=y
CONFIG_POSIX_MQUEUE=y
CONFIG_POSIX_MQUEUE_SYSCTL=y
CONFIG_BSD_PROCESS_ACCT=y
# CONFIG_BSD_PROCESS_ACCT_V3 is not set
# CONFIG_FHANDLE is not set
CONFIG_TASKSTATS=y
CONFIG_TASK_DELAY_ACCT=y
# CONFIG_TASK_XACCT is not set
CONFIG_AUDIT=y
CONFIG_AUDITSYSCALL=y
CONFIG_AUDIT_WATCH=y
CONFIG_AUDIT_TREE=y
CONFIG_HAVE_GENERIC_HARDIRQS=y

#
# IRQ subsystem
#
CONFIG_GENERIC_HARDIRQS=y
CONFIG_HAVE_SPARSE_IRQ=y
CONFIG_GENERIC_IRQ_SHOW=y
CONFIG_GENERIC_IRQ_SHOW_LEVEL=y
# CONFIG_SPARSE_IRQ is not set

#
# RCU Subsystem
#
CONFIG_TREE_RCU=y
# CONFIG_PREEMPT_RCU is not set
# CONFIG_RCU_TRACE is not set
CONFIG_RCU_FANOUT=32
# CONFIG_RCU_FANOUT_EXACT is not set
# CONFIG_TREE_RCU_TRACE is not set
# CONFIG_IKCONFIG is not set
CONFIG_LOG_BUF_SHIFT=17
# CONFIG_CGROUPS is not set
CONFIG_NAMESPACES=y
CONFIG_UTS_NS=y
CONFIG_IPC_NS=y
CONFIG_USER_NS=y
CONFIG_PID_NS=y
CONFIG_NET_NS=y
# CONFIG_SCHED_AUTOGROUP is not set
# CONFIG_SYSFS_DEPRECATED is not set
CONFIG_RELAY=y
CONFIG_BLK_DEV_INITRD=y
CONFIG_INITRAMFS_SOURCE=""
CONFIG_RD_GZIP=y
CONFIG_RD_BZIP2=y
CONFIG_RD_LZMA=y
CONFIG_RD_XZ=y
CONFIG_RD_LZO=y
CONFIG_CC_OPTIMIZE_FOR_SIZE=y
CONFIG_SYSCTL=y
CONFIG_ANON_INODES=y
# CONFIG_EXPERT is not set
CONFIG_SYSCTL_SYSCALL=y
CONFIG_KALLSYMS=y
# CONFIG_KALLSYMS_ALL is not set
CONFIG_HOTPLUG=y
CONFIG_PRINTK=y
CONFIG_BUG=y
CONFIG_ELF_CORE=y
CONFIG_BASE_FULL=y
CONFIG_FUTEX=y
CONFIG_EPOLL=y
CONFIG_SIGNALFD=y
CONFIG_TIMERFD=y
CONFIG_EVENTFD=y
CONFIG_SHMEM=y
CONFIG_AIO=y
# CONFIG_EMBEDDED is not set
CONFIG_HAVE_PERF_EVENTS=y

#
# Kernel Performance Events And Counters
#
CONFIG_PERF_EVENTS=y
# CONFIG_PERF_COUNTERS is not set
# CONFIG_DEBUG_PERF_USE_VMALLOC is not set
CONFIG_VM_EVENT_COUNTERS=y
CONFIG_PCI_QUIRKS=y
CONFIG_COMPAT_BRK=y
CONFIG_SLAB=y
# CONFIG_SLUB is not set
CONFIG_PROFILING=y
CONFIG_TRACEPOINTS=y
CONFIG_OPROFILE=m
CONFIG_HAVE_OPROFILE=y
CONFIG_KPROBES=y
CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS=y
CONFIG_KRETPROBES=y
CONFIG_HAVE_IOREMAP_PROT=y
CONFIG_HAVE_KPROBES=y
CONFIG_HAVE_KRETPROBES=y
CONFIG_HAVE_ARCH_TRACEHOOK=y
CONFIG_HAVE_DMA_ATTRS=y
CONFIG_USE_GENERIC_SMP_HELPERS=y
CONFIG_HAVE_REGS_AND_STACK_ACCESS_API=y
CONFIG_HAVE_DMA_API_DEBUG=y
CONFIG_HAVE_RCU_TABLE_FREE=y

#
# GCOV-based kernel profiling
#
# CONFIG_GCOV_KERNEL is not set
# CONFIG_HAVE_GENERIC_DMA_COHERENT is not set
CONFIG_SLABINFO=y
CONFIG_RT_MUTEXES=y
CONFIG_BASE_SMALL=0
CONFIG_MODULES=y
# CONFIG_MODULE_FORCE_LOAD is not set
CONFIG_MODULE_UNLOAD=y
# CONFIG_MODULE_FORCE_UNLOAD is not set
CONFIG_MODVERSIONS=y
CONFIG_MODULE_SRCVERSION_ALL=y
CONFIG_STOP_MACHINE=y
CONFIG_BLOCK=y
CONFIG_LBDAF=y
CONFIG_BLK_DEV_BSG=y
# CONFIG_BLK_DEV_INTEGRITY is not set

#
# IO Schedulers
#
CONFIG_IOSCHED_NOOP=y
CONFIG_IOSCHED_DEADLINE=y
CONFIG_IOSCHED_CFQ=y
# CONFIG_DEFAULT_DEADLINE is not set
CONFIG_DEFAULT_CFQ=y
# CONFIG_DEFAULT_NOOP is not set
CONFIG_DEFAULT_IOSCHED="cfq"
# CONFIG_INLINE_SPIN_TRYLOCK is not set
# CONFIG_INLINE_SPIN_TRYLOCK_BH is not set
# CONFIG_INLINE_SPIN_LOCK is not set
# CONFIG_INLINE_SPIN_LOCK_BH is not set
# CONFIG_INLINE_SPIN_LOCK_IRQ is not set
# CONFIG_INLINE_SPIN_LOCK_IRQSAVE is not set
CONFIG_INLINE_SPIN_UNLOCK=y
# CONFIG_INLINE_SPIN_UNLOCK_BH is not set
CONFIG_INLINE_SPIN_UNLOCK_IRQ=y
# CONFIG_INLINE_SPIN_UNLOCK_IRQRESTORE is not set
# CONFIG_INLINE_READ_TRYLOCK is not set
# CONFIG_INLINE_READ_LOCK is not set
# CONFIG_INLINE_READ_LOCK_BH is not set
# CONFIG_INLINE_READ_LOCK_IRQ is not set
# CONFIG_INLINE_READ_LOCK_IRQSAVE is not set
CONFIG_INLINE_READ_UNLOCK=y
# CONFIG_INLINE_READ_UNLOCK_BH is not set
CONFIG_INLINE_READ_UNLOCK_IRQ=y
# CONFIG_INLINE_READ_UNLOCK_IRQRESTORE is not set
# CONFIG_INLINE_WRITE_TRYLOCK is not set
# CONFIG_INLINE_WRITE_LOCK is not set
# CONFIG_INLINE_WRITE_LOCK_BH is not set
# CONFIG_INLINE_WRITE_LOCK_IRQ is not set
# CONFIG_INLINE_WRITE_LOCK_IRQSAVE is not set
CONFIG_INLINE_WRITE_UNLOCK=y
# CONFIG_INLINE_WRITE_UNLOCK_BH is not set
CONFIG_INLINE_WRITE_UNLOCK_IRQ=y
# CONFIG_INLINE_WRITE_UNLOCK_IRQRESTORE is not set
CONFIG_MUTEX_SPIN_ON_OWNER=y
# CONFIG_FREEZER is not set
CONFIG_PPC_MSI_BITMAP=y
# CONFIG_PPC_XICS is not set
# CONFIG_PPC_ICP_NATIVE is not set
# CONFIG_PPC_ICP_HV is not set
# CONFIG_PPC_ICS_RTAS is not set

#
# Platform support
#
# CONFIG_PPC_CELL is not set
# CONFIG_PPC_CELL_NATIVE is not set
# CONFIG_PQ2ADS is not set
CONFIG_FSL_SOC_BOOKE=y
# CONFIG_MPC8540_ADS is not set
# CONFIG_MPC8560_ADS is not set
# CONFIG_MPC85xx_CDS is not set
# CONFIG_MPC85xx_MDS is not set
# CONFIG_MPC8536_DS is not set
# CONFIG_MPC85xx_DS is not set
# CONFIG_MPC85xx_RDB is not set
# CONFIG_P1022_DS is not set
# CONFIG_SOCRATES is not set
# CONFIG_KSI8560 is not set
# CONFIG_XES_MPC85xx is not set
# CONFIG_STX_GP3 is not set
# CONFIG_TQM8540 is not set
# CONFIG_TQM8541 is not set
# CONFIG_TQM8548 is not set
# CONFIG_TQM8555 is not set
# CONFIG_TQM8560 is not set
CONFIG_SBC8548=y
# CONFIG_SBC8560 is not set
# CONFIG_P3041_DS is not set
# CONFIG_P4080_DS is not set
# CONFIG_P5020_DS is not set
# CONFIG_PPC_WSP is not set
CONFIG_KVM_GUEST=y
CONFIG_PPC_SMP_MUXED_IPI=y
# CONFIG_IPIC is not set
CONFIG_MPIC=y
# CONFIG_MPIC_WEIRD is not set
# CONFIG_PPC_I8259 is not set
# CONFIG_PPC_RTAS is not set
# CONFIG_MMIO_NVRAM is not set
# CONFIG_MPIC_U3_HT_IRQS is not set
# CONFIG_PPC_MPC106 is not set
# CONFIG_PPC_970_NAP is not set
# CONFIG_PPC_P7_NAP is not set

#
# CPU Frequency scaling
#
CONFIG_CPU_FREQ=y
CONFIG_CPU_FREQ_TABLE=m
CONFIG_CPU_FREQ_STAT=m
CONFIG_CPU_FREQ_STAT_DETAILS=y
# CONFIG_CPU_FREQ_DEFAULT_GOV_PERFORMANCE is not set
CONFIG_CPU_FREQ_DEFAULT_GOV_USERSPACE=y
# CONFIG_CPU_FREQ_DEFAULT_GOV_ONDEMAND is not set
# CONFIG_CPU_FREQ_DEFAULT_GOV_CONSERVATIVE is not set
CONFIG_CPU_FREQ_GOV_PERFORMANCE=y
CONFIG_CPU_FREQ_GOV_POWERSAVE=m
CONFIG_CPU_FREQ_GOV_USERSPACE=y
CONFIG_CPU_FREQ_GOV_ONDEMAND=m
CONFIG_CPU_FREQ_GOV_CONSERVATIVE=m

#
# CPU Frequency drivers
#
# CONFIG_QUICC_ENGINE is not set
# CONFIG_CPM2 is not set
# CONFIG_FSL_ULI1575 is not set
# CONFIG_MPC8xxx_GPIO is not set
# CONFIG_SIMPLE_GPIO is not set

#
# Kernel options
#
# CONFIG_HIGHMEM is not set
# CONFIG_NO_HZ is not set
# CONFIG_HIGH_RES_TIMERS is not set
CONFIG_GENERIC_CLOCKEVENTS_BUILD=y
# CONFIG_HZ_100 is not set
# CONFIG_HZ_250 is not set
# CONFIG_HZ_300 is not set
CONFIG_HZ_1000=y
CONFIG_HZ=1000
# CONFIG_SCHED_HRTICK is not set
# CONFIG_PREEMPT_NONE is not set
CONFIG_PREEMPT_VOLUNTARY=y
# CONFIG_PREEMPT is not set
CONFIG_BINFMT_ELF=y
CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS=y
# CONFIG_HAVE_AOUT is not set
CONFIG_BINFMT_MISC=y
# CONFIG_MATH_EMULATION is not set
CONFIG_IOMMU_HELPER=y
CONFIG_SWIOTLB=y
CONFIG_ARCH_ENABLE_MEMORY_HOTPLUG=y
CONFIG_ARCH_HAS_WALK_MEMORY=y
CONFIG_ARCH_ENABLE_MEMORY_HOTREMOVE=y
CONFIG_KEXEC=y
CONFIG_CRASH_DUMP=y
# CONFIG_IRQ_ALL_CPUS is not set
CONFIG_MAX_ACTIVE_REGIONS=32
CONFIG_ARCH_FLATMEM_ENABLE=y
CONFIG_ARCH_POPULATES_NODE_MAP=y
CONFIG_SELECT_MEMORY_MODEL=y
CONFIG_FLATMEM_MANUAL=y
CONFIG_FLATMEM=y
CONFIG_FLAT_NODE_MEM_MAP=y
CONFIG_HAVE_MEMBLOCK=y
CONFIG_PAGEFLAGS_EXTENDED=y
CONFIG_SPLIT_PTLOCK_CPUS=4
# CONFIG_COMPACTION is not set
CONFIG_MIGRATION=y
# CONFIG_PHYS_ADDR_T_64BIT is not set
CONFIG_ZONE_DMA_FLAG=1
CONFIG_BOUNCE=y
CONFIG_VIRT_TO_BUS=y
# CONFIG_KSM is not set
CONFIG_DEFAULT_MMAP_MIN_ADDR=4096
# CONFIG_CLEANCACHE is not set
CONFIG_PPC_4K_PAGES=y
CONFIG_FORCE_MAX_ZONEORDER=11
# CONFIG_CMDLINE_BOOL is not set
CONFIG_EXTRA_TARGETS=""
# CONFIG_HIBERNATION is not set
# CONFIG_PM_RUNTIME is not set
# CONFIG_SECCOMP is not set
CONFIG_ISA_DMA_API=y

#
# Bus options
#
CONFIG_ZONE_DMA=y
# CONFIG_NEED_DMA_MAP_STATE is not set
CONFIG_NEED_SG_DMA_LENGTH=y
CONFIG_GENERIC_ISA_DMA=y
CONFIG_PPC_INDIRECT_PCI=y
CONFIG_FSL_SOC=y
CONFIG_FSL_PCI=y
# CONFIG_FSL_LBC is not set
CONFIG_PPC_PCI_CHOICE=y
CONFIG_PCI=y
CONFIG_PCI_DOMAINS=y
CONFIG_PCI_SYSCALL=y
CONFIG_PCIEPORTBUS=y
CONFIG_HOTPLUG_PCI_PCIE=m
CONFIG_PCIEAER=y
# CONFIG_PCIE_ECRC is not set
# CONFIG_PCIEAER_INJECT is not set
CONFIG_PCIEASPM=y
# CONFIG_PCIEASPM_DEBUG is not set
CONFIG_ARCH_SUPPORTS_MSI=y
CONFIG_PCI_MSI=y
# CONFIG_PCI_DEBUG is not set
# CONFIG_PCI_STUB is not set
# CONFIG_PCI_IOV is not set
CONFIG_PCCARD=y
CONFIG_PCMCIA=y
CONFIG_PCMCIA_LOAD_CIS=y
CONFIG_CARDBUS=y

#
# PC-card bridges
#
CONFIG_YENTA=y
CONFIG_YENTA_O2=y
CONFIG_YENTA_RICOH=y
CONFIG_YENTA_TI=y
CONFIG_YENTA_ENE_TUNE=y
CONFIG_YENTA_TOSHIBA=y
CONFIG_PD6729=m
# CONFIG_I82092 is not set
CONFIG_PCCARD_NONSTATIC=y
CONFIG_HOTPLUG_PCI=y
CONFIG_HOTPLUG_PCI_FAKE=m
# CONFIG_HOTPLUG_PCI_CPCI is not set
CONFIG_HOTPLUG_PCI_SHPC=m
# CONFIG_HAS_RAPIDIO is not set
# CONFIG_RAPIDIO is not set

#
# Advanced setup
#
# CONFIG_ADVANCED_OPTIONS is not set

#
# Default settings for advanced configuration options are used
#
CONFIG_LOWMEM_SIZE=0x30000000
CONFIG_LOWMEM_CAM_NUM=3
CONFIG_RELOCATABLE=y
CONFIG_PAGE_OFFSET=0xc0000000
CONFIG_KERNEL_START=0xc0000000
CONFIG_PHYSICAL_START=0x00000000
CONFIG_PHYSICAL_ALIGN=0x04000000
CONFIG_TASK_SIZE=0xc0000000
CONFIG_NET=y

#
# Networking options
#
CONFIG_PACKET=y
CONFIG_UNIX=y
CONFIG_XFRM=y
CONFIG_XFRM_USER=y
# CONFIG_XFRM_SUB_POLICY is not set
# CONFIG_XFRM_MIGRATE is not set
# CONFIG_XFRM_STATISTICS is not set
CONFIG_XFRM_IPCOMP=m
CONFIG_NET_KEY=m
# CONFIG_NET_KEY_MIGRATE is not set
CONFIG_INET=y
CONFIG_IP_MULTICAST=y
CONFIG_IP_ADVANCED_ROUTER=y
# CONFIG_IP_FIB_TRIE_STATS is not set
CONFIG_IP_MULTIPLE_TABLES=y
CONFIG_IP_ROUTE_MULTIPATH=y
CONFIG_IP_ROUTE_VERBOSE=y
CONFIG_IP_ROUTE_CLASSID=y
CONFIG_IP_PNP=y
CONFIG_IP_PNP_DHCP=y
CONFIG_IP_PNP_BOOTP=y
CONFIG_IP_PNP_RARP=y
CONFIG_NET_IPIP=m
# CONFIG_NET_IPGRE_DEMUX is not set
CONFIG_IP_MROUTE=y
# CONFIG_IP_MROUTE_MULTIPLE_TABLES is not set
CONFIG_IP_PIMSM_V1=y
CONFIG_IP_PIMSM_V2=y
# CONFIG_ARPD is not set
CONFIG_SYN_COOKIES=y
CONFIG_INET_AH=m
CONFIG_INET_ESP=m
CONFIG_INET_IPCOMP=m
CONFIG_INET_XFRM_TUNNEL=m
CONFIG_INET_TUNNEL=m
CONFIG_INET_XFRM_MODE_TRANSPORT=m
CONFIG_INET_XFRM_MODE_TUNNEL=m
CONFIG_INET_XFRM_MODE_BEET=y
CONFIG_INET_LRO=y
CONFIG_INET_DIAG=m
CONFIG_INET_TCP_DIAG=m
CONFIG_TCP_CONG_ADVANCED=y
CONFIG_TCP_CONG_BIC=y
CONFIG_TCP_CONG_CUBIC=m
CONFIG_TCP_CONG_WESTWOOD=m
CONFIG_TCP_CONG_HTCP=m
CONFIG_TCP_CONG_HSTCP=m
CONFIG_TCP_CONG_HYBLA=m
CONFIG_TCP_CONG_VEGAS=m
CONFIG_TCP_CONG_SCALABLE=m
CONFIG_TCP_CONG_LP=m
CONFIG_TCP_CONG_VENO=m
# CONFIG_TCP_CONG_YEAH is not set
# CONFIG_TCP_CONG_ILLINOIS is not set
CONFIG_DEFAULT_BIC=y
# CONFIG_DEFAULT_RENO is not set
CONFIG_DEFAULT_TCP_CONG="bic"
# CONFIG_TCP_MD5SIG is not set
CONFIG_IPV6=m
CONFIG_IPV6_PRIVACY=y
CONFIG_IPV6_ROUTER_PREF=y
CONFIG_IPV6_ROUTE_INFO=y
CONFIG_IPV6_OPTIMISTIC_DAD=y
CONFIG_INET6_AH=m
CONFIG_INET6_ESP=m
CONFIG_INET6_IPCOMP=m
# CONFIG_IPV6_MIP6 is not set
CONFIG_INET6_XFRM_TUNNEL=m
CONFIG_INET6_TUNNEL=m
CONFIG_INET6_XFRM_MODE_TRANSPORT=m
CONFIG_INET6_XFRM_MODE_TUNNEL=m
CONFIG_INET6_XFRM_MODE_BEET=m
# CONFIG_INET6_XFRM_MODE_ROUTEOPTIMIZATION is not set
CONFIG_IPV6_SIT=m
# CONFIG_IPV6_SIT_6RD is not set
CONFIG_IPV6_NDISC_NODETYPE=y
CONFIG_IPV6_TUNNEL=m
CONFIG_IPV6_MULTIPLE_TABLES=y
# CONFIG_IPV6_SUBTREES is not set
# CONFIG_IPV6_MROUTE is not set
CONFIG_NETWORK_SECMARK=y
# CONFIG_NETWORK_PHY_TIMESTAMPING is not set
CONFIG_NETFILTER=y
# CONFIG_NETFILTER_DEBUG is not set
CONFIG_NETFILTER_ADVANCED=y
CONFIG_BRIDGE_NETFILTER=y

#
# Core Netfilter Configuration
#
CONFIG_NETFILTER_NETLINK=m
CONFIG_NETFILTER_NETLINK_QUEUE=m
CONFIG_NETFILTER_NETLINK_LOG=m
# CONFIG_NF_CONNTRACK is not set
# CONFIG_NETFILTER_TPROXY is not set
CONFIG_NETFILTER_XTABLES=m

#
# Xtables combined modules
#
CONFIG_NETFILTER_XT_MARK=m

#
# Xtables targets
#
# CONFIG_NETFILTER_XT_TARGET_AUDIT is not set
# CONFIG_NETFILTER_XT_TARGET_CHECKSUM is not set
CONFIG_NETFILTER_XT_TARGET_CLASSIFY=m
# CONFIG_NETFILTER_XT_TARGET_DSCP is not set
CONFIG_NETFILTER_XT_TARGET_HL=m
# CONFIG_NETFILTER_XT_TARGET_IDLETIMER is not set
# CONFIG_NETFILTER_XT_TARGET_LED is not set
CONFIG_NETFILTER_XT_TARGET_MARK=m
# CONFIG_NETFILTER_XT_TARGET_NFLOG is not set
CONFIG_NETFILTER_XT_TARGET_NFQUEUE=m
# CONFIG_NETFILTER_XT_TARGET_RATEEST is not set
# CONFIG_NETFILTER_XT_TARGET_TEE is not set
# CONFIG_NETFILTER_XT_TARGET_TRACE is not set
CONFIG_NETFILTER_XT_TARGET_SECMARK=m
# CONFIG_NETFILTER_XT_TARGET_TCPMSS is not set
# CONFIG_NETFILTER_XT_TARGET_TCPOPTSTRIP is not set

#
# Xtables matches
#
# CONFIG_NETFILTER_XT_MATCH_ADDRTYPE is not set
CONFIG_NETFILTER_XT_MATCH_COMMENT=m
# CONFIG_NETFILTER_XT_MATCH_CPU is not set
CONFIG_NETFILTER_XT_MATCH_DCCP=m
# CONFIG_NETFILTER_XT_MATCH_DEVGROUP is not set
# CONFIG_NETFILTER_XT_MATCH_DSCP is not set
CONFIG_NETFILTER_XT_MATCH_ESP=m
# CONFIG_NETFILTER_XT_MATCH_HASHLIMIT is not set
CONFIG_NETFILTER_XT_MATCH_HL=m
# CONFIG_NETFILTER_XT_MATCH_IPRANGE is not set
CONFIG_NETFILTER_XT_MATCH_LENGTH=m
CONFIG_NETFILTER_XT_MATCH_LIMIT=m
CONFIG_NETFILTER_XT_MATCH_MAC=m
CONFIG_NETFILTER_XT_MATCH_MARK=m
CONFIG_NETFILTER_XT_MATCH_MULTIPORT=m
# CONFIG_NETFILTER_XT_MATCH_OSF is not set
# CONFIG_NETFILTER_XT_MATCH_OWNER is not set
CONFIG_NETFILTER_XT_MATCH_POLICY=m
CONFIG_NETFILTER_XT_MATCH_PHYSDEV=m
CONFIG_NETFILTER_XT_MATCH_PKTTYPE=m
CONFIG_NETFILTER_XT_MATCH_QUOTA=m
# CONFIG_NETFILTER_XT_MATCH_RATEEST is not set
CONFIG_NETFILTER_XT_MATCH_REALM=m
# CONFIG_NETFILTER_XT_MATCH_RECENT is not set
CONFIG_NETFILTER_XT_MATCH_SCTP=m
CONFIG_NETFILTER_XT_MATCH_STATISTIC=m
CONFIG_NETFILTER_XT_MATCH_STRING=m
CONFIG_NETFILTER_XT_MATCH_TCPMSS=m
# CONFIG_NETFILTER_XT_MATCH_TIME is not set
# CONFIG_NETFILTER_XT_MATCH_U32 is not set
# CONFIG_IP_SET is not set
CONFIG_IP_VS=m
# CONFIG_IP_VS_IPV6 is not set
# CONFIG_IP_VS_DEBUG is not set
CONFIG_IP_VS_TAB_BITS=12

#
# IPVS transport protocol load balancing support
#
CONFIG_IP_VS_PROTO_TCP=y
CONFIG_IP_VS_PROTO_UDP=y
CONFIG_IP_VS_PROTO_AH_ESP=y
CONFIG_IP_VS_PROTO_ESP=y
CONFIG_IP_VS_PROTO_AH=y
# CONFIG_IP_VS_PROTO_SCTP is not set

#
# IPVS scheduler
#
CONFIG_IP_VS_RR=m
CONFIG_IP_VS_WRR=m
CONFIG_IP_VS_LC=m
CONFIG_IP_VS_WLC=m
CONFIG_IP_VS_LBLC=m
CONFIG_IP_VS_LBLCR=m
CONFIG_IP_VS_DH=m
CONFIG_IP_VS_SH=m
CONFIG_IP_VS_SED=m
CONFIG_IP_VS_NQ=m

#
# IPVS application helper
#

#
# IP: Netfilter Configuration
#
# CONFIG_NF_DEFRAG_IPV4 is not set
CONFIG_IP_NF_QUEUE=m
CONFIG_IP_NF_IPTABLES=m
CONFIG_IP_NF_MATCH_AH=m
CONFIG_IP_NF_MATCH_ECN=m
CONFIG_IP_NF_MATCH_TTL=m
CONFIG_IP_NF_FILTER=m
CONFIG_IP_NF_TARGET_REJECT=m
CONFIG_IP_NF_TARGET_LOG=m
CONFIG_IP_NF_TARGET_ULOG=m
CONFIG_IP_NF_MANGLE=m
CONFIG_IP_NF_TARGET_ECN=m
CONFIG_IP_NF_TARGET_TTL=m
CONFIG_IP_NF_RAW=m
CONFIG_IP_NF_ARPTABLES=m
CONFIG_IP_NF_ARPFILTER=m
CONFIG_IP_NF_ARP_MANGLE=m

#
# IPv6: Netfilter Configuration
#
# CONFIG_NF_DEFRAG_IPV6 is not set
CONFIG_IP6_NF_QUEUE=m
CONFIG_IP6_NF_IPTABLES=m
CONFIG_IP6_NF_MATCH_AH=m
CONFIG_IP6_NF_MATCH_EUI64=m
CONFIG_IP6_NF_MATCH_FRAG=m
CONFIG_IP6_NF_MATCH_OPTS=m
CONFIG_IP6_NF_MATCH_HL=m
CONFIG_IP6_NF_MATCH_IPV6HEADER=m
# CONFIG_IP6_NF_MATCH_MH is not set
CONFIG_IP6_NF_MATCH_RT=m
CONFIG_IP6_NF_TARGET_HL=m
CONFIG_IP6_NF_TARGET_LOG=m
CONFIG_IP6_NF_FILTER=m
CONFIG_IP6_NF_TARGET_REJECT=m
CONFIG_IP6_NF_MANGLE=m
CONFIG_IP6_NF_RAW=m
CONFIG_BRIDGE_NF_EBTABLES=m
CONFIG_BRIDGE_EBT_BROUTE=m
CONFIG_BRIDGE_EBT_T_FILTER=m
CONFIG_BRIDGE_EBT_T_NAT=m
CONFIG_BRIDGE_EBT_802_3=m
CONFIG_BRIDGE_EBT_AMONG=m
CONFIG_BRIDGE_EBT_ARP=m
CONFIG_BRIDGE_EBT_IP=m
# CONFIG_BRIDGE_EBT_IP6 is not set
CONFIG_BRIDGE_EBT_LIMIT=m
CONFIG_BRIDGE_EBT_MARK=m
CONFIG_BRIDGE_EBT_PKTTYPE=m
CONFIG_BRIDGE_EBT_STP=m
CONFIG_BRIDGE_EBT_VLAN=m
CONFIG_BRIDGE_EBT_ARPREPLY=m
CONFIG_BRIDGE_EBT_DNAT=m
CONFIG_BRIDGE_EBT_MARK_T=m
CONFIG_BRIDGE_EBT_REDIRECT=m
CONFIG_BRIDGE_EBT_SNAT=m
CONFIG_BRIDGE_EBT_LOG=m
CONFIG_BRIDGE_EBT_ULOG=m
# CONFIG_BRIDGE_EBT_NFLOG is not set
CONFIG_IP_DCCP=m
CONFIG_INET_DCCP_DIAG=m

#
# DCCP CCIDs Configuration (EXPERIMENTAL)
#
# CONFIG_IP_DCCP_CCID2_DEBUG is not set
CONFIG_IP_DCCP_CCID3=y
# CONFIG_IP_DCCP_CCID3_DEBUG is not set
CONFIG_IP_DCCP_TFRC_LIB=y

#
# DCCP Kernel Hacking
#
# CONFIG_IP_DCCP_DEBUG is not set
# CONFIG_NET_DCCPPROBE is not set
CONFIG_IP_SCTP=m
# CONFIG_NET_SCTPPROBE is not set
# CONFIG_SCTP_DBG_MSG is not set
# CONFIG_SCTP_DBG_OBJCNT is not set
# CONFIG_SCTP_HMAC_NONE is not set
# CONFIG_SCTP_HMAC_SHA1 is not set
CONFIG_SCTP_HMAC_MD5=y
# CONFIG_RDS is not set
CONFIG_TIPC=m
# CONFIG_TIPC_ADVANCED is not set
# CONFIG_TIPC_DEBUG is not set
CONFIG_ATM=m
CONFIG_ATM_CLIP=m
# CONFIG_ATM_CLIP_NO_ICMP is not set
CONFIG_ATM_LANE=m
# CONFIG_ATM_MPOA is not set
CONFIG_ATM_BR2684=m
# CONFIG_ATM_BR2684_IPFILTER is not set
# CONFIG_L2TP is not set
CONFIG_STP=m
CONFIG_BRIDGE=m
CONFIG_BRIDGE_IGMP_SNOOPING=y
# CONFIG_NET_DSA is not set
CONFIG_VLAN_8021Q=m
# CONFIG_VLAN_8021Q_GVRP is not set
# CONFIG_DECNET is not set
CONFIG_LLC=y
# CONFIG_LLC2 is not set
# CONFIG_IPX is not set
# CONFIG_ATALK is not set
# CONFIG_X25 is not set
# CONFIG_LAPB is not set
# CONFIG_ECONET is not set
# CONFIG_WAN_ROUTER is not set
# CONFIG_PHONET is not set
# CONFIG_IEEE802154 is not set
CONFIG_NET_SCHED=y

#
# Queueing/Scheduling
#
CONFIG_NET_SCH_CBQ=m
CONFIG_NET_SCH_HTB=m
CONFIG_NET_SCH_HFSC=m
CONFIG_NET_SCH_ATM=m
CONFIG_NET_SCH_PRIO=m
# CONFIG_NET_SCH_MULTIQ is not set
CONFIG_NET_SCH_RED=m
# CONFIG_NET_SCH_SFB is not set
CONFIG_NET_SCH_SFQ=m
CONFIG_NET_SCH_TEQL=m
CONFIG_NET_SCH_TBF=m
CONFIG_NET_SCH_GRED=m
CONFIG_NET_SCH_DSMARK=m
CONFIG_NET_SCH_NETEM=m
# CONFIG_NET_SCH_DRR is not set
# CONFIG_NET_SCH_MQPRIO is not set
# CONFIG_NET_SCH_CHOKE is not set
# CONFIG_NET_SCH_QFQ is not set
CONFIG_NET_SCH_INGRESS=m

#
# Classification
#
CONFIG_NET_CLS=y
CONFIG_NET_CLS_BASIC=m
CONFIG_NET_CLS_TCINDEX=m
CONFIG_NET_CLS_ROUTE4=m
CONFIG_NET_CLS_FW=m
CONFIG_NET_CLS_U32=m
CONFIG_CLS_U32_PERF=y
CONFIG_CLS_U32_MARK=y
CONFIG_NET_CLS_RSVP=m
CONFIG_NET_CLS_RSVP6=m
# CONFIG_NET_CLS_FLOW is not set
CONFIG_NET_EMATCH=y
CONFIG_NET_EMATCH_STACK=32
CONFIG_NET_EMATCH_CMP=m
CONFIG_NET_EMATCH_NBYTE=m
CONFIG_NET_EMATCH_U32=m
CONFIG_NET_EMATCH_META=m
CONFIG_NET_EMATCH_TEXT=m
CONFIG_NET_CLS_ACT=y
CONFIG_NET_ACT_POLICE=m
CONFIG_NET_ACT_GACT=m
CONFIG_GACT_PROB=y
CONFIG_NET_ACT_MIRRED=m
CONFIG_NET_ACT_IPT=m
# CONFIG_NET_ACT_NAT is not set
CONFIG_NET_ACT_PEDIT=m
CONFIG_NET_ACT_SIMP=m
# CONFIG_NET_ACT_SKBEDIT is not set
# CONFIG_NET_ACT_CSUM is not set
CONFIG_NET_CLS_IND=y
CONFIG_NET_SCH_FIFO=y
# CONFIG_DCB is not set
CONFIG_DNS_RESOLVER=y
# CONFIG_BATMAN_ADV is not set
CONFIG_RPS=y
CONFIG_RFS_ACCEL=y
CONFIG_XPS=y

#
# Network testing
#
CONFIG_NET_PKTGEN=m
# CONFIG_NET_TCPPROBE is not set
# CONFIG_NET_DROP_MONITOR is not set
# CONFIG_HAMRADIO is not set
# CONFIG_CAN is not set
# CONFIG_IRDA is not set
CONFIG_BT=m
# CONFIG_BT_L2CAP is not set
# CONFIG_BT_SCO is not set

#
# Bluetooth device drivers
#
# CONFIG_BT_HCIBTUSB is not set
# CONFIG_BT_HCIBTSDIO is not set
CONFIG_BT_HCIUART=m
CONFIG_BT_HCIUART_H4=y
CONFIG_BT_HCIUART_BCSP=y
# CONFIG_BT_HCIUART_ATH3K is not set
# CONFIG_BT_HCIUART_LL is not set
CONFIG_BT_HCIBCM203X=m
CONFIG_BT_HCIBPA10X=m
CONFIG_BT_HCIBFUSB=m
CONFIG_BT_HCIDTL1=m
CONFIG_BT_HCIBT3C=m
CONFIG_BT_HCIBLUECARD=m
CONFIG_BT_HCIBTUART=m
CONFIG_BT_HCIVHCI=m
# CONFIG_BT_MRVL is not set
# CONFIG_AF_RXRPC is not set
CONFIG_FIB_RULES=y
CONFIG_WIRELESS=y
CONFIG_WIRELESS_EXT=y
CONFIG_WEXT_CORE=y
CONFIG_WEXT_PROC=y
CONFIG_WEXT_SPY=y
CONFIG_WEXT_PRIV=y
CONFIG_CFG80211=m
# CONFIG_NL80211_TESTMODE is not set
# CONFIG_CFG80211_DEVELOPER_WARNINGS is not set
# CONFIG_CFG80211_REG_DEBUG is not set
CONFIG_CFG80211_DEFAULT_PS=y
# CONFIG_CFG80211_DEBUGFS is not set
# CONFIG_CFG80211_INTERNAL_REGDB is not set
CONFIG_CFG80211_WEXT=y
CONFIG_WIRELESS_EXT_SYSFS=y
CONFIG_LIB80211=m
CONFIG_LIB80211_CRYPT_WEP=m
CONFIG_LIB80211_CRYPT_CCMP=m
CONFIG_LIB80211_CRYPT_TKIP=m
# CONFIG_LIB80211_DEBUG is not set
CONFIG_MAC80211=m
CONFIG_MAC80211_HAS_RC=y
CONFIG_MAC80211_RC_MINSTREL=y
CONFIG_MAC80211_RC_MINSTREL_HT=y
CONFIG_MAC80211_RC_DEFAULT_MINSTREL=y
CONFIG_MAC80211_RC_DEFAULT="minstrel_ht"
# CONFIG_MAC80211_MESH is not set
CONFIG_MAC80211_LEDS=y
# CONFIG_MAC80211_DEBUGFS is not set
# CONFIG_MAC80211_DEBUG_MENU is not set
# CONFIG_WIMAX is not set
# CONFIG_RFKILL is not set
# CONFIG_NET_9P is not set
# CONFIG_CAIF is not set
# CONFIG_CEPH_LIB is not set

#
# Device Drivers
#

#
# Generic Driver Options
#
CONFIG_UEVENT_HELPER_PATH=""
# CONFIG_DEVTMPFS is not set
CONFIG_STANDALONE=y
CONFIG_PREVENT_FIRMWARE_BUILD=y
CONFIG_FW_LOADER=y
CONFIG_FIRMWARE_IN_KERNEL=y
CONFIG_EXTRA_FIRMWARE=""
# CONFIG_DEBUG_DRIVER is not set
# CONFIG_DEBUG_DEVRES is not set
# CONFIG_SYS_HYPERVISOR is not set
CONFIG_CONNECTOR=y
CONFIG_PROC_EVENTS=y
CONFIG_MTD=m
# CONFIG_MTD_DEBUG is not set
# CONFIG_MTD_TESTS is not set
CONFIG_MTD_REDBOOT_PARTS=m
CONFIG_MTD_REDBOOT_DIRECTORY_BLOCK=-1
# CONFIG_MTD_REDBOOT_PARTS_UNALLOCATED is not set
# CONFIG_MTD_REDBOOT_PARTS_READONLY is not set
CONFIG_MTD_OF_PARTS=y
# CONFIG_MTD_AR7_PARTS is not set

#
# User Modules And Translation Layers
#
CONFIG_MTD_CHAR=m
CONFIG_MTD_BLKDEVS=m
CONFIG_MTD_BLOCK=m
CONFIG_MTD_BLOCK_RO=m
CONFIG_FTL=m
CONFIG_NFTL=m
CONFIG_NFTL_RW=y
# CONFIG_INFTL is not set
CONFIG_RFD_FTL=m
# CONFIG_SSFDC is not set
# CONFIG_SM_FTL is not set
# CONFIG_MTD_OOPS is not set
# CONFIG_MTD_SWAP is not set

#
# RAM/ROM/Flash chip drivers
#
CONFIG_MTD_CFI=m
CONFIG_MTD_JEDECPROBE=m
CONFIG_MTD_GEN_PROBE=m
# CONFIG_MTD_CFI_ADV_OPTIONS is not set
CONFIG_MTD_MAP_BANK_WIDTH_1=y
CONFIG_MTD_MAP_BANK_WIDTH_2=y
CONFIG_MTD_MAP_BANK_WIDTH_4=y
# CONFIG_MTD_MAP_BANK_WIDTH_8 is not set
# CONFIG_MTD_MAP_BANK_WIDTH_16 is not set
# CONFIG_MTD_MAP_BANK_WIDTH_32 is not set
CONFIG_MTD_CFI_I1=y
CONFIG_MTD_CFI_I2=y
# CONFIG_MTD_CFI_I4 is not set
# CONFIG_MTD_CFI_I8 is not set
CONFIG_MTD_CFI_INTELEXT=m
CONFIG_MTD_CFI_AMDSTD=m
CONFIG_MTD_CFI_STAA=m
CONFIG_MTD_CFI_UTIL=m
CONFIG_MTD_RAM=m
CONFIG_MTD_ROM=m
CONFIG_MTD_ABSENT=m

#
# Mapping drivers for chip access
#
# CONFIG_MTD_COMPLEX_MAPPINGS is not set
# CONFIG_MTD_PHYSMAP is not set
# CONFIG_MTD_PHYSMAP_OF is not set
# CONFIG_MTD_INTEL_VR_NOR is not set
# CONFIG_MTD_PLATRAM is not set

#
# Self-contained MTD device drivers
#
# CONFIG_MTD_PMC551 is not set
# CONFIG_MTD_SLRAM is not set
# CONFIG_MTD_PHRAM is not set
CONFIG_MTD_MTDRAM=m
CONFIG_MTDRAM_TOTAL_SIZE=4096
CONFIG_MTDRAM_ERASE_SIZE=128
CONFIG_MTD_BLOCK2MTD=m

#
# Disk-On-Chip Device Drivers
#
# CONFIG_MTD_DOC2000 is not set
# CONFIG_MTD_DOC2001 is not set
# CONFIG_MTD_DOC2001PLUS is not set
CONFIG_MTD_NAND_ECC=m
CONFIG_MTD_NAND_ECC_SMC=y
CONFIG_MTD_NAND=m
# CONFIG_MTD_NAND_VERIFY_WRITE is not set
# CONFIG_MTD_NAND_ECC_BCH is not set
# CONFIG_MTD_SM_COMMON is not set
# CONFIG_MTD_NAND_MUSEUM_IDS is not set
# CONFIG_MTD_NAND_DENALI is not set
CONFIG_MTD_NAND_IDS=m
# CONFIG_MTD_NAND_RICOH is not set
CONFIG_MTD_NAND_DISKONCHIP=m
# CONFIG_MTD_NAND_DISKONCHIP_PROBE_ADVANCED is not set
CONFIG_MTD_NAND_DISKONCHIP_PROBE_ADDRESS=0
# CONFIG_MTD_NAND_DISKONCHIP_BBTWRITE is not set
# CONFIG_MTD_NAND_CAFE is not set
CONFIG_MTD_NAND_NANDSIM=m
# CONFIG_MTD_NAND_PLATFORM is not set
# CONFIG_MTD_ALAUDA is not set
# CONFIG_MTD_NAND_FSL_ELBC is not set
# CONFIG_MTD_NAND_FSL_UPM is not set
# CONFIG_MTD_ONENAND is not set

#
# LPDDR flash memory drivers
#
# CONFIG_MTD_LPDDR is not set
# CONFIG_MTD_UBI is not set
CONFIG_DTC=y
CONFIG_OF=y

#
# Device Tree and Open Firmware support
#
# CONFIG_PROC_DEVICETREE is not set
CONFIG_OF_FLATTREE=y
CONFIG_OF_EARLY_FLATTREE=y
CONFIG_OF_DYNAMIC=y
CONFIG_OF_ADDRESS=y
CONFIG_OF_IRQ=y
CONFIG_OF_DEVICE=y
CONFIG_OF_I2C=m
CONFIG_OF_NET=y
CONFIG_OF_MDIO=y
CONFIG_OF_PCI=y
CONFIG_PARPORT=m
CONFIG_PARPORT_PC=m
CONFIG_PARPORT_SERIAL=m
# CONFIG_PARPORT_PC_FIFO is not set
# CONFIG_PARPORT_PC_SUPERIO is not set
CONFIG_PARPORT_PC_PCMCIA=m
# CONFIG_PARPORT_GSC is not set
# CONFIG_PARPORT_AX88796 is not set
CONFIG_PARPORT_1284=y
CONFIG_PARPORT_NOT_PC=y
CONFIG_BLK_DEV=y
CONFIG_BLK_DEV_FD=m
CONFIG_PARIDE=m

#
# Parallel IDE high-level drivers
#
CONFIG_PARIDE_PD=m
CONFIG_PARIDE_PCD=m
CONFIG_PARIDE_PF=m
CONFIG_PARIDE_PT=m
CONFIG_PARIDE_PG=m

#
# Parallel IDE protocol modules
#
CONFIG_PARIDE_ATEN=m
CONFIG_PARIDE_BPCK=m
# CONFIG_PARIDE_BPCK6 is not set
CONFIG_PARIDE_COMM=m
CONFIG_PARIDE_DSTR=m
CONFIG_PARIDE_FIT2=m
CONFIG_PARIDE_FIT3=m
CONFIG_PARIDE_EPAT=m
CONFIG_PARIDE_EPATC8=y
CONFIG_PARIDE_EPIA=m
CONFIG_PARIDE_FRIQ=m
CONFIG_PARIDE_FRPW=m
CONFIG_PARIDE_KBIC=m
CONFIG_PARIDE_KTTI=m
CONFIG_PARIDE_ON20=m
CONFIG_PARIDE_ON26=m
CONFIG_BLK_CPQ_DA=m
CONFIG_BLK_CPQ_CISS_DA=m
CONFIG_CISS_SCSI_TAPE=y
CONFIG_BLK_DEV_DAC960=m
# CONFIG_BLK_DEV_UMEM is not set
# CONFIG_BLK_DEV_COW_COMMON is not set
CONFIG_BLK_DEV_LOOP=m
CONFIG_BLK_DEV_CRYPTOLOOP=m
# CONFIG_BLK_DEV_DRBD is not set
CONFIG_BLK_DEV_NBD=m
CONFIG_BLK_DEV_SX8=m
# CONFIG_BLK_DEV_UB is not set
CONFIG_BLK_DEV_RAM=y
CONFIG_BLK_DEV_RAM_COUNT=16
CONFIG_BLK_DEV_RAM_SIZE=16384
# CONFIG_BLK_DEV_XIP is not set
CONFIG_CDROM_PKTCDVD=m
CONFIG_CDROM_PKTCDVD_BUFFERS=8
# CONFIG_CDROM_PKTCDVD_WCACHE is not set
CONFIG_ATA_OVER_ETH=m
# CONFIG_BLK_DEV_HD is not set
# CONFIG_BLK_DEV_RBD is not set
# CONFIG_SENSORS_LIS3LV02D is not set
# CONFIG_MISC_DEVICES is not set
CONFIG_EEPROM_93CX6=m
CONFIG_HAVE_IDE=y
CONFIG_IDE=y

#
# Please see Documentation/ide/ide.txt for help/info on IDE drives
#
CONFIG_IDE_XFER_MODE=y
CONFIG_IDE_TIMINGS=y
CONFIG_IDE_ATAPI=y
# CONFIG_BLK_DEV_IDE_SATA is not set
CONFIG_IDE_GD=y
CONFIG_IDE_GD_ATA=y
# CONFIG_IDE_GD_ATAPI is not set
CONFIG_BLK_DEV_IDECS=m
# CONFIG_BLK_DEV_DELKIN is not set
CONFIG_BLK_DEV_IDECD=m
CONFIG_BLK_DEV_IDECD_VERBOSE_ERRORS=y
# CONFIG_BLK_DEV_IDETAPE is not set
CONFIG_IDE_TASK_IOCTL=y
CONFIG_IDE_PROC_FS=y

#
# IDE chipset support/bugfixes
#
# CONFIG_BLK_DEV_PLATFORM is not set
CONFIG_BLK_DEV_IDEDMA_SFF=y

#
# PCI IDE chipsets support
#
CONFIG_BLK_DEV_IDEPCI=y
CONFIG_IDEPCI_PCIBUS_ORDER=y
# CONFIG_BLK_DEV_OFFBOARD is not set
CONFIG_BLK_DEV_GENERIC=y
# CONFIG_BLK_DEV_OPTI621 is not set
CONFIG_BLK_DEV_IDEDMA_PCI=y
CONFIG_BLK_DEV_AEC62XX=y
CONFIG_BLK_DEV_ALI15X3=y
CONFIG_BLK_DEV_AMD74XX=y
CONFIG_BLK_DEV_CMD64X=y
# CONFIG_BLK_DEV_TRIFLEX is not set
# CONFIG_BLK_DEV_CS5520 is not set
# CONFIG_BLK_DEV_CS5530 is not set
CONFIG_BLK_DEV_HPT366=y
# CONFIG_BLK_DEV_JMICRON is not set
# CONFIG_BLK_DEV_SC1200 is not set
CONFIG_BLK_DEV_PIIX=y
# CONFIG_BLK_DEV_IT8172 is not set
# CONFIG_BLK_DEV_IT8213 is not set
CONFIG_BLK_DEV_IT821X=y
# CONFIG_BLK_DEV_NS87415 is not set
CONFIG_BLK_DEV_PDC202XX_OLD=y
CONFIG_BLK_DEV_PDC202XX_NEW=y
CONFIG_BLK_DEV_SVWKS=y
CONFIG_BLK_DEV_SIIMAGE=y
# CONFIG_BLK_DEV_SL82C105 is not set
# CONFIG_BLK_DEV_SLC90E66 is not set
# CONFIG_BLK_DEV_TRM290 is not set
CONFIG_BLK_DEV_VIA82CXXX=y
# CONFIG_BLK_DEV_TC86C001 is not set
CONFIG_BLK_DEV_IDEDMA=y

#
# SCSI device support
#
CONFIG_SCSI_MOD=m
CONFIG_RAID_ATTRS=m
CONFIG_SCSI=m
CONFIG_SCSI_DMA=y
# CONFIG_SCSI_TGT is not set
CONFIG_SCSI_NETLINK=y
CONFIG_SCSI_PROC_FS=y

#
# SCSI support type (disk, tape, CD-ROM)
#
CONFIG_BLK_DEV_SD=m
CONFIG_CHR_DEV_ST=m
CONFIG_CHR_DEV_OSST=m
CONFIG_BLK_DEV_SR=m
CONFIG_BLK_DEV_SR_VENDOR=y
CONFIG_CHR_DEV_SG=m
CONFIG_CHR_DEV_SCH=m
CONFIG_SCSI_MULTI_LUN=y
CONFIG_SCSI_CONSTANTS=y
CONFIG_SCSI_LOGGING=y
# CONFIG_SCSI_SCAN_ASYNC is not set
CONFIG_SCSI_WAIT_SCAN=m

#
# SCSI Transports
#
CONFIG_SCSI_SPI_ATTRS=m
CONFIG_SCSI_FC_ATTRS=m
CONFIG_SCSI_ISCSI_ATTRS=m
CONFIG_SCSI_SAS_ATTRS=m
CONFIG_SCSI_SAS_LIBSAS=m
CONFIG_SCSI_SAS_ATA=y
CONFIG_SCSI_SAS_HOST_SMP=y
CONFIG_SCSI_SRP_ATTRS=m
CONFIG_SCSI_LOWLEVEL=y
CONFIG_ISCSI_TCP=m
# CONFIG_ISCSI_BOOT_SYSFS is not set
# CONFIG_SCSI_CXGB3_ISCSI is not set
# CONFIG_SCSI_CXGB4_ISCSI is not set
# CONFIG_SCSI_BNX2_ISCSI is not set
# CONFIG_SCSI_BNX2X_FCOE is not set
# CONFIG_BE2ISCSI is not set
CONFIG_BLK_DEV_3W_XXXX_RAID=m
# CONFIG_SCSI_HPSA is not set
CONFIG_SCSI_3W_9XXX=m
# CONFIG_SCSI_3W_SAS is not set
CONFIG_SCSI_ACARD=m
CONFIG_SCSI_AACRAID=m
CONFIG_SCSI_AIC7XXX=m
CONFIG_AIC7XXX_CMDS_PER_DEVICE=4
CONFIG_AIC7XXX_RESET_DELAY_MS=15000
# CONFIG_AIC7XXX_DEBUG_ENABLE is not set
CONFIG_AIC7XXX_DEBUG_MASK=0
# CONFIG_AIC7XXX_REG_PRETTY_PRINT is not set
CONFIG_SCSI_AIC7XXX_OLD=m
CONFIG_SCSI_AIC79XX=m
CONFIG_AIC79XX_CMDS_PER_DEVICE=4
CONFIG_AIC79XX_RESET_DELAY_MS=15000
# CONFIG_AIC79XX_DEBUG_ENABLE is not set
CONFIG_AIC79XX_DEBUG_MASK=0
# CONFIG_AIC79XX_REG_PRETTY_PRINT is not set
CONFIG_SCSI_AIC94XX=m
# CONFIG_AIC94XX_DEBUG is not set
# CONFIG_SCSI_MVSAS is not set
# CONFIG_SCSI_DPT_I2O is not set
# CONFIG_SCSI_ADVANSYS is not set
CONFIG_SCSI_ARCMSR=m
# CONFIG_SCSI_ARCMSR_AER is not set
CONFIG_MEGARAID_NEWGEN=y
CONFIG_MEGARAID_MM=m
CONFIG_MEGARAID_MAILBOX=m
CONFIG_MEGARAID_LEGACY=m
CONFIG_MEGARAID_SAS=m
# CONFIG_SCSI_MPT2SAS is not set
CONFIG_SCSI_HPTIOP=m
# CONFIG_SCSI_BUSLOGIC is not set
CONFIG_LIBFC=m
CONFIG_LIBFCOE=m
CONFIG_FCOE=m
# CONFIG_SCSI_DMX3191D is not set
# CONFIG_SCSI_EATA is not set
# CONFIG_SCSI_FUTURE_DOMAIN is not set
CONFIG_SCSI_GDTH=m
CONFIG_SCSI_IPS=m
CONFIG_SCSI_INITIO=m
# CONFIG_SCSI_INIA100 is not set
CONFIG_SCSI_PPA=m
CONFIG_SCSI_IMM=m
# CONFIG_SCSI_IZIP_EPP16 is not set
# CONFIG_SCSI_IZIP_SLOW_CTR is not set
CONFIG_SCSI_STEX=m
CONFIG_SCSI_SYM53C8XX_2=m
CONFIG_SCSI_SYM53C8XX_DMA_ADDRESSING_MODE=1
CONFIG_SCSI_SYM53C8XX_DEFAULT_TAGS=16
CONFIG_SCSI_SYM53C8XX_MAX_TAGS=64
CONFIG_SCSI_SYM53C8XX_MMIO=y
# CONFIG_SCSI_IPR is not set
CONFIG_SCSI_QLOGIC_1280=m
CONFIG_SCSI_QLA_FC=m
CONFIG_SCSI_QLA_ISCSI=m
CONFIG_SCSI_LPFC=m
# CONFIG_SCSI_LPFC_DEBUG_FS is not set
CONFIG_SCSI_DC395x=m
# CONFIG_SCSI_DC390T is not set
# CONFIG_SCSI_NSP32 is not set
# CONFIG_SCSI_DEBUG is not set
# CONFIG_SCSI_PMCRAID is not set
# CONFIG_SCSI_PM8001 is not set
# CONFIG_SCSI_SRP is not set
# CONFIG_SCSI_BFA_FC is not set
# CONFIG_SCSI_LOWLEVEL_PCMCIA is not set
CONFIG_SCSI_DH=m
CONFIG_SCSI_DH_RDAC=m
# CONFIG_SCSI_DH_HP_SW is not set
# CONFIG_SCSI_DH_EMC is not set
# CONFIG_SCSI_DH_ALUA is not set
# CONFIG_SCSI_OSD_INITIATOR is not set
CONFIG_ATA=m
# CONFIG_ATA_NONSTANDARD is not set
CONFIG_ATA_VERBOSE_ERROR=y
CONFIG_SATA_PMP=y

#
# Controllers with non-SFF native interface
#
CONFIG_SATA_AHCI=m
# CONFIG_SATA_AHCI_PLATFORM is not set
# CONFIG_SATA_FSL is not set
CONFIG_SATA_INIC162X=m
# CONFIG_SATA_ACARD_AHCI is not set
CONFIG_SATA_SIL24=m
CONFIG_ATA_SFF=y

#
# SFF controllers with custom DMA interface
#
CONFIG_PDC_ADMA=m
CONFIG_SATA_QSTOR=m
CONFIG_SATA_SX4=m
CONFIG_ATA_BMDMA=y

#
# SATA SFF controllers with BMDMA
#
CONFIG_ATA_PIIX=m
CONFIG_SATA_MV=m
CONFIG_SATA_NV=m
CONFIG_SATA_PROMISE=m
CONFIG_SATA_SIL=m
CONFIG_SATA_SIS=m
CONFIG_SATA_SVW=m
CONFIG_SATA_ULI=m
CONFIG_SATA_VIA=m
CONFIG_SATA_VITESSE=m

#
# PATA SFF controllers with BMDMA
#
# CONFIG_PATA_ALI is not set
# CONFIG_PATA_AMD is not set
# CONFIG_PATA_ARASAN_CF is not set
# CONFIG_PATA_ARTOP is not set
# CONFIG_PATA_ATIIXP is not set
# CONFIG_PATA_ATP867X is not set
# CONFIG_PATA_CMD64X is not set
# CONFIG_PATA_CS5520 is not set
# CONFIG_PATA_CS5530 is not set
# CONFIG_PATA_CS5536 is not set
# CONFIG_PATA_CYPRESS is not set
# CONFIG_PATA_EFAR is not set
# CONFIG_PATA_HPT366 is not set
# CONFIG_PATA_HPT37X is not set
# CONFIG_PATA_HPT3X2N is not set
# CONFIG_PATA_HPT3X3 is not set
# CONFIG_PATA_IT8213 is not set
# CONFIG_PATA_IT821X is not set
# CONFIG_PATA_JMICRON is not set
CONFIG_PATA_MARVELL=m
# CONFIG_PATA_NETCELL is not set
# CONFIG_PATA_NINJA32 is not set
# CONFIG_PATA_NS87415 is not set
# CONFIG_PATA_OLDPIIX is not set
# CONFIG_PATA_OPTIDMA is not set
CONFIG_PATA_PDC2027X=m
# CONFIG_PATA_PDC_OLD is not set
# CONFIG_PATA_RADISYS is not set
# CONFIG_PATA_RDC is not set
# CONFIG_PATA_SC1200 is not set
# CONFIG_PATA_SCH is not set
# CONFIG_PATA_SERVERWORKS is not set
CONFIG_PATA_SIL680=m
CONFIG_PATA_SIS=m
# CONFIG_PATA_TOSHIBA is not set
# CONFIG_PATA_TRIFLEX is not set
# CONFIG_PATA_VIA is not set
# CONFIG_PATA_WINBOND is not set

#
# PIO-only SFF controllers
#
# CONFIG_PATA_CMD640_PCI is not set
# CONFIG_PATA_MPIIX is not set
# CONFIG_PATA_NS87410 is not set
# CONFIG_PATA_OPTI is not set
# CONFIG_PATA_PCMCIA is not set
# CONFIG_PATA_PLATFORM is not set
# CONFIG_PATA_RZ1000 is not set

#
# Generic fallback / legacy drivers
#
# CONFIG_ATA_GENERIC is not set
# CONFIG_PATA_LEGACY is not set
CONFIG_MD=y
CONFIG_BLK_DEV_MD=y
CONFIG_MD_AUTODETECT=y
CONFIG_MD_LINEAR=m
CONFIG_MD_RAID0=m
CONFIG_MD_RAID1=m
CONFIG_MD_RAID10=m
CONFIG_MD_RAID456=m
# CONFIG_MULTICORE_RAID456 is not set
CONFIG_MD_MULTIPATH=m
CONFIG_MD_FAULTY=m
CONFIG_BLK_DEV_DM=m
# CONFIG_DM_DEBUG is not set
CONFIG_DM_CRYPT=m
CONFIG_DM_SNAPSHOT=m
CONFIG_DM_MIRROR=m
# CONFIG_DM_RAID is not set
# CONFIG_DM_LOG_USERSPACE is not set
CONFIG_DM_ZERO=m
CONFIG_DM_MULTIPATH=m
# CONFIG_DM_MULTIPATH_QL is not set
# CONFIG_DM_MULTIPATH_ST is not set
# CONFIG_DM_DELAY is not set
CONFIG_DM_UEVENT=y
# CONFIG_DM_FLAKEY is not set
# CONFIG_TARGET_CORE is not set
CONFIG_FUSION=y
CONFIG_FUSION_SPI=m
CONFIG_FUSION_FC=m
CONFIG_FUSION_SAS=m
CONFIG_FUSION_MAX_SGE=128
CONFIG_FUSION_CTL=m
CONFIG_FUSION_LAN=m
CONFIG_FUSION_LOGGING=y

#
# IEEE 1394 (FireWire) support
#
CONFIG_FIREWIRE=m
CONFIG_FIREWIRE_OHCI=m
CONFIG_FIREWIRE_OHCI_DEBUG=y
CONFIG_FIREWIRE_SBP2=m
# CONFIG_FIREWIRE_NET is not set
# CONFIG_FIREWIRE_NOSY is not set
CONFIG_I2O=m
# CONFIG_I2O_LCT_NOTIFY_ON_CHANGES is not set
CONFIG_I2O_EXT_ADAPTEC=y
CONFIG_I2O_CONFIG=m
CONFIG_I2O_CONFIG_OLD_IOCTL=y
CONFIG_I2O_BUS=m
CONFIG_I2O_BLOCK=m
CONFIG_I2O_SCSI=m
CONFIG_I2O_PROC=m
# CONFIG_MACINTOSH_DRIVERS is not set
CONFIG_NETDEVICES=y
CONFIG_IFB=m
CONFIG_DUMMY=m
CONFIG_BONDING=m
# CONFIG_MACVLAN is not set
# CONFIG_EQUALIZER is not set
CONFIG_TUN=m
# CONFIG_VETH is not set
# CONFIG_ARCNET is not set
CONFIG_MII=m
CONFIG_PHYLIB=y

#
# MII PHY device drivers
#
CONFIG_MARVELL_PHY=m
CONFIG_DAVICOM_PHY=m
CONFIG_QSEMI_PHY=m
CONFIG_LXT_PHY=m
CONFIG_CICADA_PHY=m
CONFIG_VITESSE_PHY=m
CONFIG_SMSC_PHY=m
# CONFIG_BROADCOM_PHY is not set
# CONFIG_ICPLUS_PHY is not set
# CONFIG_REALTEK_PHY is not set
# CONFIG_NATIONAL_PHY is not set
# CONFIG_STE10XP is not set
# CONFIG_LSI_ET1011C_PHY is not set
# CONFIG_MICREL_PHY is not set
# CONFIG_FIXED_PHY is not set
# CONFIG_MDIO_BITBANG is not set
CONFIG_NET_ETHERNET=y
CONFIG_HAPPYMEAL=m
CONFIG_SUNGEM=m
CONFIG_CASSINI=m
CONFIG_NET_VENDOR_3COM=y
CONFIG_VORTEX=m
CONFIG_TYPHOON=m
# CONFIG_ETHOC is not set
# CONFIG_DNET is not set
CONFIG_NET_TULIP=y
CONFIG_DE2104X=m
CONFIG_DE2104X_DSL=0
CONFIG_TULIP=m
# CONFIG_TULIP_MWI is not set
CONFIG_TULIP_MMIO=y
# CONFIG_TULIP_NAPI is not set
CONFIG_DE4X5=m
CONFIG_WINBOND_840=m
CONFIG_DM9102=m
CONFIG_ULI526X=m
CONFIG_PCMCIA_XIRCOM=m
# CONFIG_HP100 is not set
# CONFIG_IBM_NEW_EMAC_ZMII is not set
# CONFIG_IBM_NEW_EMAC_RGMII is not set
# CONFIG_IBM_NEW_EMAC_TAH is not set
# CONFIG_IBM_NEW_EMAC_EMAC4 is not set
# CONFIG_IBM_NEW_EMAC_NO_FLOW_CTRL is not set
# CONFIG_IBM_NEW_EMAC_MAL_CLR_ICINTSTAT is not set
# CONFIG_IBM_NEW_EMAC_MAL_COMMON_ERR is not set
CONFIG_NET_PCI=y
CONFIG_PCNET32=m
CONFIG_AMD8111_ETH=m
CONFIG_ADAPTEC_STARFIRE=m
# CONFIG_KSZ884X_PCI is not set
CONFIG_B44=m
CONFIG_B44_PCI_AUTOSELECT=y
CONFIG_B44_PCICORE_AUTOSELECT=y
CONFIG_B44_PCI=y
CONFIG_FORCEDETH=m
CONFIG_E100=m
CONFIG_FEALNX=m
CONFIG_NATSEMI=m
CONFIG_NE2K_PCI=m
CONFIG_8139CP=m
CONFIG_8139TOO=m
# CONFIG_8139TOO_PIO is not set
# CONFIG_8139TOO_TUNE_TWISTER is not set
CONFIG_8139TOO_8129=y
# CONFIG_8139_OLD_RX_RESET is not set
# CONFIG_R6040 is not set
CONFIG_SIS900=m
CONFIG_EPIC100=m
# CONFIG_SMSC9420 is not set
CONFIG_SUNDANCE=m
# CONFIG_SUNDANCE_MMIO is not set
# CONFIG_TLAN is not set
# CONFIG_KS8851_MLL is not set
CONFIG_VIA_RHINE=m
CONFIG_VIA_RHINE_MMIO=y
# CONFIG_SC92031 is not set
CONFIG_NET_POCKET=y
# CONFIG_DE600 is not set
# CONFIG_DE620 is not set
# CONFIG_ATL2 is not set
# CONFIG_XILINX_EMACLITE is not set
CONFIG_NETDEV_1000=y
CONFIG_ACENIC=m
# CONFIG_ACENIC_OMIT_TIGON_I is not set
CONFIG_DL2K=m
CONFIG_E1000=m
CONFIG_E1000E=m
# CONFIG_IP1000 is not set
CONFIG_IGB=m
# CONFIG_IGBVF is not set
CONFIG_NS83820=m
# CONFIG_HAMACHI is not set
# CONFIG_YELLOWFIN is not set
CONFIG_R8169=m
CONFIG_SIS190=m
CONFIG_SKGE=m
# CONFIG_SKGE_DEBUG is not set
CONFIG_SKY2=m
# CONFIG_SKY2_DEBUG is not set
CONFIG_VIA_VELOCITY=m
CONFIG_TIGON3=m
CONFIG_BNX2=m
# CONFIG_CNIC is not set
CONFIG_FSL_PQ_MDIO=y
CONFIG_GIANFAR=y
# CONFIG_MV643XX_ETH is not set
# CONFIG_XILINX_LL_TEMAC is not set
CONFIG_QLA3XXX=m
# CONFIG_ATL1 is not set
# CONFIG_ATL1E is not set
# CONFIG_ATL1C is not set
# CONFIG_JME is not set
# CONFIG_STMMAC_ETH is not set
# CONFIG_PCH_GBE is not set
CONFIG_NETDEV_10000=y
CONFIG_MDIO=m
CONFIG_CHELSIO_T1=m
# CONFIG_CHELSIO_T1_1G is not set
CONFIG_CHELSIO_T3=m
# CONFIG_CHELSIO_T4 is not set
# CONFIG_CHELSIO_T4VF is not set
CONFIG_ENIC=m
CONFIG_IXGBE=m
# CONFIG_IXGBEVF is not set
CONFIG_IXGB=m
CONFIG_S2IO=m
# CONFIG_VXGE is not set
CONFIG_MYRI10GE=m
CONFIG_NETXEN_NIC=m
CONFIG_NIU=m
# CONFIG_MLX4_EN is not set
CONFIG_MLX4_CORE=m
CONFIG_MLX4_DEBUG=y
# CONFIG_TEHUTI is not set
CONFIG_BNX2X=m
# CONFIG_QLCNIC is not set
# CONFIG_QLGE is not set
# CONFIG_BNA is not set
# CONFIG_SFC is not set
# CONFIG_BE2NET is not set
CONFIG_TR=y
CONFIG_IBMOL=m
# CONFIG_IBMLS is not set
CONFIG_3C359=m
# CONFIG_TMS380TR is not set
CONFIG_WLAN=y
# CONFIG_PCMCIA_RAYCS is not set
# CONFIG_LIBERTAS_THINFIRM is not set
CONFIG_AIRO=m
CONFIG_ATMEL=m
CONFIG_PCI_ATMEL=m
CONFIG_PCMCIA_ATMEL=m
# CONFIG_AT76C50X_USB is not set
CONFIG_AIRO_CS=m
CONFIG_PCMCIA_WL3501=m
CONFIG_PRISM54=m
CONFIG_USB_ZD1201=m
# CONFIG_USB_NET_RNDIS_WLAN is not set
CONFIG_RTL8180=m
CONFIG_RTL8187=m
CONFIG_RTL8187_LEDS=y
# CONFIG_ADM8211 is not set
# CONFIG_MAC80211_HWSIM is not set
# CONFIG_MWL8K is not set
# CONFIG_ATH_COMMON is not set
# CONFIG_B43 is not set
# CONFIG_B43LEGACY is not set
CONFIG_HOSTAP=m
CONFIG_HOSTAP_FIRMWARE=y
CONFIG_HOSTAP_FIRMWARE_NVRAM=y
CONFIG_HOSTAP_PLX=m
CONFIG_HOSTAP_PCI=m
CONFIG_HOSTAP_CS=m
CONFIG_IPW2100=m
CONFIG_IPW2100_MONITOR=y
# CONFIG_IPW2100_DEBUG is not set
CONFIG_IPW2200=m
CONFIG_IPW2200_MONITOR=y
CONFIG_IPW2200_RADIOTAP=y
CONFIG_IPW2200_PROMISCUOUS=y
CONFIG_IPW2200_QOS=y
# CONFIG_IPW2200_DEBUG is not set
CONFIG_LIBIPW=m
# CONFIG_LIBIPW_DEBUG is not set
CONFIG_IWLAGN=m

#
# Debugging Options
#
# CONFIG_IWLWIFI_DEBUG is not set
# CONFIG_IWLWIFI_DEVICE_TRACING is not set
# CONFIG_IWLWIFI_DEVICE_SVTOOL is not set
# CONFIG_IWL_P2P is not set
CONFIG_IWLWIFI_LEGACY=m

#
# Debugging Options
#
# CONFIG_IWLWIFI_LEGACY_DEBUG is not set
# CONFIG_IWLWIFI_LEGACY_DEVICE_TRACING is not set
CONFIG_IWL4965=m
CONFIG_IWL3945=m
# CONFIG_IWM is not set
# CONFIG_LIBERTAS is not set
CONFIG_HERMES=m
# CONFIG_HERMES_PRISM is not set
CONFIG_HERMES_CACHE_FW_ON_INIT=y
CONFIG_PLX_HERMES=m
CONFIG_TMD_HERMES=m
CONFIG_NORTEL_HERMES=m
CONFIG_PCMCIA_HERMES=m
CONFIG_PCMCIA_SPECTRUM=m
# CONFIG_ORINOCO_USB is not set
# CONFIG_P54_COMMON is not set
CONFIG_RT2X00=m
CONFIG_RT2400PCI=m
CONFIG_RT2500PCI=m
CONFIG_RT61PCI=m
# CONFIG_RT2800PCI is not set
CONFIG_RT2500USB=m
CONFIG_RT73USB=m
# CONFIG_RT2800USB is not set
CONFIG_RT2X00_LIB_PCI=m
CONFIG_RT2X00_LIB_USB=m
CONFIG_RT2X00_LIB=m
CONFIG_RT2X00_LIB_FIRMWARE=y
CONFIG_RT2X00_LIB_CRYPTO=y
CONFIG_RT2X00_LIB_LEDS=y
# CONFIG_RT2X00_DEBUG is not set
# CONFIG_RTL8192CE is not set
# CONFIG_RTL8192SE is not set
# CONFIG_RTL8192CU is not set
# CONFIG_WL1251 is not set
# CONFIG_WL12XX_MENU is not set
CONFIG_ZD1211RW=m
# CONFIG_ZD1211RW_DEBUG is not set
# CONFIG_MWIFIEX is not set

#
# Enable WiMAX (Networking options) to see the WiMAX drivers
#

#
# USB Network Adapters
#
CONFIG_USB_CATC=m
CONFIG_USB_KAWETH=m
CONFIG_USB_PEGASUS=m
CONFIG_USB_RTL8150=m
CONFIG_USB_USBNET=m
CONFIG_USB_NET_AX8817X=m
CONFIG_USB_NET_CDCETHER=m
# CONFIG_USB_NET_CDC_EEM is not set
CONFIG_USB_NET_CDC_NCM=m
CONFIG_USB_NET_DM9601=m
# CONFIG_USB_NET_SMSC75XX is not set
# CONFIG_USB_NET_SMSC95XX is not set
CONFIG_USB_NET_GL620A=m
CONFIG_USB_NET_NET1080=m
CONFIG_USB_NET_PLUSB=m
# CONFIG_USB_NET_MCS7830 is not set
CONFIG_USB_NET_RNDIS_HOST=m
CONFIG_USB_NET_CDC_SUBSET=m
CONFIG_USB_ALI_M5632=y
CONFIG_USB_AN2720=y
CONFIG_USB_BELKIN=y
CONFIG_USB_ARMLINUX=y
CONFIG_USB_EPSON2888=y
# CONFIG_USB_KC2190 is not set
CONFIG_USB_NET_ZAURUS=m
# CONFIG_USB_NET_CX82310_ETH is not set
# CONFIG_USB_NET_KALMIA is not set
# CONFIG_USB_NET_INT51X1 is not set
# CONFIG_USB_IPHETH is not set
# CONFIG_USB_SIERRA_NET is not set
# CONFIG_USB_VL600 is not set
CONFIG_NET_PCMCIA=y
CONFIG_PCMCIA_3C589=m
CONFIG_PCMCIA_3C574=m
CONFIG_PCMCIA_FMVJ18X=m
CONFIG_PCMCIA_PCNET=m
CONFIG_PCMCIA_NMCLAN=m
CONFIG_PCMCIA_SMC91C92=m
CONFIG_PCMCIA_XIRC2PS=m
CONFIG_PCMCIA_AXNET=m
# CONFIG_PCMCIA_IBMTR is not set
# CONFIG_WAN is not set
CONFIG_ATM_DRIVERS=y
# CONFIG_ATM_DUMMY is not set
CONFIG_ATM_TCP=m
CONFIG_ATM_LANAI=m
CONFIG_ATM_ENI=m
# CONFIG_ATM_ENI_DEBUG is not set
# CONFIG_ATM_ENI_TUNE_BURST is not set
CONFIG_ATM_FIRESTREAM=m
# CONFIG_ATM_ZATM is not set
# CONFIG_ATM_NICSTAR is not set
CONFIG_ATM_IDT77252=m
# CONFIG_ATM_IDT77252_DEBUG is not set
# CONFIG_ATM_IDT77252_RCV_ALL is not set
CONFIG_ATM_IDT77252_USE_SUNI=y
CONFIG_ATM_AMBASSADOR=m
# CONFIG_ATM_AMBASSADOR_DEBUG is not set
CONFIG_ATM_HORIZON=m
# CONFIG_ATM_HORIZON_DEBUG is not set
# CONFIG_ATM_IA is not set
# CONFIG_ATM_FORE200E is not set
CONFIG_ATM_HE=m
# CONFIG_ATM_HE_USE_SUNI is not set
# CONFIG_ATM_SOLOS is not set

#
# CAIF transport drivers
#
CONFIG_FDDI=y
# CONFIG_DEFXX is not set
# CONFIG_SKFP is not set
# CONFIG_HIPPI is not set
# CONFIG_PLIP is not set
CONFIG_PPP=m
CONFIG_PPP_MULTILINK=y
CONFIG_PPP_FILTER=y
CONFIG_PPP_ASYNC=m
CONFIG_PPP_SYNC_TTY=m
CONFIG_PPP_DEFLATE=m
# CONFIG_PPP_BSDCOMP is not set
CONFIG_PPP_MPPE=m
CONFIG_PPPOE=m
CONFIG_PPPOATM=m
CONFIG_SLIP=m
CONFIG_SLIP_COMPRESSED=y
CONFIG_SLHC=m
CONFIG_SLIP_SMART=y
# CONFIG_SLIP_MODE_SLIP6 is not set
CONFIG_NET_FC=y
CONFIG_NETCONSOLE=m
# CONFIG_NETCONSOLE_DYNAMIC is not set
CONFIG_NETPOLL=y
CONFIG_NETPOLL_TRAP=y
CONFIG_NET_POLL_CONTROLLER=y
# CONFIG_VMXNET3 is not set
# CONFIG_ISDN is not set
# CONFIG_PHONE is not set

#
# Input device support
#
CONFIG_INPUT=y
CONFIG_INPUT_FF_MEMLESS=y
# CONFIG_INPUT_POLLDEV is not set
# CONFIG_INPUT_SPARSEKMAP is not set

#
# Userland interfaces
#
CONFIG_INPUT_MOUSEDEV=y
# CONFIG_INPUT_MOUSEDEV_PSAUX is not set
CONFIG_INPUT_MOUSEDEV_SCREEN_X=1024
CONFIG_INPUT_MOUSEDEV_SCREEN_Y=768
CONFIG_INPUT_JOYDEV=m
CONFIG_INPUT_EVDEV=y
# CONFIG_INPUT_EVBUG is not set

#
# Input Device Drivers
#
CONFIG_INPUT_KEYBOARD=y
# CONFIG_KEYBOARD_ADP5588 is not set
# CONFIG_KEYBOARD_ADP5589 is not set
CONFIG_KEYBOARD_ATKBD=y
# CONFIG_KEYBOARD_QT1070 is not set
# CONFIG_KEYBOARD_QT2160 is not set
# CONFIG_KEYBOARD_LKKBD is not set
# CONFIG_KEYBOARD_TCA6416 is not set
# CONFIG_KEYBOARD_LM8323 is not set
# CONFIG_KEYBOARD_MAX7359 is not set
# CONFIG_KEYBOARD_MCS is not set
# CONFIG_KEYBOARD_MPR121 is not set
# CONFIG_KEYBOARD_NEWTON is not set
# CONFIG_KEYBOARD_OPENCORES is not set
# CONFIG_KEYBOARD_STOWAWAY is not set
# CONFIG_KEYBOARD_SUNKBD is not set
# CONFIG_KEYBOARD_XTKBD is not set
CONFIG_INPUT_MOUSE=y
CONFIG_MOUSE_PS2=y
CONFIG_MOUSE_PS2_ALPS=y
CONFIG_MOUSE_PS2_LOGIPS2PP=y
CONFIG_MOUSE_PS2_SYNAPTICS=y
CONFIG_MOUSE_PS2_TRACKPOINT=y
# CONFIG_MOUSE_PS2_ELANTECH is not set
# CONFIG_MOUSE_PS2_SENTELIC is not set
# CONFIG_MOUSE_PS2_TOUCHKIT is not set
CONFIG_MOUSE_SERIAL=m
# CONFIG_MOUSE_APPLETOUCH is not set
# CONFIG_MOUSE_BCM5974 is not set
CONFIG_MOUSE_VSXXXAA=m
# CONFIG_MOUSE_SYNAPTICS_I2C is not set
CONFIG_INPUT_JOYSTICK=y
# CONFIG_JOYSTICK_ANALOG is not set
# CONFIG_JOYSTICK_A3D is not set
# CONFIG_JOYSTICK_ADI is not set
# CONFIG_JOYSTICK_COBRA is not set
# CONFIG_JOYSTICK_GF2K is not set
# CONFIG_JOYSTICK_GRIP is not set
# CONFIG_JOYSTICK_GRIP_MP is not set
# CONFIG_JOYSTICK_GUILLEMOT is not set
# CONFIG_JOYSTICK_INTERACT is not set
# CONFIG_JOYSTICK_SIDEWINDER is not set
# CONFIG_JOYSTICK_TMDC is not set
# CONFIG_JOYSTICK_IFORCE is not set
# CONFIG_JOYSTICK_WARRIOR is not set
# CONFIG_JOYSTICK_MAGELLAN is not set
# CONFIG_JOYSTICK_SPACEORB is not set
# CONFIG_JOYSTICK_SPACEBALL is not set
# CONFIG_JOYSTICK_STINGER is not set
CONFIG_JOYSTICK_TWIDJOY=m
# CONFIG_JOYSTICK_ZHENHUA is not set
# CONFIG_JOYSTICK_DB9 is not set
# CONFIG_JOYSTICK_GAMECON is not set
# CONFIG_JOYSTICK_TURBOGRAFX is not set
# CONFIG_JOYSTICK_AS5011 is not set
CONFIG_JOYSTICK_JOYDUMP=m
# CONFIG_JOYSTICK_XPAD is not set
# CONFIG_INPUT_TABLET is not set
CONFIG_INPUT_TOUCHSCREEN=y
# CONFIG_TOUCHSCREEN_AD7879 is not set
# CONFIG_TOUCHSCREEN_ATMEL_MXT is not set
# CONFIG_TOUCHSCREEN_BU21013 is not set
# CONFIG_TOUCHSCREEN_DYNAPRO is not set
# CONFIG_TOUCHSCREEN_HAMPSHIRE is not set
# CONFIG_TOUCHSCREEN_EETI is not set
# CONFIG_TOUCHSCREEN_FUJITSU is not set
CONFIG_TOUCHSCREEN_GUNZE=m
CONFIG_TOUCHSCREEN_ELO=m
# CONFIG_TOUCHSCREEN_WACOM_W8001 is not set
# CONFIG_TOUCHSCREEN_MAX11801 is not set
# CONFIG_TOUCHSCREEN_MCS5000 is not set
CONFIG_TOUCHSCREEN_MTOUCH=m
# CONFIG_TOUCHSCREEN_INEXIO is not set
CONFIG_TOUCHSCREEN_MK712=m
# CONFIG_TOUCHSCREEN_PENMOUNT is not set
# CONFIG_TOUCHSCREEN_TOUCHRIGHT is not set
# CONFIG_TOUCHSCREEN_TOUCHWIN is not set
# CONFIG_TOUCHSCREEN_USB_COMPOSITE is not set
# CONFIG_TOUCHSCREEN_TOUCHIT213 is not set
# CONFIG_TOUCHSCREEN_TSC2007 is not set
# CONFIG_TOUCHSCREEN_ST1232 is not set
# CONFIG_TOUCHSCREEN_TPS6507X is not set
CONFIG_INPUT_MISC=y
# CONFIG_INPUT_AD714X is not set
# CONFIG_INPUT_ATI_REMOTE is not set
# CONFIG_INPUT_ATI_REMOTE2 is not set
# CONFIG_INPUT_KEYSPAN_REMOTE is not set
# CONFIG_INPUT_POWERMATE is not set
# CONFIG_INPUT_YEALINK is not set
# CONFIG_INPUT_CM109 is not set
CONFIG_INPUT_UINPUT=m
# CONFIG_INPUT_PCF8574 is not set
# CONFIG_INPUT_ADXL34X is not set
# CONFIG_INPUT_CMA3000 is not set

#
# Hardware I/O ports
#
CONFIG_SERIO=y
CONFIG_SERIO_I8042=y
CONFIG_SERIO_SERPORT=y
# CONFIG_SERIO_PARKBD is not set
# CONFIG_SERIO_PCIPS2 is not set
CONFIG_SERIO_LIBPS2=y
CONFIG_SERIO_RAW=m
# CONFIG_SERIO_XILINX_XPS_PS2 is not set
# CONFIG_SERIO_ALTERA_PS2 is not set
# CONFIG_SERIO_PS2MULT is not set
CONFIG_GAMEPORT=m
CONFIG_GAMEPORT_NS558=m
CONFIG_GAMEPORT_L4=m
CONFIG_GAMEPORT_EMU10K1=m
CONFIG_GAMEPORT_FM801=m

#
# Character devices
#
CONFIG_VT=y
CONFIG_CONSOLE_TRANSLATIONS=y
CONFIG_VT_CONSOLE=y
CONFIG_HW_CONSOLE=y
CONFIG_VT_HW_CONSOLE_BINDING=y
CONFIG_UNIX98_PTYS=y
# CONFIG_DEVPTS_MULTIPLE_INSTANCES is not set
# CONFIG_LEGACY_PTYS is not set
CONFIG_SERIAL_NONSTANDARD=y
# CONFIG_ROCKETPORT is not set
CONFIG_CYCLADES=m
# CONFIG_CYZ_INTR is not set
# CONFIG_MOXA_INTELLIO is not set
# CONFIG_MOXA_SMARTIO is not set
CONFIG_SYNCLINK=m
CONFIG_SYNCLINKMP=m
CONFIG_SYNCLINK_GT=m
# CONFIG_NOZOMI is not set
# CONFIG_ISI is not set
CONFIG_N_HDLC=m
# CONFIG_N_GSM is not set
# CONFIG_TRACE_SINK is not set
CONFIG_DEVKMEM=y
# CONFIG_STALDRV is not set

#
# Serial drivers
#
CONFIG_SERIAL_8250=y
CONFIG_SERIAL_8250_CONSOLE=y
CONFIG_SERIAL_8250_PCI=y
CONFIG_SERIAL_8250_CS=m
CONFIG_SERIAL_8250_NR_UARTS=32
CONFIG_SERIAL_8250_RUNTIME_UARTS=4
CONFIG_SERIAL_8250_EXTENDED=y
CONFIG_SERIAL_8250_MANY_PORTS=y
CONFIG_SERIAL_8250_SHARE_IRQ=y
CONFIG_SERIAL_8250_DETECT_IRQ=y
CONFIG_SERIAL_8250_RSA=y

#
# Non-8250 serial port support
#
# CONFIG_SERIAL_MFD_HSU is not set
# CONFIG_SERIAL_UARTLITE is not set
CONFIG_SERIAL_CORE=y
CONFIG_SERIAL_CORE_CONSOLE=y
CONFIG_SERIAL_JSM=m
# CONFIG_SERIAL_OF_PLATFORM is not set
# CONFIG_SERIAL_TIMBERDALE is not set
# CONFIG_SERIAL_ALTERA_JTAGUART is not set
# CONFIG_SERIAL_ALTERA_UART is not set
# CONFIG_SERIAL_PCH_UART is not set
# CONFIG_SERIAL_XILINX_PS_UART is not set
CONFIG_PRINTER=m
CONFIG_LP_CONSOLE=y
CONFIG_PPDEV=m
# CONFIG_HVC_UDBG is not set
CONFIG_IPMI_HANDLER=m
CONFIG_IPMI_PANIC_EVENT=y
CONFIG_IPMI_PANIC_STRING=y
CONFIG_IPMI_DEVICE_INTERFACE=m
CONFIG_IPMI_SI=m
CONFIG_IPMI_WATCHDOG=m
CONFIG_IPMI_POWEROFF=m
CONFIG_HW_RANDOM=y
# CONFIG_HW_RANDOM_TIMERIOMEM is not set
CONFIG_NVRAM=y
# CONFIG_GEN_RTC is not set
# CONFIG_R3964 is not set
# CONFIG_APPLICOM is not set

#
# PCMCIA character devices
#
# CONFIG_SYNCLINK_CS is not set
CONFIG_CARDMAN_4000=m
CONFIG_CARDMAN_4040=m
# CONFIG_IPWIRELESS is not set
CONFIG_RAW_DRIVER=y
CONFIG_MAX_RAW_DEVS=8192
CONFIG_TCG_TPM=m
CONFIG_TCG_TIS=m
CONFIG_TCG_NSC=m
CONFIG_TCG_ATMEL=m
CONFIG_DEVPORT=y
# CONFIG_RAMOOPS is not set
CONFIG_I2C=m
CONFIG_I2C_BOARDINFO=y
CONFIG_I2C_COMPAT=y
CONFIG_I2C_CHARDEV=m
# CONFIG_I2C_MUX is not set
CONFIG_I2C_HELPER_AUTO=y
CONFIG_I2C_SMBUS=m
CONFIG_I2C_ALGOBIT=m

#
# I2C Hardware Bus support
#

#
# PC SMBus host controller drivers
#
# CONFIG_I2C_ALI1535 is not set
# CONFIG_I2C_ALI1563 is not set
# CONFIG_I2C_ALI15X3 is not set
CONFIG_I2C_AMD756=m
CONFIG_I2C_AMD8111=m
CONFIG_I2C_I801=m
# CONFIG_I2C_ISCH is not set
CONFIG_I2C_PIIX4=m
CONFIG_I2C_NFORCE2=m
# CONFIG_I2C_SIS5595 is not set
# CONFIG_I2C_SIS630 is not set
CONFIG_I2C_SIS96X=m
CONFIG_I2C_VIA=m
CONFIG_I2C_VIAPRO=m

#
# I2C system bus drivers (mostly embedded / system-on-chip)
#
# CONFIG_I2C_INTEL_MID is not set
# CONFIG_I2C_MPC is not set
# CONFIG_I2C_OCORES is not set
# CONFIG_I2C_PCA_PLATFORM is not set
# CONFIG_I2C_PXA_PCI is not set
# CONFIG_I2C_SIMTEC is not set
# CONFIG_I2C_XILINX is not set
# CONFIG_I2C_EG20T is not set

#
# External I2C/SMBus adapter drivers
#
# CONFIG_I2C_DIOLAN_U2C is not set
CONFIG_I2C_PARPORT=m
CONFIG_I2C_PARPORT_LIGHT=m
# CONFIG_I2C_TAOS_EVM is not set
# CONFIG_I2C_TINY_USB is not set

#
# Other I2C/SMBus bus drivers
#
CONFIG_I2C_STUB=m
# CONFIG_I2C_DEBUG_CORE is not set
# CONFIG_I2C_DEBUG_ALGO is not set
# CONFIG_I2C_DEBUG_BUS is not set
# CONFIG_SPI is not set

#
# PPS support
#
# CONFIG_PPS is not set

#
# PPS generators support
#

#
# PTP clock support
#

#
# Enable Device Drivers -> PPS to see the PTP clock options.
#
CONFIG_ARCH_WANT_OPTIONAL_GPIOLIB=y
# CONFIG_GPIOLIB is not set
# CONFIG_W1 is not set
CONFIG_POWER_SUPPLY=m
# CONFIG_POWER_SUPPLY_DEBUG is not set
# CONFIG_PDA_POWER is not set
# CONFIG_TEST_POWER is not set
# CONFIG_BATTERY_DS2780 is not set
# CONFIG_BATTERY_DS2782 is not set
# CONFIG_BATTERY_BQ20Z75 is not set
# CONFIG_BATTERY_BQ27x00 is not set
# CONFIG_BATTERY_MAX17040 is not set
# CONFIG_BATTERY_MAX17042 is not set
# CONFIG_CHARGER_MAX8903 is not set
CONFIG_HWMON=m
CONFIG_HWMON_VID=m
# CONFIG_HWMON_DEBUG_CHIP is not set

#
# Native drivers
#
# CONFIG_SENSORS_AD7414 is not set
# CONFIG_SENSORS_AD7418 is not set
CONFIG_SENSORS_ADM1021=m
CONFIG_SENSORS_ADM1025=m
CONFIG_SENSORS_ADM1026=m
# CONFIG_SENSORS_ADM1029 is not set
CONFIG_SENSORS_ADM1031=m
CONFIG_SENSORS_ADM9240=m
# CONFIG_SENSORS_ADT7411 is not set
# CONFIG_SENSORS_ADT7462 is not set
# CONFIG_SENSORS_ADT7470 is not set
# CONFIG_SENSORS_ADT7475 is not set
# CONFIG_SENSORS_ASC7621 is not set
CONFIG_SENSORS_ATXP1=m
# CONFIG_SENSORS_DS620 is not set
CONFIG_SENSORS_DS1621=m
# CONFIG_SENSORS_I5K_AMB is not set
CONFIG_SENSORS_F71805F=m
# CONFIG_SENSORS_F71882FG is not set
# CONFIG_SENSORS_F75375S is not set
# CONFIG_SENSORS_G760A is not set
CONFIG_SENSORS_GL518SM=m
CONFIG_SENSORS_GL520SM=m
# CONFIG_SENSORS_IBMAEM is not set
# CONFIG_SENSORS_IBMPEX is not set
CONFIG_SENSORS_IT87=m
# CONFIG_SENSORS_JC42 is not set
# CONFIG_SENSORS_LINEAGE is not set
CONFIG_SENSORS_LM63=m
# CONFIG_SENSORS_LM73 is not set
CONFIG_SENSORS_LM75=m
CONFIG_SENSORS_LM77=m
CONFIG_SENSORS_LM78=m
CONFIG_SENSORS_LM80=m
CONFIG_SENSORS_LM83=m
CONFIG_SENSORS_LM85=m
CONFIG_SENSORS_LM87=m
CONFIG_SENSORS_LM90=m
CONFIG_SENSORS_LM92=m
# CONFIG_SENSORS_LM93 is not set
# CONFIG_SENSORS_LTC4151 is not set
# CONFIG_SENSORS_LTC4215 is not set
# CONFIG_SENSORS_LTC4245 is not set
# CONFIG_SENSORS_LTC4261 is not set
# CONFIG_SENSORS_LM95241 is not set
# CONFIG_SENSORS_MAX16065 is not set
CONFIG_SENSORS_MAX1619=m
# CONFIG_SENSORS_MAX6639 is not set
# CONFIG_SENSORS_MAX6642 is not set
# CONFIG_SENSORS_MAX6650 is not set
CONFIG_SENSORS_PC87360=m
# CONFIG_SENSORS_PC87427 is not set
CONFIG_SENSORS_PCF8591=m
# CONFIG_PMBUS is not set
# CONFIG_SENSORS_SHT21 is not set
CONFIG_SENSORS_SIS5595=m
# CONFIG_SENSORS_SMM665 is not set
# CONFIG_SENSORS_DME1737 is not set
# CONFIG_SENSORS_EMC1403 is not set
# CONFIG_SENSORS_EMC2103 is not set
# CONFIG_SENSORS_EMC6W201 is not set
CONFIG_SENSORS_SMSC47M1=m
CONFIG_SENSORS_SMSC47M192=m
CONFIG_SENSORS_SMSC47B397=m
# CONFIG_SENSORS_SCH5627 is not set
# CONFIG_SENSORS_ADS1015 is not set
# CONFIG_SENSORS_ADS7828 is not set
# CONFIG_SENSORS_AMC6821 is not set
# CONFIG_SENSORS_THMC50 is not set
# CONFIG_SENSORS_TMP102 is not set
# CONFIG_SENSORS_TMP401 is not set
# CONFIG_SENSORS_TMP421 is not set
CONFIG_SENSORS_VIA686A=m
# CONFIG_SENSORS_VT1211 is not set
CONFIG_SENSORS_VT8231=m
CONFIG_SENSORS_W83781D=m
CONFIG_SENSORS_W83791D=m
CONFIG_SENSORS_W83792D=m
# CONFIG_SENSORS_W83793 is not set
# CONFIG_SENSORS_W83795 is not set
CONFIG_SENSORS_W83L785TS=m
# CONFIG_SENSORS_W83L786NG is not set
CONFIG_SENSORS_W83627HF=m
CONFIG_SENSORS_W83627EHF=m
# CONFIG_THERMAL is not set
CONFIG_WATCHDOG=y
# CONFIG_WATCHDOG_NOWAYOUT is not set

#
# Watchdog Device Drivers
#
CONFIG_SOFT_WATCHDOG=m
CONFIG_ALIM7101_WDT=m
# CONFIG_BOOKE_WDT is not set

#
# PCI-based Watchdog Cards
#
CONFIG_PCIPCWATCHDOG=m
CONFIG_WDTPCI=m

#
# USB-based Watchdog Cards
#
CONFIG_USBPCWATCHDOG=m
CONFIG_SSB_POSSIBLE=y

#
# Sonics Silicon Backplane
#
CONFIG_SSB=m
CONFIG_SSB_SPROM=y
CONFIG_SSB_PCIHOST_POSSIBLE=y
CONFIG_SSB_PCIHOST=y
# CONFIG_SSB_B43_PCI_BRIDGE is not set
CONFIG_SSB_PCMCIAHOST_POSSIBLE=y
# CONFIG_SSB_PCMCIAHOST is not set
CONFIG_SSB_SDIOHOST_POSSIBLE=y
# CONFIG_SSB_SDIOHOST is not set
# CONFIG_SSB_DEBUG is not set
CONFIG_SSB_DRIVER_PCICORE_POSSIBLE=y
CONFIG_SSB_DRIVER_PCICORE=y
CONFIG_BCMA_POSSIBLE=y

#
# Broadcom specific AMBA
#
# CONFIG_BCMA is not set
CONFIG_MFD_SUPPORT=y
# CONFIG_MFD_CORE is not set
# CONFIG_MFD_SM501 is not set
# CONFIG_HTC_PASIC3 is not set
# CONFIG_TPS6105X is not set
# CONFIG_TPS6507X is not set
# CONFIG_MFD_TMIO is not set
# CONFIG_MFD_WM8400 is not set
# CONFIG_MFD_PCF50633 is not set
# CONFIG_ABX500_CORE is not set
# CONFIG_LPC_SCH is not set
# CONFIG_MFD_RDC321X is not set
# CONFIG_MFD_JANZ_CMODIO is not set
# CONFIG_MFD_VX855 is not set
# CONFIG_MFD_WL1273_CORE is not set
# CONFIG_REGULATOR is not set
# CONFIG_MEDIA_SUPPORT is not set

#
# Graphics support
#
CONFIG_AGP=y
CONFIG_VGA_ARB=y
CONFIG_VGA_ARB_MAX_GPUS=16
CONFIG_DRM=m
CONFIG_DRM_KMS_HELPER=m
CONFIG_DRM_TTM=m
# CONFIG_DRM_TDFX is not set
CONFIG_DRM_R128=m
CONFIG_DRM_RADEON=m
# CONFIG_DRM_RADEON_KMS is not set
CONFIG_DRM_MGA=m
# CONFIG_DRM_SIS is not set
CONFIG_DRM_VIA=m
CONFIG_DRM_SAVAGE=m
# CONFIG_STUB_POULSBO is not set
CONFIG_VGASTATE=m
# CONFIG_VIDEO_OUTPUT_CONTROL is not set
CONFIG_FB=y
# CONFIG_FIRMWARE_EDID is not set
CONFIG_FB_DDC=m
# CONFIG_FB_BOOT_VESA_SUPPORT is not set
CONFIG_FB_CFB_FILLRECT=m
CONFIG_FB_CFB_COPYAREA=m
CONFIG_FB_CFB_IMAGEBLIT=m
# CONFIG_FB_CFB_REV_PIXELS_IN_BYTE is not set
# CONFIG_FB_SYS_FILLRECT is not set
# CONFIG_FB_SYS_COPYAREA is not set
# CONFIG_FB_SYS_IMAGEBLIT is not set
# CONFIG_FB_FOREIGN_ENDIAN is not set
# CONFIG_FB_SYS_FOPS is not set
# CONFIG_FB_WMT_GE_ROPS is not set
# CONFIG_FB_SVGALIB is not set
# CONFIG_FB_MACMODES is not set
CONFIG_FB_BACKLIGHT=y
CONFIG_FB_MODE_HELPERS=y
CONFIG_FB_TILEBLITTING=y

#
# Frame buffer hardware drivers
#
CONFIG_FB_CIRRUS=m
# CONFIG_FB_PM2 is not set
# CONFIG_FB_CYBER2000 is not set
# CONFIG_FB_OF is not set
# CONFIG_FB_CT65550 is not set
# CONFIG_FB_ASILIANT is not set
# CONFIG_FB_IMSTT is not set
CONFIG_FB_VGA16=m
# CONFIG_FB_UVESA is not set
# CONFIG_FB_S1D13XXX is not set
CONFIG_FB_NVIDIA=m
CONFIG_FB_NVIDIA_I2C=y
# CONFIG_FB_NVIDIA_DEBUG is not set
CONFIG_FB_NVIDIA_BACKLIGHT=y
CONFIG_FB_RIVA=m
# CONFIG_FB_RIVA_I2C is not set
# CONFIG_FB_RIVA_DEBUG is not set
CONFIG_FB_RIVA_BACKLIGHT=y
# CONFIG_FB_MATROX is not set
# CONFIG_FB_RADEON is not set
# CONFIG_FB_ATY128 is not set
# CONFIG_FB_ATY is not set
# CONFIG_FB_S3 is not set
CONFIG_FB_SAVAGE=m
CONFIG_FB_SAVAGE_I2C=y
CONFIG_FB_SAVAGE_ACCEL=y
# CONFIG_FB_SIS is not set
# CONFIG_FB_NEOMAGIC is not set
CONFIG_FB_KYRO=m
# CONFIG_FB_3DFX is not set
# CONFIG_FB_VOODOO1 is not set
# CONFIG_FB_VT8623 is not set
# CONFIG_FB_TRIDENT is not set
# CONFIG_FB_ARK is not set
# CONFIG_FB_PM3 is not set
# CONFIG_FB_CARMINE is not set
# CONFIG_FB_FSL_DIU is not set
# CONFIG_FB_UDL is not set
# CONFIG_FB_IBM_GXT4500 is not set
# CONFIG_FB_VIRTUAL is not set
# CONFIG_FB_METRONOME is not set
# CONFIG_FB_MB862XX is not set
# CONFIG_FB_BROADSHEET is not set
CONFIG_BACKLIGHT_LCD_SUPPORT=y
CONFIG_LCD_CLASS_DEVICE=m
# CONFIG_LCD_PLATFORM is not set
CONFIG_BACKLIGHT_CLASS_DEVICE=y
CONFIG_BACKLIGHT_GENERIC=y
# CONFIG_BACKLIGHT_ADP8860 is not set
# CONFIG_BACKLIGHT_ADP8870 is not set

#
# Display device support
#
# CONFIG_DISPLAY_SUPPORT is not set

#
# Console display driver support
#
CONFIG_VGA_CONSOLE=y
CONFIG_VGACON_SOFT_SCROLLBACK=y
CONFIG_VGACON_SOFT_SCROLLBACK_SIZE=64
CONFIG_DUMMY_CONSOLE=y
CONFIG_FRAMEBUFFER_CONSOLE=y
CONFIG_FRAMEBUFFER_CONSOLE_DETECT_PRIMARY=y
CONFIG_FRAMEBUFFER_CONSOLE_ROTATION=y
# CONFIG_FONTS is not set
CONFIG_FONT_8x8=y
CONFIG_FONT_8x16=y
CONFIG_LOGO=y
# CONFIG_LOGO_LINUX_MONO is not set
# CONFIG_LOGO_LINUX_VGA16 is not set
CONFIG_LOGO_LINUX_CLUT224=y
# CONFIG_SOUND is not set
# CONFIG_HID_SUPPORT is not set
CONFIG_USB_SUPPORT=y
CONFIG_USB_ARCH_HAS_HCD=y
CONFIG_USB_ARCH_HAS_OHCI=y
CONFIG_USB_ARCH_HAS_EHCI=y
CONFIG_USB=y
# CONFIG_USB_DEBUG is not set
# CONFIG_USB_ANNOUNCE_NEW_DEVICES is not set

#
# Miscellaneous USB options
#
CONFIG_USB_DEVICEFS=y
CONFIG_USB_DEVICE_CLASS=y
# CONFIG_USB_DYNAMIC_MINORS is not set
CONFIG_USB_MON=y
# CONFIG_USB_WUSB is not set
# CONFIG_USB_WUSB_CBAF is not set

#
# USB Host Controller Drivers
#
# CONFIG_USB_C67X00_HCD is not set
# CONFIG_USB_XHCI_HCD is not set
CONFIG_USB_EHCI_HCD=m
CONFIG_USB_EHCI_ROOT_HUB_TT=y
CONFIG_USB_EHCI_TT_NEWSCHED=y
# CONFIG_XPS_USB_HCD_XILINX is not set
# CONFIG_USB_EHCI_FSL is not set
CONFIG_USB_EHCI_HCD_PPC_OF=y
# CONFIG_USB_OXU210HP_HCD is not set
CONFIG_USB_ISP116X_HCD=m
# CONFIG_USB_ISP1760_HCD is not set
# CONFIG_USB_ISP1362_HCD is not set
CONFIG_USB_OHCI_HCD=m
# CONFIG_USB_OHCI_HCD_PPC_OF_BE is not set
# CONFIG_USB_OHCI_HCD_PPC_OF_LE is not set
# CONFIG_USB_OHCI_HCD_PPC_OF is not set
# CONFIG_USB_OHCI_HCD_SSB is not set
# CONFIG_USB_OHCI_BIG_ENDIAN_DESC is not set
# CONFIG_USB_OHCI_BIG_ENDIAN_MMIO is not set
CONFIG_USB_OHCI_LITTLE_ENDIAN=y
CONFIG_USB_UHCI_HCD=m
CONFIG_USB_SL811_HCD=m
# CONFIG_USB_SL811_HCD_ISO is not set
CONFIG_USB_SL811_CS=m
# CONFIG_USB_R8A66597_HCD is not set
# CONFIG_USB_WHCI_HCD is not set
# CONFIG_USB_HWA_HCD is not set

#
# USB Device Class drivers
#
CONFIG_USB_ACM=m
CONFIG_USB_PRINTER=m
# CONFIG_USB_WDM is not set
# CONFIG_USB_TMC is not set

#
# NOTE: USB_STORAGE depends on SCSI but BLK_DEV_SD may
#

#
# also be needed; see USB_STORAGE Help for more info
#
CONFIG_USB_STORAGE=m
# CONFIG_USB_STORAGE_DEBUG is not set
# CONFIG_USB_STORAGE_REALTEK is not set
CONFIG_USB_STORAGE_DATAFAB=m
CONFIG_USB_STORAGE_FREECOM=m
CONFIG_USB_STORAGE_ISD200=m
CONFIG_USB_STORAGE_USBAT=m
CONFIG_USB_STORAGE_SDDR09=m
CONFIG_USB_STORAGE_SDDR55=m
CONFIG_USB_STORAGE_JUMPSHOT=m
CONFIG_USB_STORAGE_ALAUDA=m
# CONFIG_USB_STORAGE_ONETOUCH is not set
# CONFIG_USB_STORAGE_KARMA is not set
# CONFIG_USB_STORAGE_CYPRESS_ATACB is not set
# CONFIG_USB_STORAGE_ENE_UB6250 is not set
# CONFIG_USB_UAS is not set
# CONFIG_USB_LIBUSUAL is not set

#
# USB Imaging devices
#
CONFIG_USB_MDC800=m
CONFIG_USB_MICROTEK=m

#
# USB port drivers
#
CONFIG_USB_USS720=m
# CONFIG_USB_SERIAL is not set

#
# USB Miscellaneous drivers
#
CONFIG_USB_EMI62=m
CONFIG_USB_EMI26=m
# CONFIG_USB_ADUTUX is not set
# CONFIG_USB_SEVSEG is not set
CONFIG_USB_RIO500=m
CONFIG_USB_LEGOTOWER=m
CONFIG_USB_LCD=m
CONFIG_USB_LED=m
# CONFIG_USB_CYPRESS_CY7C63 is not set
# CONFIG_USB_CYTHERM is not set
CONFIG_USB_IDMOUSE=m
# CONFIG_USB_FTDI_ELAN is not set
CONFIG_USB_APPLEDISPLAY=m
CONFIG_USB_SISUSBVGA=m
CONFIG_USB_SISUSBVGA_CON=y
CONFIG_USB_LD=m
# CONFIG_USB_TRANCEVIBRATOR is not set
# CONFIG_USB_IOWARRIOR is not set
CONFIG_USB_TEST=m
# CONFIG_USB_ISIGHTFW is not set
# CONFIG_USB_YUREX is not set
CONFIG_USB_ATM=m
CONFIG_USB_SPEEDTOUCH=m
CONFIG_USB_CXACRU=m
CONFIG_USB_UEAGLEATM=m
CONFIG_USB_XUSBATM=m
# CONFIG_USB_GADGET is not set

#
# OTG and related infrastructure
#
# CONFIG_NOP_USB_XCEIV is not set
# CONFIG_UWB is not set
CONFIG_MMC=m
# CONFIG_MMC_DEBUG is not set
# CONFIG_MMC_UNSAFE_RESUME is not set
# CONFIG_MMC_CLKGATE is not set

#
# MMC/SD/SDIO Card Drivers
#
CONFIG_MMC_BLOCK=m
CONFIG_MMC_BLOCK_MINORS=8
CONFIG_MMC_BLOCK_BOUNCE=y
# CONFIG_SDIO_UART is not set
# CONFIG_MMC_TEST is not set

#
# MMC/SD/SDIO Host Controller Drivers
#
CONFIG_MMC_SDHCI=m
# CONFIG_MMC_SDHCI_PCI is not set
# CONFIG_MMC_SDHCI_OF is not set
# CONFIG_MMC_SDHCI_PLTFM is not set
CONFIG_MMC_WBSD=m
# CONFIG_MMC_TIFM_SD is not set
# CONFIG_MMC_SDRICOH_CS is not set
# CONFIG_MMC_CB710 is not set
# CONFIG_MMC_VIA_SDMMC is not set
# CONFIG_MMC_VUB300 is not set
# CONFIG_MMC_USHC is not set
# CONFIG_MEMSTICK is not set
CONFIG_NEW_LEDS=y
CONFIG_LEDS_CLASS=y

#
# LED drivers
#
# CONFIG_LEDS_LM3530 is not set
# CONFIG_LEDS_PCA9532 is not set
# CONFIG_LEDS_LP3944 is not set
# CONFIG_LEDS_LP5521 is not set
# CONFIG_LEDS_LP5523 is not set
# CONFIG_LEDS_PCA955X is not set
# CONFIG_LEDS_BD2802 is not set
CONFIG_LEDS_TRIGGERS=y

#
# LED Triggers
#
CONFIG_LEDS_TRIGGER_TIMER=m
CONFIG_LEDS_TRIGGER_IDE_DISK=y
CONFIG_LEDS_TRIGGER_HEARTBEAT=m
# CONFIG_LEDS_TRIGGER_BACKLIGHT is not set
# CONFIG_LEDS_TRIGGER_DEFAULT_ON is not set

#
# iptables trigger is under Netfilter config (LED target)
#
# CONFIG_NFC_DEVICES is not set
# CONFIG_ACCESSIBILITY is not set
CONFIG_INFINIBAND=m
CONFIG_INFINIBAND_USER_MAD=m
CONFIG_INFINIBAND_USER_ACCESS=m
CONFIG_INFINIBAND_USER_MEM=y
CONFIG_INFINIBAND_ADDR_TRANS=y
CONFIG_INFINIBAND_MTHCA=m
CONFIG_INFINIBAND_MTHCA_DEBUG=y
CONFIG_INFINIBAND_AMSO1100=m
# CONFIG_INFINIBAND_AMSO1100_DEBUG is not set
CONFIG_INFINIBAND_CXGB3=m
# CONFIG_INFINIBAND_CXGB3_DEBUG is not set
CONFIG_MLX4_INFINIBAND=m
CONFIG_INFINIBAND_NES=m
# CONFIG_INFINIBAND_NES_DEBUG is not set
CONFIG_INFINIBAND_IPOIB=m
CONFIG_INFINIBAND_IPOIB_CM=y
CONFIG_INFINIBAND_IPOIB_DEBUG=y
# CONFIG_INFINIBAND_IPOIB_DEBUG_DATA is not set
CONFIG_INFINIBAND_SRP=m
CONFIG_INFINIBAND_ISER=m
CONFIG_EDAC=y

#
# Reporting subsystems
#
# CONFIG_EDAC_DEBUG is not set
CONFIG_EDAC_MM_EDAC=m
# CONFIG_EDAC_MPC85XX is not set
# CONFIG_RTC_CLASS is not set
# CONFIG_DMADEVICES is not set
# CONFIG_AUXDISPLAY is not set
# CONFIG_UIO is not set
# CONFIG_STAGING is not set

#
# File systems
#
CONFIG_EXT2_FS=y
CONFIG_EXT2_FS_XATTR=y
CONFIG_EXT2_FS_POSIX_ACL=y
CONFIG_EXT2_FS_SECURITY=y
CONFIG_EXT2_FS_XIP=y
CONFIG_EXT3_FS=m
CONFIG_EXT3_DEFAULTS_TO_ORDERED=y
CONFIG_EXT3_FS_XATTR=y
CONFIG_EXT3_FS_POSIX_ACL=y
CONFIG_EXT3_FS_SECURITY=y
# CONFIG_EXT4_FS is not set
CONFIG_FS_XIP=y
CONFIG_JBD=m
# CONFIG_JBD_DEBUG is not set
CONFIG_FS_MBCACHE=y
# CONFIG_REISERFS_FS is not set
# CONFIG_JFS_FS is not set
# CONFIG_XFS_FS is not set
CONFIG_GFS2_FS=m
# CONFIG_GFS2_FS_LOCKING_DLM is not set
# CONFIG_OCFS2_FS is not set
# CONFIG_BTRFS_FS is not set
# CONFIG_NILFS2_FS is not set
CONFIG_FS_POSIX_ACL=y
CONFIG_EXPORTFS=m
CONFIG_FILE_LOCKING=y
CONFIG_FSNOTIFY=y
CONFIG_DNOTIFY=y
CONFIG_INOTIFY_USER=y
# CONFIG_FANOTIFY is not set
CONFIG_QUOTA=y
# CONFIG_QUOTA_NETLINK_INTERFACE is not set
CONFIG_PRINT_QUOTA_WARNING=y
# CONFIG_QUOTA_DEBUG is not set
CONFIG_QUOTA_TREE=y
# CONFIG_QFMT_V1 is not set
CONFIG_QFMT_V2=y
CONFIG_QUOTACTL=y
CONFIG_AUTOFS4_FS=m
# CONFIG_FUSE_FS is not set

#
# Caches
#
CONFIG_FSCACHE=m
# CONFIG_FSCACHE_STATS is not set
# CONFIG_FSCACHE_HISTOGRAM is not set
# CONFIG_FSCACHE_DEBUG is not set
# CONFIG_FSCACHE_OBJECT_LIST is not set
CONFIG_CACHEFILES=m
CONFIG_CACHEFILES_DEBUG=y
# CONFIG_CACHEFILES_HISTOGRAM is not set

#
# CD-ROM/DVD Filesystems
#
CONFIG_ISO9660_FS=y
CONFIG_JOLIET=y
CONFIG_ZISOFS=y
CONFIG_UDF_FS=m
CONFIG_UDF_NLS=y

#
# DOS/FAT/NT Filesystems
#
CONFIG_FAT_FS=m
CONFIG_MSDOS_FS=m
CONFIG_VFAT_FS=m
CONFIG_FAT_DEFAULT_CODEPAGE=437
CONFIG_FAT_DEFAULT_IOCHARSET="ascii"
# CONFIG_NTFS_FS is not set

#
# Pseudo filesystems
#
CONFIG_PROC_FS=y
CONFIG_PROC_KCORE=y
CONFIG_PROC_VMCORE=y
CONFIG_PROC_SYSCTL=y
CONFIG_PROC_PAGE_MONITOR=y
CONFIG_SYSFS=y
CONFIG_TMPFS=y
# CONFIG_TMPFS_POSIX_ACL is not set
# CONFIG_TMPFS_XATTR is not set
# CONFIG_HUGETLB_PAGE is not set
CONFIG_CONFIGFS_FS=m
CONFIG_MISC_FILESYSTEMS=y
# CONFIG_ADFS_FS is not set
# CONFIG_AFFS_FS is not set
CONFIG_ECRYPT_FS=m
CONFIG_HFS_FS=m
CONFIG_HFSPLUS_FS=m
# CONFIG_BEFS_FS is not set
# CONFIG_BFS_FS is not set
# CONFIG_EFS_FS is not set
CONFIG_JFFS2_FS=m
CONFIG_JFFS2_FS_DEBUG=0
CONFIG_JFFS2_FS_WRITEBUFFER=y
# CONFIG_JFFS2_FS_WBUF_VERIFY is not set
CONFIG_JFFS2_SUMMARY=y
# CONFIG_JFFS2_FS_XATTR is not set
# CONFIG_JFFS2_COMPRESSION_OPTIONS is not set
CONFIG_JFFS2_ZLIB=y
# CONFIG_JFFS2_LZO is not set
CONFIG_JFFS2_RTIME=y
# CONFIG_JFFS2_RUBIN is not set
# CONFIG_LOGFS is not set
CONFIG_CRAMFS=m
CONFIG_SQUASHFS=m
# CONFIG_SQUASHFS_XATTR is not set
# CONFIG_SQUASHFS_LZO is not set
# CONFIG_SQUASHFS_XZ is not set
# CONFIG_SQUASHFS_EMBEDDED is not set
CONFIG_SQUASHFS_FRAGMENT_CACHE_SIZE=3
CONFIG_VXFS_FS=m
# CONFIG_MINIX_FS is not set
# CONFIG_OMFS_FS is not set
# CONFIG_HPFS_FS is not set
# CONFIG_QNX4FS_FS is not set
# CONFIG_ROMFS_FS is not set
# CONFIG_PSTORE is not set
# CONFIG_SYSV_FS is not set
# CONFIG_UFS_FS is not set
CONFIG_NETWORK_FILESYSTEMS=y
CONFIG_NFS_FS=y
CONFIG_NFS_V3=y
CONFIG_NFS_V3_ACL=y
CONFIG_NFS_V4=y
# CONFIG_NFS_V4_1 is not set
CONFIG_ROOT_NFS=y
# CONFIG_NFS_USE_LEGACY_DNS is not set
CONFIG_NFS_USE_KERNEL_DNS=y
# CONFIG_NFS_USE_NEW_IDMAPPER is not set
CONFIG_NFSD=m
CONFIG_NFSD_DEPRECATED=y
CONFIG_NFSD_V2_ACL=y
CONFIG_NFSD_V3=y
CONFIG_NFSD_V3_ACL=y
CONFIG_NFSD_V4=y
CONFIG_LOCKD=y
CONFIG_LOCKD_V4=y
CONFIG_NFS_ACL_SUPPORT=y
CONFIG_NFS_COMMON=y
CONFIG_SUNRPC=y
CONFIG_SUNRPC_GSS=y
CONFIG_SUNRPC_XPRT_RDMA=m
# CONFIG_CEPH_FS is not set
CONFIG_CIFS=m
# CONFIG_CIFS_STATS is not set
CONFIG_CIFS_WEAK_PW_HASH=y
CONFIG_CIFS_UPCALL=y
CONFIG_CIFS_XATTR=y
CONFIG_CIFS_POSIX=y
# CONFIG_CIFS_DEBUG2 is not set
CONFIG_CIFS_DFS_UPCALL=y
# CONFIG_CIFS_FSCACHE is not set
# CONFIG_CIFS_ACL is not set
# CONFIG_NCP_FS is not set
# CONFIG_CODA_FS is not set
# CONFIG_AFS_FS is not set

#
# Partition Types
#
CONFIG_PARTITION_ADVANCED=y
# CONFIG_ACORN_PARTITION is not set
CONFIG_OSF_PARTITION=y
CONFIG_AMIGA_PARTITION=y
# CONFIG_ATARI_PARTITION is not set
CONFIG_MAC_PARTITION=y
CONFIG_MSDOS_PARTITION=y
CONFIG_BSD_DISKLABEL=y
CONFIG_MINIX_SUBPARTITION=y
CONFIG_SOLARIS_X86_PARTITION=y
CONFIG_UNIXWARE_DISKLABEL=y
# CONFIG_LDM_PARTITION is not set
CONFIG_SGI_PARTITION=y
# CONFIG_ULTRIX_PARTITION is not set
CONFIG_SUN_PARTITION=y
CONFIG_KARMA_PARTITION=y
CONFIG_EFI_PARTITION=y
# CONFIG_SYSV68_PARTITION is not set
CONFIG_NLS=y
CONFIG_NLS_DEFAULT="utf8"
CONFIG_NLS_CODEPAGE_437=y
CONFIG_NLS_CODEPAGE_737=m
CONFIG_NLS_CODEPAGE_775=m
CONFIG_NLS_CODEPAGE_850=m
CONFIG_NLS_CODEPAGE_852=m
CONFIG_NLS_CODEPAGE_855=m
CONFIG_NLS_CODEPAGE_857=m
CONFIG_NLS_CODEPAGE_860=m
CONFIG_NLS_CODEPAGE_861=m
CONFIG_NLS_CODEPAGE_862=m
CONFIG_NLS_CODEPAGE_863=m
CONFIG_NLS_CODEPAGE_864=m
CONFIG_NLS_CODEPAGE_865=m
CONFIG_NLS_CODEPAGE_866=m
CONFIG_NLS_CODEPAGE_869=m
CONFIG_NLS_CODEPAGE_936=m
CONFIG_NLS_CODEPAGE_950=m
CONFIG_NLS_CODEPAGE_932=m
CONFIG_NLS_CODEPAGE_949=m
CONFIG_NLS_CODEPAGE_874=m
CONFIG_NLS_ISO8859_8=m
CONFIG_NLS_CODEPAGE_1250=m
CONFIG_NLS_CODEPAGE_1251=m
CONFIG_NLS_ASCII=y
CONFIG_NLS_ISO8859_1=m
CONFIG_NLS_ISO8859_2=m
CONFIG_NLS_ISO8859_3=m
CONFIG_NLS_ISO8859_4=m
CONFIG_NLS_ISO8859_5=m
CONFIG_NLS_ISO8859_6=m
CONFIG_NLS_ISO8859_7=m
CONFIG_NLS_ISO8859_9=m
CONFIG_NLS_ISO8859_13=m
CONFIG_NLS_ISO8859_14=m
CONFIG_NLS_ISO8859_15=m
CONFIG_NLS_KOI8_R=m
CONFIG_NLS_KOI8_U=m
CONFIG_NLS_UTF8=m
CONFIG_DLM=m
CONFIG_DLM_DEBUG=y
CONFIG_BINARY_PRINTF=y

#
# Library routines
#
CONFIG_RAID6_PQ=m
CONFIG_BITREVERSE=y
CONFIG_CRC_CCITT=m
CONFIG_CRC16=m
# CONFIG_CRC_T10DIF is not set
CONFIG_CRC_ITU_T=m
CONFIG_CRC32=y
# CONFIG_CRC7 is not set
CONFIG_LIBCRC32C=y
CONFIG_ZLIB_INFLATE=y
CONFIG_ZLIB_DEFLATE=m
CONFIG_LZO_DECOMPRESS=y
CONFIG_XZ_DEC=y
CONFIG_XZ_DEC_X86=y
CONFIG_XZ_DEC_POWERPC=y
CONFIG_XZ_DEC_IA64=y
CONFIG_XZ_DEC_ARM=y
CONFIG_XZ_DEC_ARMTHUMB=y
CONFIG_XZ_DEC_SPARC=y
CONFIG_XZ_DEC_BCJ=y
# CONFIG_XZ_DEC_TEST is not set
CONFIG_DECOMPRESS_GZIP=y
CONFIG_DECOMPRESS_BZIP2=y
CONFIG_DECOMPRESS_LZMA=y
CONFIG_DECOMPRESS_XZ=y
CONFIG_DECOMPRESS_LZO=y
CONFIG_GENERIC_ALLOCATOR=y
CONFIG_REED_SOLOMON=m
CONFIG_REED_SOLOMON_DEC16=y
CONFIG_TEXTSEARCH=y
CONFIG_TEXTSEARCH_KMP=m
CONFIG_TEXTSEARCH_BM=m
CONFIG_TEXTSEARCH_FSM=m
CONFIG_HAS_IOMEM=y
CONFIG_HAS_IOPORT=y
CONFIG_HAS_DMA=y
CONFIG_CPU_RMAP=y
CONFIG_NLATTR=y
CONFIG_GENERIC_ATOMIC64=y
CONFIG_AVERAGE=y

#
# Kernel hacking
#
# CONFIG_PRINTK_TIME is not set
CONFIG_DEFAULT_MESSAGE_LOGLEVEL=4
CONFIG_ENABLE_WARN_DEPRECATED=y
CONFIG_ENABLE_MUST_CHECK=y
CONFIG_FRAME_WARN=1024
CONFIG_MAGIC_SYSRQ=y
# CONFIG_STRIP_ASM_SYMS is not set
# CONFIG_UNUSED_SYMBOLS is not set
CONFIG_DEBUG_FS=y
# CONFIG_HEADERS_CHECK is not set
# CONFIG_DEBUG_SECTION_MISMATCH is not set
CONFIG_DEBUG_KERNEL=y
# CONFIG_DEBUG_SHIRQ is not set
# CONFIG_LOCKUP_DETECTOR is not set
# CONFIG_HARDLOCKUP_DETECTOR is not set
# CONFIG_DETECT_HUNG_TASK is not set
CONFIG_SCHED_DEBUG=y
CONFIG_SCHEDSTATS=y
# CONFIG_TIMER_STATS is not set
# CONFIG_DEBUG_OBJECTS is not set
# CONFIG_DEBUG_SLAB is not set
# CONFIG_DEBUG_KMEMLEAK is not set
# CONFIG_DEBUG_RT_MUTEXES is not set
# CONFIG_RT_MUTEX_TESTER is not set
# CONFIG_DEBUG_SPINLOCK is not set
# CONFIG_DEBUG_MUTEXES is not set
# CONFIG_DEBUG_LOCK_ALLOC is not set
# CONFIG_PROVE_LOCKING is not set
# CONFIG_SPARSE_RCU_POINTER is not set
# CONFIG_LOCK_STAT is not set
# CONFIG_DEBUG_SPINLOCK_SLEEP is not set
# CONFIG_DEBUG_LOCKING_API_SELFTESTS is not set
CONFIG_STACKTRACE=y
# CONFIG_DEBUG_STACK_USAGE is not set
# CONFIG_DEBUG_KOBJECT is not set
CONFIG_DEBUG_BUGVERBOSE=y
CONFIG_DEBUG_INFO=y
# CONFIG_DEBUG_INFO_REDUCED is not set
# CONFIG_DEBUG_VM is not set
# CONFIG_DEBUG_WRITECOUNT is not set
CONFIG_DEBUG_MEMORY_INIT=y
CONFIG_DEBUG_LIST=y
# CONFIG_TEST_LIST_SORT is not set
# CONFIG_DEBUG_SG is not set
# CONFIG_DEBUG_NOTIFIERS is not set
# CONFIG_DEBUG_CREDENTIALS is not set
CONFIG_FRAME_POINTER=y
# CONFIG_RCU_TORTURE_TEST is not set
CONFIG_RCU_CPU_STALL_TIMEOUT=60
# CONFIG_KPROBES_SANITY_TEST is not set
# CONFIG_BACKTRACE_SELF_TEST is not set
# CONFIG_DEBUG_BLOCK_EXT_DEVT is not set
# CONFIG_DEBUG_FORCE_WEAK_PER_CPU is not set
# CONFIG_DEBUG_PER_CPU_MAPS is not set
# CONFIG_LKDTM is not set
# CONFIG_FAULT_INJECTION is not set
# CONFIG_LATENCYTOP is not set
# CONFIG_SYSCTL_SYSCALL_CHECK is not set
# CONFIG_DEBUG_PAGEALLOC is not set
CONFIG_NOP_TRACER=y
CONFIG_HAVE_FUNCTION_TRACER=y
CONFIG_HAVE_FUNCTION_GRAPH_TRACER=y
CONFIG_HAVE_DYNAMIC_FTRACE=y
CONFIG_HAVE_FTRACE_MCOUNT_RECORD=y
CONFIG_HAVE_SYSCALL_TRACEPOINTS=y
CONFIG_RING_BUFFER=y
CONFIG_EVENT_TRACING=y
CONFIG_EVENT_POWER_TRACING_DEPRECATED=y
CONFIG_CONTEXT_SWITCH_TRACER=y
CONFIG_RING_BUFFER_ALLOW_SWAP=y
CONFIG_TRACING=y
CONFIG_GENERIC_TRACER=y
CONFIG_TRACING_SUPPORT=y
CONFIG_FTRACE=y
CONFIG_FUNCTION_TRACER=y
CONFIG_FUNCTION_GRAPH_TRACER=y
# CONFIG_IRQSOFF_TRACER is not set
# CONFIG_SCHED_TRACER is not set
# CONFIG_FTRACE_SYSCALLS is not set
CONFIG_BRANCH_PROFILE_NONE=y
# CONFIG_PROFILE_ANNOTATED_BRANCHES is not set
# CONFIG_PROFILE_ALL_BRANCHES is not set
# CONFIG_STACK_TRACER is not set
CONFIG_BLK_DEV_IO_TRACE=y
CONFIG_KPROBE_EVENT=y
CONFIG_DYNAMIC_FTRACE=y
# CONFIG_FUNCTION_PROFILER is not set
CONFIG_FTRACE_MCOUNT_RECORD=y
# CONFIG_FTRACE_STARTUP_TEST is not set
# CONFIG_RING_BUFFER_BENCHMARK is not set
# CONFIG_FIREWIRE_OHCI_REMOTE_DMA is not set
# CONFIG_DYNAMIC_DEBUG is not set
# CONFIG_DMA_API_DEBUG is not set
# CONFIG_ATOMIC64_SELFTEST is not set
# CONFIG_ASYNC_RAID6_TEST is not set
# CONFIG_SAMPLES is not set
CONFIG_HAVE_ARCH_KGDB=y
# CONFIG_KGDB is not set
# CONFIG_TEST_KSTRTOX is not set
# CONFIG_PPC_DISABLE_WERROR is not set
CONFIG_PPC_WERROR=y
CONFIG_PRINT_STACK_DEPTH=64
CONFIG_DEBUG_STACKOVERFLOW=y
# CONFIG_PPC_EMULATED_STATS is not set
# CONFIG_CODE_PATCHING_SELFTEST is not set
# CONFIG_FTR_FIXUP_SELFTEST is not set
# CONFIG_MSI_BITMAP_SELFTEST is not set
# CONFIG_XMON is not set
# CONFIG_VIRQ_DEBUG is not set
# CONFIG_BDI_SWITCH is not set
# CONFIG_PPC_EARLY_DEBUG is not set

#
# Security options
#
CONFIG_KEYS=y
# CONFIG_TRUSTED_KEYS is not set
# CONFIG_KEYS_DEBUG_PROC_KEYS is not set
# CONFIG_SECURITY_DMESG_RESTRICT is not set
# CONFIG_SECURITY is not set
CONFIG_SECURITYFS=y
CONFIG_DEFAULT_SECURITY_DAC=y
CONFIG_DEFAULT_SECURITY=""
CONFIG_XOR_BLOCKS=m
CONFIG_ASYNC_CORE=m
CONFIG_ASYNC_MEMCPY=m
CONFIG_ASYNC_XOR=m
CONFIG_ASYNC_PQ=m
CONFIG_ASYNC_RAID6_RECOV=m
CONFIG_CRYPTO=y

#
# Crypto core or helper
#
CONFIG_CRYPTO_ALGAPI=y
CONFIG_CRYPTO_ALGAPI2=y
CONFIG_CRYPTO_AEAD=m
CONFIG_CRYPTO_AEAD2=y
CONFIG_CRYPTO_BLKCIPHER=m
CONFIG_CRYPTO_BLKCIPHER2=y
CONFIG_CRYPTO_HASH=y
CONFIG_CRYPTO_HASH2=y
CONFIG_CRYPTO_RNG=m
CONFIG_CRYPTO_RNG2=y
CONFIG_CRYPTO_PCOMP2=y
CONFIG_CRYPTO_MANAGER=y
CONFIG_CRYPTO_MANAGER2=y
CONFIG_CRYPTO_MANAGER_DISABLE_TESTS=y
# CONFIG_CRYPTO_GF128MUL is not set
CONFIG_CRYPTO_NULL=m
# CONFIG_CRYPTO_PCRYPT is not set
CONFIG_CRYPTO_WORKQUEUE=y
# CONFIG_CRYPTO_CRYPTD is not set
CONFIG_CRYPTO_AUTHENC=m
CONFIG_CRYPTO_TEST=m

#
# Authenticated Encryption with Associated Data
#
CONFIG_CRYPTO_CCM=m
# CONFIG_CRYPTO_GCM is not set
CONFIG_CRYPTO_SEQIV=m

#
# Block modes
#
CONFIG_CRYPTO_CBC=m
CONFIG_CRYPTO_CTR=m
# CONFIG_CRYPTO_CTS is not set
CONFIG_CRYPTO_ECB=m
# CONFIG_CRYPTO_LRW is not set
# CONFIG_CRYPTO_PCBC is not set
# CONFIG_CRYPTO_XTS is not set

#
# Hash modes
#
CONFIG_CRYPTO_HMAC=y
CONFIG_CRYPTO_XCBC=m
# CONFIG_CRYPTO_VMAC is not set

#
# Digest
#
CONFIG_CRYPTO_CRC32C=y
# CONFIG_CRYPTO_GHASH is not set
CONFIG_CRYPTO_MD4=m
CONFIG_CRYPTO_MD5=m
CONFIG_CRYPTO_MICHAEL_MIC=m
# CONFIG_CRYPTO_RMD128 is not set
# CONFIG_CRYPTO_RMD160 is not set
# CONFIG_CRYPTO_RMD256 is not set
# CONFIG_CRYPTO_RMD320 is not set
CONFIG_CRYPTO_SHA1=y
CONFIG_CRYPTO_SHA256=m
CONFIG_CRYPTO_SHA512=m
CONFIG_CRYPTO_TGR192=m
CONFIG_CRYPTO_WP512=m

#
# Ciphers
#
CONFIG_CRYPTO_AES=m
CONFIG_CRYPTO_ANUBIS=m
CONFIG_CRYPTO_ARC4=m
CONFIG_CRYPTO_BLOWFISH=m
# CONFIG_CRYPTO_CAMELLIA is not set
CONFIG_CRYPTO_CAST5=m
CONFIG_CRYPTO_CAST6=m
CONFIG_CRYPTO_DES=m
# CONFIG_CRYPTO_FCRYPT is not set
CONFIG_CRYPTO_KHAZAD=m
# CONFIG_CRYPTO_SALSA20 is not set
# CONFIG_CRYPTO_SEED is not set
CONFIG_CRYPTO_SERPENT=m
CONFIG_CRYPTO_TEA=m
CONFIG_CRYPTO_TWOFISH=m
CONFIG_CRYPTO_TWOFISH_COMMON=m

#
# Compression
#
CONFIG_CRYPTO_DEFLATE=m
# CONFIG_CRYPTO_ZLIB is not set
# CONFIG_CRYPTO_LZO is not set

#
# Random Number Generation
#
CONFIG_CRYPTO_ANSI_CPRNG=m
# CONFIG_CRYPTO_USER_API_HASH is not set
# CONFIG_CRYPTO_USER_API_SKCIPHER is not set
CONFIG_CRYPTO_HW=y
# CONFIG_CRYPTO_DEV_HIFN_795X is not set
# CONFIG_CRYPTO_DEV_FSL_CAAM is not set
# CONFIG_CRYPTO_DEV_TALITOS is not set
# CONFIG_PPC_CLOCK is not set
# CONFIG_VIRTUALIZATION is not set

^ permalink raw reply	[flat|nested] 138+ messages in thread

* Re: [PATCH 1/1] Fixup write permission of TLB on powerpc e500 core
@ 2011-07-18  7:50                           ` Shan Hai
  0 siblings, 0 replies; 138+ messages in thread
From: Shan Hai @ 2011-07-18  7:50 UTC (permalink / raw)
  To: Benjamin Herrenschmidt
  Cc: tony.luck, Peter Zijlstra, Peter Zijlstra, linux-kernel,
	cmetcalf, dhowells, paulus, tglx, walken, linuxppc-dev, akpm

[-- Attachment #1: Type: text/plain, Size: 2514 bytes --]

On 07/18/2011 03:36 PM, Benjamin Herrenschmidt wrote:
> On Mon, 2011-07-18 at 15:26 +0800, Shan Hai wrote:
>> I am sorry I hadn't tried your newer patch earlier. I have tried it
>> now, but it still does not work in my test environment. I will dig
>> into it and tell you why it failed later.
> Ok, please let me know what you find !
>
>> Yep, I know that having lots of ifdefs everywhere is not so good,
>> but if we had some other way (I don't know of one so far) to
>> figure out whether the arch needs to fix up the write permission,
>> we could eradicate the ugly ifdefs here.
>>
>> I think handle_mm_fault() could do all the dirty/young tracking,
>> because the purpose of making follow_page() return NULL to
>> its caller is to get handle_mm_fault() called on a write
>> permission protection fault.
> I see your point. Rather than factoring the fixup code out, we could
> force gup to call handle_mm_fault()... that makes sense.
>
> However, I don't think we should special case archs. There's plenty of
> cases where we don't care about this fixup even on archs that do SW
> tracking of dirty and young. For example, when gup is used for
> subsequent DMA.
>
> Only the (rare ?) cases where it's used as a means to fix up a failing
> "atomic" user access are relevant.
>
> So I believe we should still pass an explicit flag to __get_user_pages()
> as I propose to activate that behaviour.
>
> At this point, since we have isolated the special case callers, I think
> we are pretty much in a situation where there's no point trying to
> optimize the x86 case more, it's a fairly slow path anyway, and so no
> ifdef should be needed (and x86 already #define out the TLB flush for
> spurious faults in handle_pte_fault today).
>
> We don't even need to change follow_page()... we just don't call it the
> first time around.
>
> I'll cook up another patch later but first we need to find out why the
> one you have doesn't work. There might be another problem lurking (or I
> just made a stupid mistake).
>
> BTW. Can you give me some details about how you reproduce the problem ?
> I should setup something on a booke machine here to verify things.
>

Please get the test case code from the attachment in the thread
"[PATCH 0/1] Fixup write permission of TLB on powerpc e500 core",
then simply compile it and do the following:
- run the test case on the board
- run 'top' in another terminal; you should observe almost
     100% system CPU usage

I also attached the kernel config file.

Best regards
Shan Hai


> Cheers,
> Ben.
>


[-- Attachment #2: config --]
[-- Type: text/plain, Size: 76255 bytes --]

#
# Automatically generated make config: don't edit
# Linux/powerpc 3.0.0-rc6 Kernel Configuration
#
# CONFIG_PPC64 is not set

#
# Processor support
#
# CONFIG_PPC_BOOK3S_32 is not set
CONFIG_PPC_85xx=y
# CONFIG_PPC_8xx is not set
# CONFIG_40x is not set
# CONFIG_44x is not set
# CONFIG_E200 is not set
CONFIG_E500=y
# CONFIG_PPC_E500MC is not set
CONFIG_FSL_EMB_PERFMON=y
CONFIG_FSL_EMB_PERF_EVENT=y
CONFIG_FSL_EMB_PERF_EVENT_E500=y
CONFIG_BOOKE=y
CONFIG_FSL_BOOKE=y
CONFIG_PPC_FSL_BOOK3E=y
# CONFIG_PHYS_64BIT is not set
CONFIG_SPE=y
CONFIG_PPC_MMU_NOHASH=y
CONFIG_PPC_MMU_NOHASH_32=y
CONFIG_PPC_BOOK3E_MMU=y
# CONFIG_PPC_MM_SLICES is not set
CONFIG_SMP=y
CONFIG_NR_CPUS=255
CONFIG_PPC32=y
CONFIG_32BIT=y
CONFIG_WORD_SIZE=32
# CONFIG_ARCH_PHYS_ADDR_T_64BIT is not set
# CONFIG_ARCH_DMA_ADDR_T_64BIT is not set
CONFIG_MMU=y
CONFIG_GENERIC_CMOS_UPDATE=y
CONFIG_GENERIC_TIME_VSYSCALL=y
CONFIG_GENERIC_CLOCKEVENTS=y
# CONFIG_HAVE_SETUP_PER_CPU_AREA is not set
# CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK is not set
CONFIG_NR_IRQS=512
CONFIG_STACKTRACE_SUPPORT=y
CONFIG_HAVE_LATENCYTOP_SUPPORT=y
CONFIG_TRACE_IRQFLAGS_SUPPORT=y
CONFIG_LOCKDEP_SUPPORT=y
CONFIG_RWSEM_XCHGADD_ALGORITHM=y
CONFIG_ARCH_HAS_ILOG2_U32=y
CONFIG_GENERIC_HWEIGHT=y
# CONFIG_ARCH_NO_VIRT_TO_BUS is not set
CONFIG_PPC=y
CONFIG_EARLY_PRINTK=y
CONFIG_GENERIC_NVRAM=y
CONFIG_SCHED_OMIT_FRAME_POINTER=y
CONFIG_ARCH_MAY_HAVE_PC_FDC=y
CONFIG_PPC_OF=y
CONFIG_PPC_UDBG_16550=y
CONFIG_GENERIC_TBSYNC=y
CONFIG_AUDIT_ARCH=y
CONFIG_GENERIC_BUG=y
# CONFIG_EPAPR_BOOT is not set
CONFIG_DEFAULT_UIMAGE=y
CONFIG_ARCH_HIBERNATION_POSSIBLE=y
# CONFIG_PPC_DCR_NATIVE is not set
# CONFIG_PPC_DCR_MMIO is not set
CONFIG_ARCH_SUPPORTS_DEBUG_PAGEALLOC=y
CONFIG_PPC_ADV_DEBUG_REGS=y
CONFIG_PPC_ADV_DEBUG_IACS=2
CONFIG_PPC_ADV_DEBUG_DACS=2
CONFIG_PPC_ADV_DEBUG_DVCS=0
CONFIG_DEFCONFIG_LIST="/lib/modules/$UNAME_RELEASE/.config"
CONFIG_HAVE_IRQ_WORK=y
CONFIG_IRQ_WORK=y

#
# General setup
#
CONFIG_EXPERIMENTAL=y
CONFIG_INIT_ENV_ARG_LIMIT=32
CONFIG_CROSS_COMPILE=""
CONFIG_LOCALVERSION=""
# CONFIG_LOCALVERSION_AUTO is not set
CONFIG_DEFAULT_HOSTNAME="(none)"
CONFIG_SWAP=y
CONFIG_SYSVIPC=y
CONFIG_SYSVIPC_SYSCTL=y
CONFIG_POSIX_MQUEUE=y
CONFIG_POSIX_MQUEUE_SYSCTL=y
CONFIG_BSD_PROCESS_ACCT=y
# CONFIG_BSD_PROCESS_ACCT_V3 is not set
# CONFIG_FHANDLE is not set
CONFIG_TASKSTATS=y
CONFIG_TASK_DELAY_ACCT=y
# CONFIG_TASK_XACCT is not set
CONFIG_AUDIT=y
CONFIG_AUDITSYSCALL=y
CONFIG_AUDIT_WATCH=y
CONFIG_AUDIT_TREE=y
CONFIG_HAVE_GENERIC_HARDIRQS=y

#
# IRQ subsystem
#
CONFIG_GENERIC_HARDIRQS=y
CONFIG_HAVE_SPARSE_IRQ=y
CONFIG_GENERIC_IRQ_SHOW=y
CONFIG_GENERIC_IRQ_SHOW_LEVEL=y
# CONFIG_SPARSE_IRQ is not set

#
# RCU Subsystem
#
CONFIG_TREE_RCU=y
# CONFIG_PREEMPT_RCU is not set
# CONFIG_RCU_TRACE is not set
CONFIG_RCU_FANOUT=32
# CONFIG_RCU_FANOUT_EXACT is not set
# CONFIG_TREE_RCU_TRACE is not set
# CONFIG_IKCONFIG is not set
CONFIG_LOG_BUF_SHIFT=17
# CONFIG_CGROUPS is not set
CONFIG_NAMESPACES=y
CONFIG_UTS_NS=y
CONFIG_IPC_NS=y
CONFIG_USER_NS=y
CONFIG_PID_NS=y
CONFIG_NET_NS=y
# CONFIG_SCHED_AUTOGROUP is not set
# CONFIG_SYSFS_DEPRECATED is not set
CONFIG_RELAY=y
CONFIG_BLK_DEV_INITRD=y
CONFIG_INITRAMFS_SOURCE=""
CONFIG_RD_GZIP=y
CONFIG_RD_BZIP2=y
CONFIG_RD_LZMA=y
CONFIG_RD_XZ=y
CONFIG_RD_LZO=y
CONFIG_CC_OPTIMIZE_FOR_SIZE=y
CONFIG_SYSCTL=y
CONFIG_ANON_INODES=y
# CONFIG_EXPERT is not set
CONFIG_SYSCTL_SYSCALL=y
CONFIG_KALLSYMS=y
# CONFIG_KALLSYMS_ALL is not set
CONFIG_HOTPLUG=y
CONFIG_PRINTK=y
CONFIG_BUG=y
CONFIG_ELF_CORE=y
CONFIG_BASE_FULL=y
CONFIG_FUTEX=y
CONFIG_EPOLL=y
CONFIG_SIGNALFD=y
CONFIG_TIMERFD=y
CONFIG_EVENTFD=y
CONFIG_SHMEM=y
CONFIG_AIO=y
# CONFIG_EMBEDDED is not set
CONFIG_HAVE_PERF_EVENTS=y

#
# Kernel Performance Events And Counters
#
CONFIG_PERF_EVENTS=y
# CONFIG_PERF_COUNTERS is not set
# CONFIG_DEBUG_PERF_USE_VMALLOC is not set
CONFIG_VM_EVENT_COUNTERS=y
CONFIG_PCI_QUIRKS=y
CONFIG_COMPAT_BRK=y
CONFIG_SLAB=y
# CONFIG_SLUB is not set
CONFIG_PROFILING=y
CONFIG_TRACEPOINTS=y
CONFIG_OPROFILE=m
CONFIG_HAVE_OPROFILE=y
CONFIG_KPROBES=y
CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS=y
CONFIG_KRETPROBES=y
CONFIG_HAVE_IOREMAP_PROT=y
CONFIG_HAVE_KPROBES=y
CONFIG_HAVE_KRETPROBES=y
CONFIG_HAVE_ARCH_TRACEHOOK=y
CONFIG_HAVE_DMA_ATTRS=y
CONFIG_USE_GENERIC_SMP_HELPERS=y
CONFIG_HAVE_REGS_AND_STACK_ACCESS_API=y
CONFIG_HAVE_DMA_API_DEBUG=y
CONFIG_HAVE_RCU_TABLE_FREE=y

#
# GCOV-based kernel profiling
#
# CONFIG_GCOV_KERNEL is not set
# CONFIG_HAVE_GENERIC_DMA_COHERENT is not set
CONFIG_SLABINFO=y
CONFIG_RT_MUTEXES=y
CONFIG_BASE_SMALL=0
CONFIG_MODULES=y
# CONFIG_MODULE_FORCE_LOAD is not set
CONFIG_MODULE_UNLOAD=y
# CONFIG_MODULE_FORCE_UNLOAD is not set
CONFIG_MODVERSIONS=y
CONFIG_MODULE_SRCVERSION_ALL=y
CONFIG_STOP_MACHINE=y
CONFIG_BLOCK=y
CONFIG_LBDAF=y
CONFIG_BLK_DEV_BSG=y
# CONFIG_BLK_DEV_INTEGRITY is not set

#
# IO Schedulers
#
CONFIG_IOSCHED_NOOP=y
CONFIG_IOSCHED_DEADLINE=y
CONFIG_IOSCHED_CFQ=y
# CONFIG_DEFAULT_DEADLINE is not set
CONFIG_DEFAULT_CFQ=y
# CONFIG_DEFAULT_NOOP is not set
CONFIG_DEFAULT_IOSCHED="cfq"
# CONFIG_INLINE_SPIN_TRYLOCK is not set
# CONFIG_INLINE_SPIN_TRYLOCK_BH is not set
# CONFIG_INLINE_SPIN_LOCK is not set
# CONFIG_INLINE_SPIN_LOCK_BH is not set
# CONFIG_INLINE_SPIN_LOCK_IRQ is not set
# CONFIG_INLINE_SPIN_LOCK_IRQSAVE is not set
CONFIG_INLINE_SPIN_UNLOCK=y
# CONFIG_INLINE_SPIN_UNLOCK_BH is not set
CONFIG_INLINE_SPIN_UNLOCK_IRQ=y
# CONFIG_INLINE_SPIN_UNLOCK_IRQRESTORE is not set
# CONFIG_INLINE_READ_TRYLOCK is not set
# CONFIG_INLINE_READ_LOCK is not set
# CONFIG_INLINE_READ_LOCK_BH is not set
# CONFIG_INLINE_READ_LOCK_IRQ is not set
# CONFIG_INLINE_READ_LOCK_IRQSAVE is not set
CONFIG_INLINE_READ_UNLOCK=y
# CONFIG_INLINE_READ_UNLOCK_BH is not set
CONFIG_INLINE_READ_UNLOCK_IRQ=y
# CONFIG_INLINE_READ_UNLOCK_IRQRESTORE is not set
# CONFIG_INLINE_WRITE_TRYLOCK is not set
# CONFIG_INLINE_WRITE_LOCK is not set
# CONFIG_INLINE_WRITE_LOCK_BH is not set
# CONFIG_INLINE_WRITE_LOCK_IRQ is not set
# CONFIG_INLINE_WRITE_LOCK_IRQSAVE is not set
CONFIG_INLINE_WRITE_UNLOCK=y
# CONFIG_INLINE_WRITE_UNLOCK_BH is not set
CONFIG_INLINE_WRITE_UNLOCK_IRQ=y
# CONFIG_INLINE_WRITE_UNLOCK_IRQRESTORE is not set
CONFIG_MUTEX_SPIN_ON_OWNER=y
# CONFIG_FREEZER is not set
CONFIG_PPC_MSI_BITMAP=y
# CONFIG_PPC_XICS is not set
# CONFIG_PPC_ICP_NATIVE is not set
# CONFIG_PPC_ICP_HV is not set
# CONFIG_PPC_ICS_RTAS is not set

#
# Platform support
#
# CONFIG_PPC_CELL is not set
# CONFIG_PPC_CELL_NATIVE is not set
# CONFIG_PQ2ADS is not set
CONFIG_FSL_SOC_BOOKE=y
# CONFIG_MPC8540_ADS is not set
# CONFIG_MPC8560_ADS is not set
# CONFIG_MPC85xx_CDS is not set
# CONFIG_MPC85xx_MDS is not set
# CONFIG_MPC8536_DS is not set
# CONFIG_MPC85xx_DS is not set
# CONFIG_MPC85xx_RDB is not set
# CONFIG_P1022_DS is not set
# CONFIG_SOCRATES is not set
# CONFIG_KSI8560 is not set
# CONFIG_XES_MPC85xx is not set
# CONFIG_STX_GP3 is not set
# CONFIG_TQM8540 is not set
# CONFIG_TQM8541 is not set
# CONFIG_TQM8548 is not set
# CONFIG_TQM8555 is not set
# CONFIG_TQM8560 is not set
CONFIG_SBC8548=y
# CONFIG_SBC8560 is not set
# CONFIG_P3041_DS is not set
# CONFIG_P4080_DS is not set
# CONFIG_P5020_DS is not set
# CONFIG_PPC_WSP is not set
CONFIG_KVM_GUEST=y
CONFIG_PPC_SMP_MUXED_IPI=y
# CONFIG_IPIC is not set
CONFIG_MPIC=y
# CONFIG_MPIC_WEIRD is not set
# CONFIG_PPC_I8259 is not set
# CONFIG_PPC_RTAS is not set
# CONFIG_MMIO_NVRAM is not set
# CONFIG_MPIC_U3_HT_IRQS is not set
# CONFIG_PPC_MPC106 is not set
# CONFIG_PPC_970_NAP is not set
# CONFIG_PPC_P7_NAP is not set

#
# CPU Frequency scaling
#
CONFIG_CPU_FREQ=y
CONFIG_CPU_FREQ_TABLE=m
CONFIG_CPU_FREQ_STAT=m
CONFIG_CPU_FREQ_STAT_DETAILS=y
# CONFIG_CPU_FREQ_DEFAULT_GOV_PERFORMANCE is not set
CONFIG_CPU_FREQ_DEFAULT_GOV_USERSPACE=y
# CONFIG_CPU_FREQ_DEFAULT_GOV_ONDEMAND is not set
# CONFIG_CPU_FREQ_DEFAULT_GOV_CONSERVATIVE is not set
CONFIG_CPU_FREQ_GOV_PERFORMANCE=y
CONFIG_CPU_FREQ_GOV_POWERSAVE=m
CONFIG_CPU_FREQ_GOV_USERSPACE=y
CONFIG_CPU_FREQ_GOV_ONDEMAND=m
CONFIG_CPU_FREQ_GOV_CONSERVATIVE=m

#
# CPU Frequency drivers
#
# CONFIG_QUICC_ENGINE is not set
# CONFIG_CPM2 is not set
# CONFIG_FSL_ULI1575 is not set
# CONFIG_MPC8xxx_GPIO is not set
# CONFIG_SIMPLE_GPIO is not set

#
# Kernel options
#
# CONFIG_HIGHMEM is not set
# CONFIG_NO_HZ is not set
# CONFIG_HIGH_RES_TIMERS is not set
CONFIG_GENERIC_CLOCKEVENTS_BUILD=y
# CONFIG_HZ_100 is not set
# CONFIG_HZ_250 is not set
# CONFIG_HZ_300 is not set
CONFIG_HZ_1000=y
CONFIG_HZ=1000
# CONFIG_SCHED_HRTICK is not set
# CONFIG_PREEMPT_NONE is not set
CONFIG_PREEMPT_VOLUNTARY=y
# CONFIG_PREEMPT is not set
CONFIG_BINFMT_ELF=y
CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS=y
# CONFIG_HAVE_AOUT is not set
CONFIG_BINFMT_MISC=y
# CONFIG_MATH_EMULATION is not set
CONFIG_IOMMU_HELPER=y
CONFIG_SWIOTLB=y
CONFIG_ARCH_ENABLE_MEMORY_HOTPLUG=y
CONFIG_ARCH_HAS_WALK_MEMORY=y
CONFIG_ARCH_ENABLE_MEMORY_HOTREMOVE=y
CONFIG_KEXEC=y
CONFIG_CRASH_DUMP=y
# CONFIG_IRQ_ALL_CPUS is not set
CONFIG_MAX_ACTIVE_REGIONS=32
CONFIG_ARCH_FLATMEM_ENABLE=y
CONFIG_ARCH_POPULATES_NODE_MAP=y
CONFIG_SELECT_MEMORY_MODEL=y
CONFIG_FLATMEM_MANUAL=y
CONFIG_FLATMEM=y
CONFIG_FLAT_NODE_MEM_MAP=y
CONFIG_HAVE_MEMBLOCK=y
CONFIG_PAGEFLAGS_EXTENDED=y
CONFIG_SPLIT_PTLOCK_CPUS=4
# CONFIG_COMPACTION is not set
CONFIG_MIGRATION=y
# CONFIG_PHYS_ADDR_T_64BIT is not set
CONFIG_ZONE_DMA_FLAG=1
CONFIG_BOUNCE=y
CONFIG_VIRT_TO_BUS=y
# CONFIG_KSM is not set
CONFIG_DEFAULT_MMAP_MIN_ADDR=4096
# CONFIG_CLEANCACHE is not set
CONFIG_PPC_4K_PAGES=y
CONFIG_FORCE_MAX_ZONEORDER=11
# CONFIG_CMDLINE_BOOL is not set
CONFIG_EXTRA_TARGETS=""
# CONFIG_HIBERNATION is not set
# CONFIG_PM_RUNTIME is not set
# CONFIG_SECCOMP is not set
CONFIG_ISA_DMA_API=y

#
# Bus options
#
CONFIG_ZONE_DMA=y
# CONFIG_NEED_DMA_MAP_STATE is not set
CONFIG_NEED_SG_DMA_LENGTH=y
CONFIG_GENERIC_ISA_DMA=y
CONFIG_PPC_INDIRECT_PCI=y
CONFIG_FSL_SOC=y
CONFIG_FSL_PCI=y
# CONFIG_FSL_LBC is not set
CONFIG_PPC_PCI_CHOICE=y
CONFIG_PCI=y
CONFIG_PCI_DOMAINS=y
CONFIG_PCI_SYSCALL=y
CONFIG_PCIEPORTBUS=y
CONFIG_HOTPLUG_PCI_PCIE=m
CONFIG_PCIEAER=y
# CONFIG_PCIE_ECRC is not set
# CONFIG_PCIEAER_INJECT is not set
CONFIG_PCIEASPM=y
# CONFIG_PCIEASPM_DEBUG is not set
CONFIG_ARCH_SUPPORTS_MSI=y
CONFIG_PCI_MSI=y
# CONFIG_PCI_DEBUG is not set
# CONFIG_PCI_STUB is not set
# CONFIG_PCI_IOV is not set
CONFIG_PCCARD=y
CONFIG_PCMCIA=y
CONFIG_PCMCIA_LOAD_CIS=y
CONFIG_CARDBUS=y

#
# PC-card bridges
#
CONFIG_YENTA=y
CONFIG_YENTA_O2=y
CONFIG_YENTA_RICOH=y
CONFIG_YENTA_TI=y
CONFIG_YENTA_ENE_TUNE=y
CONFIG_YENTA_TOSHIBA=y
CONFIG_PD6729=m
# CONFIG_I82092 is not set
CONFIG_PCCARD_NONSTATIC=y
CONFIG_HOTPLUG_PCI=y
CONFIG_HOTPLUG_PCI_FAKE=m
# CONFIG_HOTPLUG_PCI_CPCI is not set
CONFIG_HOTPLUG_PCI_SHPC=m
# CONFIG_HAS_RAPIDIO is not set
# CONFIG_RAPIDIO is not set

#
# Advanced setup
#
# CONFIG_ADVANCED_OPTIONS is not set

#
# Default settings for advanced configuration options are used
#
CONFIG_LOWMEM_SIZE=0x30000000
CONFIG_LOWMEM_CAM_NUM=3
CONFIG_RELOCATABLE=y
CONFIG_PAGE_OFFSET=0xc0000000
CONFIG_KERNEL_START=0xc0000000
CONFIG_PHYSICAL_START=0x00000000
CONFIG_PHYSICAL_ALIGN=0x04000000
CONFIG_TASK_SIZE=0xc0000000
CONFIG_NET=y

#
# Networking options
#
CONFIG_PACKET=y
CONFIG_UNIX=y
CONFIG_XFRM=y
CONFIG_XFRM_USER=y
# CONFIG_XFRM_SUB_POLICY is not set
# CONFIG_XFRM_MIGRATE is not set
# CONFIG_XFRM_STATISTICS is not set
CONFIG_XFRM_IPCOMP=m
CONFIG_NET_KEY=m
# CONFIG_NET_KEY_MIGRATE is not set
CONFIG_INET=y
CONFIG_IP_MULTICAST=y
CONFIG_IP_ADVANCED_ROUTER=y
# CONFIG_IP_FIB_TRIE_STATS is not set
CONFIG_IP_MULTIPLE_TABLES=y
CONFIG_IP_ROUTE_MULTIPATH=y
CONFIG_IP_ROUTE_VERBOSE=y
CONFIG_IP_ROUTE_CLASSID=y
CONFIG_IP_PNP=y
CONFIG_IP_PNP_DHCP=y
CONFIG_IP_PNP_BOOTP=y
CONFIG_IP_PNP_RARP=y
CONFIG_NET_IPIP=m
# CONFIG_NET_IPGRE_DEMUX is not set
CONFIG_IP_MROUTE=y
# CONFIG_IP_MROUTE_MULTIPLE_TABLES is not set
CONFIG_IP_PIMSM_V1=y
CONFIG_IP_PIMSM_V2=y
# CONFIG_ARPD is not set
CONFIG_SYN_COOKIES=y
CONFIG_INET_AH=m
CONFIG_INET_ESP=m
CONFIG_INET_IPCOMP=m
CONFIG_INET_XFRM_TUNNEL=m
CONFIG_INET_TUNNEL=m
CONFIG_INET_XFRM_MODE_TRANSPORT=m
CONFIG_INET_XFRM_MODE_TUNNEL=m
CONFIG_INET_XFRM_MODE_BEET=y
CONFIG_INET_LRO=y
CONFIG_INET_DIAG=m
CONFIG_INET_TCP_DIAG=m
CONFIG_TCP_CONG_ADVANCED=y
CONFIG_TCP_CONG_BIC=y
CONFIG_TCP_CONG_CUBIC=m
CONFIG_TCP_CONG_WESTWOOD=m
CONFIG_TCP_CONG_HTCP=m
CONFIG_TCP_CONG_HSTCP=m
CONFIG_TCP_CONG_HYBLA=m
CONFIG_TCP_CONG_VEGAS=m
CONFIG_TCP_CONG_SCALABLE=m
CONFIG_TCP_CONG_LP=m
CONFIG_TCP_CONG_VENO=m
# CONFIG_TCP_CONG_YEAH is not set
# CONFIG_TCP_CONG_ILLINOIS is not set
CONFIG_DEFAULT_BIC=y
# CONFIG_DEFAULT_RENO is not set
CONFIG_DEFAULT_TCP_CONG="bic"
# CONFIG_TCP_MD5SIG is not set
CONFIG_IPV6=m
CONFIG_IPV6_PRIVACY=y
CONFIG_IPV6_ROUTER_PREF=y
CONFIG_IPV6_ROUTE_INFO=y
CONFIG_IPV6_OPTIMISTIC_DAD=y
CONFIG_INET6_AH=m
CONFIG_INET6_ESP=m
CONFIG_INET6_IPCOMP=m
# CONFIG_IPV6_MIP6 is not set
CONFIG_INET6_XFRM_TUNNEL=m
CONFIG_INET6_TUNNEL=m
CONFIG_INET6_XFRM_MODE_TRANSPORT=m
CONFIG_INET6_XFRM_MODE_TUNNEL=m
CONFIG_INET6_XFRM_MODE_BEET=m
# CONFIG_INET6_XFRM_MODE_ROUTEOPTIMIZATION is not set
CONFIG_IPV6_SIT=m
# CONFIG_IPV6_SIT_6RD is not set
CONFIG_IPV6_NDISC_NODETYPE=y
CONFIG_IPV6_TUNNEL=m
CONFIG_IPV6_MULTIPLE_TABLES=y
# CONFIG_IPV6_SUBTREES is not set
# CONFIG_IPV6_MROUTE is not set
CONFIG_NETWORK_SECMARK=y
# CONFIG_NETWORK_PHY_TIMESTAMPING is not set
CONFIG_NETFILTER=y
# CONFIG_NETFILTER_DEBUG is not set
CONFIG_NETFILTER_ADVANCED=y
CONFIG_BRIDGE_NETFILTER=y

#
# Core Netfilter Configuration
#
CONFIG_NETFILTER_NETLINK=m
CONFIG_NETFILTER_NETLINK_QUEUE=m
CONFIG_NETFILTER_NETLINK_LOG=m
# CONFIG_NF_CONNTRACK is not set
# CONFIG_NETFILTER_TPROXY is not set
CONFIG_NETFILTER_XTABLES=m

#
# Xtables combined modules
#
CONFIG_NETFILTER_XT_MARK=m

#
# Xtables targets
#
# CONFIG_NETFILTER_XT_TARGET_AUDIT is not set
# CONFIG_NETFILTER_XT_TARGET_CHECKSUM is not set
CONFIG_NETFILTER_XT_TARGET_CLASSIFY=m
# CONFIG_NETFILTER_XT_TARGET_DSCP is not set
CONFIG_NETFILTER_XT_TARGET_HL=m
# CONFIG_NETFILTER_XT_TARGET_IDLETIMER is not set
# CONFIG_NETFILTER_XT_TARGET_LED is not set
CONFIG_NETFILTER_XT_TARGET_MARK=m
# CONFIG_NETFILTER_XT_TARGET_NFLOG is not set
CONFIG_NETFILTER_XT_TARGET_NFQUEUE=m
# CONFIG_NETFILTER_XT_TARGET_RATEEST is not set
# CONFIG_NETFILTER_XT_TARGET_TEE is not set
# CONFIG_NETFILTER_XT_TARGET_TRACE is not set
CONFIG_NETFILTER_XT_TARGET_SECMARK=m
# CONFIG_NETFILTER_XT_TARGET_TCPMSS is not set
# CONFIG_NETFILTER_XT_TARGET_TCPOPTSTRIP is not set

#
# Xtables matches
#
# CONFIG_NETFILTER_XT_MATCH_ADDRTYPE is not set
CONFIG_NETFILTER_XT_MATCH_COMMENT=m
# CONFIG_NETFILTER_XT_MATCH_CPU is not set
CONFIG_NETFILTER_XT_MATCH_DCCP=m
# CONFIG_NETFILTER_XT_MATCH_DEVGROUP is not set
# CONFIG_NETFILTER_XT_MATCH_DSCP is not set
CONFIG_NETFILTER_XT_MATCH_ESP=m
# CONFIG_NETFILTER_XT_MATCH_HASHLIMIT is not set
CONFIG_NETFILTER_XT_MATCH_HL=m
# CONFIG_NETFILTER_XT_MATCH_IPRANGE is not set
CONFIG_NETFILTER_XT_MATCH_LENGTH=m
CONFIG_NETFILTER_XT_MATCH_LIMIT=m
CONFIG_NETFILTER_XT_MATCH_MAC=m
CONFIG_NETFILTER_XT_MATCH_MARK=m
CONFIG_NETFILTER_XT_MATCH_MULTIPORT=m
# CONFIG_NETFILTER_XT_MATCH_OSF is not set
# CONFIG_NETFILTER_XT_MATCH_OWNER is not set
CONFIG_NETFILTER_XT_MATCH_POLICY=m
CONFIG_NETFILTER_XT_MATCH_PHYSDEV=m
CONFIG_NETFILTER_XT_MATCH_PKTTYPE=m
CONFIG_NETFILTER_XT_MATCH_QUOTA=m
# CONFIG_NETFILTER_XT_MATCH_RATEEST is not set
CONFIG_NETFILTER_XT_MATCH_REALM=m
# CONFIG_NETFILTER_XT_MATCH_RECENT is not set
CONFIG_NETFILTER_XT_MATCH_SCTP=m
CONFIG_NETFILTER_XT_MATCH_STATISTIC=m
CONFIG_NETFILTER_XT_MATCH_STRING=m
CONFIG_NETFILTER_XT_MATCH_TCPMSS=m
# CONFIG_NETFILTER_XT_MATCH_TIME is not set
# CONFIG_NETFILTER_XT_MATCH_U32 is not set
# CONFIG_IP_SET is not set
CONFIG_IP_VS=m
# CONFIG_IP_VS_IPV6 is not set
# CONFIG_IP_VS_DEBUG is not set
CONFIG_IP_VS_TAB_BITS=12

#
# IPVS transport protocol load balancing support
#
CONFIG_IP_VS_PROTO_TCP=y
CONFIG_IP_VS_PROTO_UDP=y
CONFIG_IP_VS_PROTO_AH_ESP=y
CONFIG_IP_VS_PROTO_ESP=y
CONFIG_IP_VS_PROTO_AH=y
# CONFIG_IP_VS_PROTO_SCTP is not set

#
# IPVS scheduler
#
CONFIG_IP_VS_RR=m
CONFIG_IP_VS_WRR=m
CONFIG_IP_VS_LC=m
CONFIG_IP_VS_WLC=m
CONFIG_IP_VS_LBLC=m
CONFIG_IP_VS_LBLCR=m
CONFIG_IP_VS_DH=m
CONFIG_IP_VS_SH=m
CONFIG_IP_VS_SED=m
CONFIG_IP_VS_NQ=m

#
# IPVS application helper
#

#
# IP: Netfilter Configuration
#
# CONFIG_NF_DEFRAG_IPV4 is not set
CONFIG_IP_NF_QUEUE=m
CONFIG_IP_NF_IPTABLES=m
CONFIG_IP_NF_MATCH_AH=m
CONFIG_IP_NF_MATCH_ECN=m
CONFIG_IP_NF_MATCH_TTL=m
CONFIG_IP_NF_FILTER=m
CONFIG_IP_NF_TARGET_REJECT=m
CONFIG_IP_NF_TARGET_LOG=m
CONFIG_IP_NF_TARGET_ULOG=m
CONFIG_IP_NF_MANGLE=m
CONFIG_IP_NF_TARGET_ECN=m
CONFIG_IP_NF_TARGET_TTL=m
CONFIG_IP_NF_RAW=m
CONFIG_IP_NF_ARPTABLES=m
CONFIG_IP_NF_ARPFILTER=m
CONFIG_IP_NF_ARP_MANGLE=m

#
# IPv6: Netfilter Configuration
#
# CONFIG_NF_DEFRAG_IPV6 is not set
CONFIG_IP6_NF_QUEUE=m
CONFIG_IP6_NF_IPTABLES=m
CONFIG_IP6_NF_MATCH_AH=m
CONFIG_IP6_NF_MATCH_EUI64=m
CONFIG_IP6_NF_MATCH_FRAG=m
CONFIG_IP6_NF_MATCH_OPTS=m
CONFIG_IP6_NF_MATCH_HL=m
CONFIG_IP6_NF_MATCH_IPV6HEADER=m
# CONFIG_IP6_NF_MATCH_MH is not set
CONFIG_IP6_NF_MATCH_RT=m
CONFIG_IP6_NF_TARGET_HL=m
CONFIG_IP6_NF_TARGET_LOG=m
CONFIG_IP6_NF_FILTER=m
CONFIG_IP6_NF_TARGET_REJECT=m
CONFIG_IP6_NF_MANGLE=m
CONFIG_IP6_NF_RAW=m
CONFIG_BRIDGE_NF_EBTABLES=m
CONFIG_BRIDGE_EBT_BROUTE=m
CONFIG_BRIDGE_EBT_T_FILTER=m
CONFIG_BRIDGE_EBT_T_NAT=m
CONFIG_BRIDGE_EBT_802_3=m
CONFIG_BRIDGE_EBT_AMONG=m
CONFIG_BRIDGE_EBT_ARP=m
CONFIG_BRIDGE_EBT_IP=m
# CONFIG_BRIDGE_EBT_IP6 is not set
CONFIG_BRIDGE_EBT_LIMIT=m
CONFIG_BRIDGE_EBT_MARK=m
CONFIG_BRIDGE_EBT_PKTTYPE=m
CONFIG_BRIDGE_EBT_STP=m
CONFIG_BRIDGE_EBT_VLAN=m
CONFIG_BRIDGE_EBT_ARPREPLY=m
CONFIG_BRIDGE_EBT_DNAT=m
CONFIG_BRIDGE_EBT_MARK_T=m
CONFIG_BRIDGE_EBT_REDIRECT=m
CONFIG_BRIDGE_EBT_SNAT=m
CONFIG_BRIDGE_EBT_LOG=m
CONFIG_BRIDGE_EBT_ULOG=m
# CONFIG_BRIDGE_EBT_NFLOG is not set
CONFIG_IP_DCCP=m
CONFIG_INET_DCCP_DIAG=m

#
# DCCP CCIDs Configuration (EXPERIMENTAL)
#
# CONFIG_IP_DCCP_CCID2_DEBUG is not set
CONFIG_IP_DCCP_CCID3=y
# CONFIG_IP_DCCP_CCID3_DEBUG is not set
CONFIG_IP_DCCP_TFRC_LIB=y

#
# DCCP Kernel Hacking
#
# CONFIG_IP_DCCP_DEBUG is not set
# CONFIG_NET_DCCPPROBE is not set
CONFIG_IP_SCTP=m
# CONFIG_NET_SCTPPROBE is not set
# CONFIG_SCTP_DBG_MSG is not set
# CONFIG_SCTP_DBG_OBJCNT is not set
# CONFIG_SCTP_HMAC_NONE is not set
# CONFIG_SCTP_HMAC_SHA1 is not set
CONFIG_SCTP_HMAC_MD5=y
# CONFIG_RDS is not set
CONFIG_TIPC=m
# CONFIG_TIPC_ADVANCED is not set
# CONFIG_TIPC_DEBUG is not set
CONFIG_ATM=m
CONFIG_ATM_CLIP=m
# CONFIG_ATM_CLIP_NO_ICMP is not set
CONFIG_ATM_LANE=m
# CONFIG_ATM_MPOA is not set
CONFIG_ATM_BR2684=m
# CONFIG_ATM_BR2684_IPFILTER is not set
# CONFIG_L2TP is not set
CONFIG_STP=m
CONFIG_BRIDGE=m
CONFIG_BRIDGE_IGMP_SNOOPING=y
# CONFIG_NET_DSA is not set
CONFIG_VLAN_8021Q=m
# CONFIG_VLAN_8021Q_GVRP is not set
# CONFIG_DECNET is not set
CONFIG_LLC=y
# CONFIG_LLC2 is not set
# CONFIG_IPX is not set
# CONFIG_ATALK is not set
# CONFIG_X25 is not set
# CONFIG_LAPB is not set
# CONFIG_ECONET is not set
# CONFIG_WAN_ROUTER is not set
# CONFIG_PHONET is not set
# CONFIG_IEEE802154 is not set
CONFIG_NET_SCHED=y

#
# Queueing/Scheduling
#
CONFIG_NET_SCH_CBQ=m
CONFIG_NET_SCH_HTB=m
CONFIG_NET_SCH_HFSC=m
CONFIG_NET_SCH_ATM=m
CONFIG_NET_SCH_PRIO=m
# CONFIG_NET_SCH_MULTIQ is not set
CONFIG_NET_SCH_RED=m
# CONFIG_NET_SCH_SFB is not set
CONFIG_NET_SCH_SFQ=m
CONFIG_NET_SCH_TEQL=m
CONFIG_NET_SCH_TBF=m
CONFIG_NET_SCH_GRED=m
CONFIG_NET_SCH_DSMARK=m
CONFIG_NET_SCH_NETEM=m
# CONFIG_NET_SCH_DRR is not set
# CONFIG_NET_SCH_MQPRIO is not set
# CONFIG_NET_SCH_CHOKE is not set
# CONFIG_NET_SCH_QFQ is not set
CONFIG_NET_SCH_INGRESS=m

#
# Classification
#
CONFIG_NET_CLS=y
CONFIG_NET_CLS_BASIC=m
CONFIG_NET_CLS_TCINDEX=m
CONFIG_NET_CLS_ROUTE4=m
CONFIG_NET_CLS_FW=m
CONFIG_NET_CLS_U32=m
CONFIG_CLS_U32_PERF=y
CONFIG_CLS_U32_MARK=y
CONFIG_NET_CLS_RSVP=m
CONFIG_NET_CLS_RSVP6=m
# CONFIG_NET_CLS_FLOW is not set
CONFIG_NET_EMATCH=y
CONFIG_NET_EMATCH_STACK=32
CONFIG_NET_EMATCH_CMP=m
CONFIG_NET_EMATCH_NBYTE=m
CONFIG_NET_EMATCH_U32=m
CONFIG_NET_EMATCH_META=m
CONFIG_NET_EMATCH_TEXT=m
CONFIG_NET_CLS_ACT=y
CONFIG_NET_ACT_POLICE=m
CONFIG_NET_ACT_GACT=m
CONFIG_GACT_PROB=y
CONFIG_NET_ACT_MIRRED=m
CONFIG_NET_ACT_IPT=m
# CONFIG_NET_ACT_NAT is not set
CONFIG_NET_ACT_PEDIT=m
CONFIG_NET_ACT_SIMP=m
# CONFIG_NET_ACT_SKBEDIT is not set
# CONFIG_NET_ACT_CSUM is not set
CONFIG_NET_CLS_IND=y
CONFIG_NET_SCH_FIFO=y
# CONFIG_DCB is not set
CONFIG_DNS_RESOLVER=y
# CONFIG_BATMAN_ADV is not set
CONFIG_RPS=y
CONFIG_RFS_ACCEL=y
CONFIG_XPS=y

#
# Network testing
#
CONFIG_NET_PKTGEN=m
# CONFIG_NET_TCPPROBE is not set
# CONFIG_NET_DROP_MONITOR is not set
# CONFIG_HAMRADIO is not set
# CONFIG_CAN is not set
# CONFIG_IRDA is not set
CONFIG_BT=m
# CONFIG_BT_L2CAP is not set
# CONFIG_BT_SCO is not set

#
# Bluetooth device drivers
#
# CONFIG_BT_HCIBTUSB is not set
# CONFIG_BT_HCIBTSDIO is not set
CONFIG_BT_HCIUART=m
CONFIG_BT_HCIUART_H4=y
CONFIG_BT_HCIUART_BCSP=y
# CONFIG_BT_HCIUART_ATH3K is not set
# CONFIG_BT_HCIUART_LL is not set
CONFIG_BT_HCIBCM203X=m
CONFIG_BT_HCIBPA10X=m
CONFIG_BT_HCIBFUSB=m
CONFIG_BT_HCIDTL1=m
CONFIG_BT_HCIBT3C=m
CONFIG_BT_HCIBLUECARD=m
CONFIG_BT_HCIBTUART=m
CONFIG_BT_HCIVHCI=m
# CONFIG_BT_MRVL is not set
# CONFIG_AF_RXRPC is not set
CONFIG_FIB_RULES=y
CONFIG_WIRELESS=y
CONFIG_WIRELESS_EXT=y
CONFIG_WEXT_CORE=y
CONFIG_WEXT_PROC=y
CONFIG_WEXT_SPY=y
CONFIG_WEXT_PRIV=y
CONFIG_CFG80211=m
# CONFIG_NL80211_TESTMODE is not set
# CONFIG_CFG80211_DEVELOPER_WARNINGS is not set
# CONFIG_CFG80211_REG_DEBUG is not set
CONFIG_CFG80211_DEFAULT_PS=y
# CONFIG_CFG80211_DEBUGFS is not set
# CONFIG_CFG80211_INTERNAL_REGDB is not set
CONFIG_CFG80211_WEXT=y
CONFIG_WIRELESS_EXT_SYSFS=y
CONFIG_LIB80211=m
CONFIG_LIB80211_CRYPT_WEP=m
CONFIG_LIB80211_CRYPT_CCMP=m
CONFIG_LIB80211_CRYPT_TKIP=m
# CONFIG_LIB80211_DEBUG is not set
CONFIG_MAC80211=m
CONFIG_MAC80211_HAS_RC=y
CONFIG_MAC80211_RC_MINSTREL=y
CONFIG_MAC80211_RC_MINSTREL_HT=y
CONFIG_MAC80211_RC_DEFAULT_MINSTREL=y
CONFIG_MAC80211_RC_DEFAULT="minstrel_ht"
# CONFIG_MAC80211_MESH is not set
CONFIG_MAC80211_LEDS=y
# CONFIG_MAC80211_DEBUGFS is not set
# CONFIG_MAC80211_DEBUG_MENU is not set
# CONFIG_WIMAX is not set
# CONFIG_RFKILL is not set
# CONFIG_NET_9P is not set
# CONFIG_CAIF is not set
# CONFIG_CEPH_LIB is not set

#
# Device Drivers
#

#
# Generic Driver Options
#
CONFIG_UEVENT_HELPER_PATH=""
# CONFIG_DEVTMPFS is not set
CONFIG_STANDALONE=y
CONFIG_PREVENT_FIRMWARE_BUILD=y
CONFIG_FW_LOADER=y
CONFIG_FIRMWARE_IN_KERNEL=y
CONFIG_EXTRA_FIRMWARE=""
# CONFIG_DEBUG_DRIVER is not set
# CONFIG_DEBUG_DEVRES is not set
# CONFIG_SYS_HYPERVISOR is not set
CONFIG_CONNECTOR=y
CONFIG_PROC_EVENTS=y
CONFIG_MTD=m
# CONFIG_MTD_DEBUG is not set
# CONFIG_MTD_TESTS is not set
CONFIG_MTD_REDBOOT_PARTS=m
CONFIG_MTD_REDBOOT_DIRECTORY_BLOCK=-1
# CONFIG_MTD_REDBOOT_PARTS_UNALLOCATED is not set
# CONFIG_MTD_REDBOOT_PARTS_READONLY is not set
CONFIG_MTD_OF_PARTS=y
# CONFIG_MTD_AR7_PARTS is not set

#
# User Modules And Translation Layers
#
CONFIG_MTD_CHAR=m
CONFIG_MTD_BLKDEVS=m
CONFIG_MTD_BLOCK=m
CONFIG_MTD_BLOCK_RO=m
CONFIG_FTL=m
CONFIG_NFTL=m
CONFIG_NFTL_RW=y
# CONFIG_INFTL is not set
CONFIG_RFD_FTL=m
# CONFIG_SSFDC is not set
# CONFIG_SM_FTL is not set
# CONFIG_MTD_OOPS is not set
# CONFIG_MTD_SWAP is not set

#
# RAM/ROM/Flash chip drivers
#
CONFIG_MTD_CFI=m
CONFIG_MTD_JEDECPROBE=m
CONFIG_MTD_GEN_PROBE=m
# CONFIG_MTD_CFI_ADV_OPTIONS is not set
CONFIG_MTD_MAP_BANK_WIDTH_1=y
CONFIG_MTD_MAP_BANK_WIDTH_2=y
CONFIG_MTD_MAP_BANK_WIDTH_4=y
# CONFIG_MTD_MAP_BANK_WIDTH_8 is not set
# CONFIG_MTD_MAP_BANK_WIDTH_16 is not set
# CONFIG_MTD_MAP_BANK_WIDTH_32 is not set
CONFIG_MTD_CFI_I1=y
CONFIG_MTD_CFI_I2=y
# CONFIG_MTD_CFI_I4 is not set
# CONFIG_MTD_CFI_I8 is not set
CONFIG_MTD_CFI_INTELEXT=m
CONFIG_MTD_CFI_AMDSTD=m
CONFIG_MTD_CFI_STAA=m
CONFIG_MTD_CFI_UTIL=m
CONFIG_MTD_RAM=m
CONFIG_MTD_ROM=m
CONFIG_MTD_ABSENT=m

#
# Mapping drivers for chip access
#
# CONFIG_MTD_COMPLEX_MAPPINGS is not set
# CONFIG_MTD_PHYSMAP is not set
# CONFIG_MTD_PHYSMAP_OF is not set
# CONFIG_MTD_INTEL_VR_NOR is not set
# CONFIG_MTD_PLATRAM is not set

#
# Self-contained MTD device drivers
#
# CONFIG_MTD_PMC551 is not set
# CONFIG_MTD_SLRAM is not set
# CONFIG_MTD_PHRAM is not set
CONFIG_MTD_MTDRAM=m
CONFIG_MTDRAM_TOTAL_SIZE=4096
CONFIG_MTDRAM_ERASE_SIZE=128
CONFIG_MTD_BLOCK2MTD=m

#
# Disk-On-Chip Device Drivers
#
# CONFIG_MTD_DOC2000 is not set
# CONFIG_MTD_DOC2001 is not set
# CONFIG_MTD_DOC2001PLUS is not set
CONFIG_MTD_NAND_ECC=m
CONFIG_MTD_NAND_ECC_SMC=y
CONFIG_MTD_NAND=m
# CONFIG_MTD_NAND_VERIFY_WRITE is not set
# CONFIG_MTD_NAND_ECC_BCH is not set
# CONFIG_MTD_SM_COMMON is not set
# CONFIG_MTD_NAND_MUSEUM_IDS is not set
# CONFIG_MTD_NAND_DENALI is not set
CONFIG_MTD_NAND_IDS=m
# CONFIG_MTD_NAND_RICOH is not set
CONFIG_MTD_NAND_DISKONCHIP=m
# CONFIG_MTD_NAND_DISKONCHIP_PROBE_ADVANCED is not set
CONFIG_MTD_NAND_DISKONCHIP_PROBE_ADDRESS=0
# CONFIG_MTD_NAND_DISKONCHIP_BBTWRITE is not set
# CONFIG_MTD_NAND_CAFE is not set
CONFIG_MTD_NAND_NANDSIM=m
# CONFIG_MTD_NAND_PLATFORM is not set
# CONFIG_MTD_ALAUDA is not set
# CONFIG_MTD_NAND_FSL_ELBC is not set
# CONFIG_MTD_NAND_FSL_UPM is not set
# CONFIG_MTD_ONENAND is not set

#
# LPDDR flash memory drivers
#
# CONFIG_MTD_LPDDR is not set
# CONFIG_MTD_UBI is not set
CONFIG_DTC=y
CONFIG_OF=y

#
# Device Tree and Open Firmware support
#
# CONFIG_PROC_DEVICETREE is not set
CONFIG_OF_FLATTREE=y
CONFIG_OF_EARLY_FLATTREE=y
CONFIG_OF_DYNAMIC=y
CONFIG_OF_ADDRESS=y
CONFIG_OF_IRQ=y
CONFIG_OF_DEVICE=y
CONFIG_OF_I2C=m
CONFIG_OF_NET=y
CONFIG_OF_MDIO=y
CONFIG_OF_PCI=y
CONFIG_PARPORT=m
CONFIG_PARPORT_PC=m
CONFIG_PARPORT_SERIAL=m
# CONFIG_PARPORT_PC_FIFO is not set
# CONFIG_PARPORT_PC_SUPERIO is not set
CONFIG_PARPORT_PC_PCMCIA=m
# CONFIG_PARPORT_GSC is not set
# CONFIG_PARPORT_AX88796 is not set
CONFIG_PARPORT_1284=y
CONFIG_PARPORT_NOT_PC=y
CONFIG_BLK_DEV=y
CONFIG_BLK_DEV_FD=m
CONFIG_PARIDE=m

#
# Parallel IDE high-level drivers
#
CONFIG_PARIDE_PD=m
CONFIG_PARIDE_PCD=m
CONFIG_PARIDE_PF=m
CONFIG_PARIDE_PT=m
CONFIG_PARIDE_PG=m

#
# Parallel IDE protocol modules
#
CONFIG_PARIDE_ATEN=m
CONFIG_PARIDE_BPCK=m
# CONFIG_PARIDE_BPCK6 is not set
CONFIG_PARIDE_COMM=m
CONFIG_PARIDE_DSTR=m
CONFIG_PARIDE_FIT2=m
CONFIG_PARIDE_FIT3=m
CONFIG_PARIDE_EPAT=m
CONFIG_PARIDE_EPATC8=y
CONFIG_PARIDE_EPIA=m
CONFIG_PARIDE_FRIQ=m
CONFIG_PARIDE_FRPW=m
CONFIG_PARIDE_KBIC=m
CONFIG_PARIDE_KTTI=m
CONFIG_PARIDE_ON20=m
CONFIG_PARIDE_ON26=m
CONFIG_BLK_CPQ_DA=m
CONFIG_BLK_CPQ_CISS_DA=m
CONFIG_CISS_SCSI_TAPE=y
CONFIG_BLK_DEV_DAC960=m
# CONFIG_BLK_DEV_UMEM is not set
# CONFIG_BLK_DEV_COW_COMMON is not set
CONFIG_BLK_DEV_LOOP=m
CONFIG_BLK_DEV_CRYPTOLOOP=m
# CONFIG_BLK_DEV_DRBD is not set
CONFIG_BLK_DEV_NBD=m
CONFIG_BLK_DEV_SX8=m
# CONFIG_BLK_DEV_UB is not set
CONFIG_BLK_DEV_RAM=y
CONFIG_BLK_DEV_RAM_COUNT=16
CONFIG_BLK_DEV_RAM_SIZE=16384
# CONFIG_BLK_DEV_XIP is not set
CONFIG_CDROM_PKTCDVD=m
CONFIG_CDROM_PKTCDVD_BUFFERS=8
# CONFIG_CDROM_PKTCDVD_WCACHE is not set
CONFIG_ATA_OVER_ETH=m
# CONFIG_BLK_DEV_HD is not set
# CONFIG_BLK_DEV_RBD is not set
# CONFIG_SENSORS_LIS3LV02D is not set
# CONFIG_MISC_DEVICES is not set
CONFIG_EEPROM_93CX6=m
CONFIG_HAVE_IDE=y
CONFIG_IDE=y

#
# Please see Documentation/ide/ide.txt for help/info on IDE drives
#
CONFIG_IDE_XFER_MODE=y
CONFIG_IDE_TIMINGS=y
CONFIG_IDE_ATAPI=y
# CONFIG_BLK_DEV_IDE_SATA is not set
CONFIG_IDE_GD=y
CONFIG_IDE_GD_ATA=y
# CONFIG_IDE_GD_ATAPI is not set
CONFIG_BLK_DEV_IDECS=m
# CONFIG_BLK_DEV_DELKIN is not set
CONFIG_BLK_DEV_IDECD=m
CONFIG_BLK_DEV_IDECD_VERBOSE_ERRORS=y
# CONFIG_BLK_DEV_IDETAPE is not set
CONFIG_IDE_TASK_IOCTL=y
CONFIG_IDE_PROC_FS=y

#
# IDE chipset support/bugfixes
#
# CONFIG_BLK_DEV_PLATFORM is not set
CONFIG_BLK_DEV_IDEDMA_SFF=y

#
# PCI IDE chipsets support
#
CONFIG_BLK_DEV_IDEPCI=y
CONFIG_IDEPCI_PCIBUS_ORDER=y
# CONFIG_BLK_DEV_OFFBOARD is not set
CONFIG_BLK_DEV_GENERIC=y
# CONFIG_BLK_DEV_OPTI621 is not set
CONFIG_BLK_DEV_IDEDMA_PCI=y
CONFIG_BLK_DEV_AEC62XX=y
CONFIG_BLK_DEV_ALI15X3=y
CONFIG_BLK_DEV_AMD74XX=y
CONFIG_BLK_DEV_CMD64X=y
# CONFIG_BLK_DEV_TRIFLEX is not set
# CONFIG_BLK_DEV_CS5520 is not set
# CONFIG_BLK_DEV_CS5530 is not set
CONFIG_BLK_DEV_HPT366=y
# CONFIG_BLK_DEV_JMICRON is not set
# CONFIG_BLK_DEV_SC1200 is not set
CONFIG_BLK_DEV_PIIX=y
# CONFIG_BLK_DEV_IT8172 is not set
# CONFIG_BLK_DEV_IT8213 is not set
CONFIG_BLK_DEV_IT821X=y
# CONFIG_BLK_DEV_NS87415 is not set
CONFIG_BLK_DEV_PDC202XX_OLD=y
CONFIG_BLK_DEV_PDC202XX_NEW=y
CONFIG_BLK_DEV_SVWKS=y
CONFIG_BLK_DEV_SIIMAGE=y
# CONFIG_BLK_DEV_SL82C105 is not set
# CONFIG_BLK_DEV_SLC90E66 is not set
# CONFIG_BLK_DEV_TRM290 is not set
CONFIG_BLK_DEV_VIA82CXXX=y
# CONFIG_BLK_DEV_TC86C001 is not set
CONFIG_BLK_DEV_IDEDMA=y

#
# SCSI device support
#
CONFIG_SCSI_MOD=m
CONFIG_RAID_ATTRS=m
CONFIG_SCSI=m
CONFIG_SCSI_DMA=y
# CONFIG_SCSI_TGT is not set
CONFIG_SCSI_NETLINK=y
CONFIG_SCSI_PROC_FS=y

#
# SCSI support type (disk, tape, CD-ROM)
#
CONFIG_BLK_DEV_SD=m
CONFIG_CHR_DEV_ST=m
CONFIG_CHR_DEV_OSST=m
CONFIG_BLK_DEV_SR=m
CONFIG_BLK_DEV_SR_VENDOR=y
CONFIG_CHR_DEV_SG=m
CONFIG_CHR_DEV_SCH=m
CONFIG_SCSI_MULTI_LUN=y
CONFIG_SCSI_CONSTANTS=y
CONFIG_SCSI_LOGGING=y
# CONFIG_SCSI_SCAN_ASYNC is not set
CONFIG_SCSI_WAIT_SCAN=m

#
# SCSI Transports
#
CONFIG_SCSI_SPI_ATTRS=m
CONFIG_SCSI_FC_ATTRS=m
CONFIG_SCSI_ISCSI_ATTRS=m
CONFIG_SCSI_SAS_ATTRS=m
CONFIG_SCSI_SAS_LIBSAS=m
CONFIG_SCSI_SAS_ATA=y
CONFIG_SCSI_SAS_HOST_SMP=y
CONFIG_SCSI_SRP_ATTRS=m
CONFIG_SCSI_LOWLEVEL=y
CONFIG_ISCSI_TCP=m
# CONFIG_ISCSI_BOOT_SYSFS is not set
# CONFIG_SCSI_CXGB3_ISCSI is not set
# CONFIG_SCSI_CXGB4_ISCSI is not set
# CONFIG_SCSI_BNX2_ISCSI is not set
# CONFIG_SCSI_BNX2X_FCOE is not set
# CONFIG_BE2ISCSI is not set
CONFIG_BLK_DEV_3W_XXXX_RAID=m
# CONFIG_SCSI_HPSA is not set
CONFIG_SCSI_3W_9XXX=m
# CONFIG_SCSI_3W_SAS is not set
CONFIG_SCSI_ACARD=m
CONFIG_SCSI_AACRAID=m
CONFIG_SCSI_AIC7XXX=m
CONFIG_AIC7XXX_CMDS_PER_DEVICE=4
CONFIG_AIC7XXX_RESET_DELAY_MS=15000
# CONFIG_AIC7XXX_DEBUG_ENABLE is not set
CONFIG_AIC7XXX_DEBUG_MASK=0
# CONFIG_AIC7XXX_REG_PRETTY_PRINT is not set
CONFIG_SCSI_AIC7XXX_OLD=m
CONFIG_SCSI_AIC79XX=m
CONFIG_AIC79XX_CMDS_PER_DEVICE=4
CONFIG_AIC79XX_RESET_DELAY_MS=15000
# CONFIG_AIC79XX_DEBUG_ENABLE is not set
CONFIG_AIC79XX_DEBUG_MASK=0
# CONFIG_AIC79XX_REG_PRETTY_PRINT is not set
CONFIG_SCSI_AIC94XX=m
# CONFIG_AIC94XX_DEBUG is not set
# CONFIG_SCSI_MVSAS is not set
# CONFIG_SCSI_DPT_I2O is not set
# CONFIG_SCSI_ADVANSYS is not set
CONFIG_SCSI_ARCMSR=m
# CONFIG_SCSI_ARCMSR_AER is not set
CONFIG_MEGARAID_NEWGEN=y
CONFIG_MEGARAID_MM=m
CONFIG_MEGARAID_MAILBOX=m
CONFIG_MEGARAID_LEGACY=m
CONFIG_MEGARAID_SAS=m
# CONFIG_SCSI_MPT2SAS is not set
CONFIG_SCSI_HPTIOP=m
# CONFIG_SCSI_BUSLOGIC is not set
CONFIG_LIBFC=m
CONFIG_LIBFCOE=m
CONFIG_FCOE=m
# CONFIG_SCSI_DMX3191D is not set
# CONFIG_SCSI_EATA is not set
# CONFIG_SCSI_FUTURE_DOMAIN is not set
CONFIG_SCSI_GDTH=m
CONFIG_SCSI_IPS=m
CONFIG_SCSI_INITIO=m
# CONFIG_SCSI_INIA100 is not set
CONFIG_SCSI_PPA=m
CONFIG_SCSI_IMM=m
# CONFIG_SCSI_IZIP_EPP16 is not set
# CONFIG_SCSI_IZIP_SLOW_CTR is not set
CONFIG_SCSI_STEX=m
CONFIG_SCSI_SYM53C8XX_2=m
CONFIG_SCSI_SYM53C8XX_DMA_ADDRESSING_MODE=1
CONFIG_SCSI_SYM53C8XX_DEFAULT_TAGS=16
CONFIG_SCSI_SYM53C8XX_MAX_TAGS=64
CONFIG_SCSI_SYM53C8XX_MMIO=y
# CONFIG_SCSI_IPR is not set
CONFIG_SCSI_QLOGIC_1280=m
CONFIG_SCSI_QLA_FC=m
CONFIG_SCSI_QLA_ISCSI=m
CONFIG_SCSI_LPFC=m
# CONFIG_SCSI_LPFC_DEBUG_FS is not set
CONFIG_SCSI_DC395x=m
# CONFIG_SCSI_DC390T is not set
# CONFIG_SCSI_NSP32 is not set
# CONFIG_SCSI_DEBUG is not set
# CONFIG_SCSI_PMCRAID is not set
# CONFIG_SCSI_PM8001 is not set
# CONFIG_SCSI_SRP is not set
# CONFIG_SCSI_BFA_FC is not set
# CONFIG_SCSI_LOWLEVEL_PCMCIA is not set
CONFIG_SCSI_DH=m
CONFIG_SCSI_DH_RDAC=m
# CONFIG_SCSI_DH_HP_SW is not set
# CONFIG_SCSI_DH_EMC is not set
# CONFIG_SCSI_DH_ALUA is not set
# CONFIG_SCSI_OSD_INITIATOR is not set
CONFIG_ATA=m
# CONFIG_ATA_NONSTANDARD is not set
CONFIG_ATA_VERBOSE_ERROR=y
CONFIG_SATA_PMP=y

#
# Controllers with non-SFF native interface
#
CONFIG_SATA_AHCI=m
# CONFIG_SATA_AHCI_PLATFORM is not set
# CONFIG_SATA_FSL is not set
CONFIG_SATA_INIC162X=m
# CONFIG_SATA_ACARD_AHCI is not set
CONFIG_SATA_SIL24=m
CONFIG_ATA_SFF=y

#
# SFF controllers with custom DMA interface
#
CONFIG_PDC_ADMA=m
CONFIG_SATA_QSTOR=m
CONFIG_SATA_SX4=m
CONFIG_ATA_BMDMA=y

#
# SATA SFF controllers with BMDMA
#
CONFIG_ATA_PIIX=m
CONFIG_SATA_MV=m
CONFIG_SATA_NV=m
CONFIG_SATA_PROMISE=m
CONFIG_SATA_SIL=m
CONFIG_SATA_SIS=m
CONFIG_SATA_SVW=m
CONFIG_SATA_ULI=m
CONFIG_SATA_VIA=m
CONFIG_SATA_VITESSE=m

#
# PATA SFF controllers with BMDMA
#
# CONFIG_PATA_ALI is not set
# CONFIG_PATA_AMD is not set
# CONFIG_PATA_ARASAN_CF is not set
# CONFIG_PATA_ARTOP is not set
# CONFIG_PATA_ATIIXP is not set
# CONFIG_PATA_ATP867X is not set
# CONFIG_PATA_CMD64X is not set
# CONFIG_PATA_CS5520 is not set
# CONFIG_PATA_CS5530 is not set
# CONFIG_PATA_CS5536 is not set
# CONFIG_PATA_CYPRESS is not set
# CONFIG_PATA_EFAR is not set
# CONFIG_PATA_HPT366 is not set
# CONFIG_PATA_HPT37X is not set
# CONFIG_PATA_HPT3X2N is not set
# CONFIG_PATA_HPT3X3 is not set
# CONFIG_PATA_IT8213 is not set
# CONFIG_PATA_IT821X is not set
# CONFIG_PATA_JMICRON is not set
CONFIG_PATA_MARVELL=m
# CONFIG_PATA_NETCELL is not set
# CONFIG_PATA_NINJA32 is not set
# CONFIG_PATA_NS87415 is not set
# CONFIG_PATA_OLDPIIX is not set
# CONFIG_PATA_OPTIDMA is not set
CONFIG_PATA_PDC2027X=m
# CONFIG_PATA_PDC_OLD is not set
# CONFIG_PATA_RADISYS is not set
# CONFIG_PATA_RDC is not set
# CONFIG_PATA_SC1200 is not set
# CONFIG_PATA_SCH is not set
# CONFIG_PATA_SERVERWORKS is not set
CONFIG_PATA_SIL680=m
CONFIG_PATA_SIS=m
# CONFIG_PATA_TOSHIBA is not set
# CONFIG_PATA_TRIFLEX is not set
# CONFIG_PATA_VIA is not set
# CONFIG_PATA_WINBOND is not set

#
# PIO-only SFF controllers
#
# CONFIG_PATA_CMD640_PCI is not set
# CONFIG_PATA_MPIIX is not set
# CONFIG_PATA_NS87410 is not set
# CONFIG_PATA_OPTI is not set
# CONFIG_PATA_PCMCIA is not set
# CONFIG_PATA_PLATFORM is not set
# CONFIG_PATA_RZ1000 is not set

#
# Generic fallback / legacy drivers
#
# CONFIG_ATA_GENERIC is not set
# CONFIG_PATA_LEGACY is not set
CONFIG_MD=y
CONFIG_BLK_DEV_MD=y
CONFIG_MD_AUTODETECT=y
CONFIG_MD_LINEAR=m
CONFIG_MD_RAID0=m
CONFIG_MD_RAID1=m
CONFIG_MD_RAID10=m
CONFIG_MD_RAID456=m
# CONFIG_MULTICORE_RAID456 is not set
CONFIG_MD_MULTIPATH=m
CONFIG_MD_FAULTY=m
CONFIG_BLK_DEV_DM=m
# CONFIG_DM_DEBUG is not set
CONFIG_DM_CRYPT=m
CONFIG_DM_SNAPSHOT=m
CONFIG_DM_MIRROR=m
# CONFIG_DM_RAID is not set
# CONFIG_DM_LOG_USERSPACE is not set
CONFIG_DM_ZERO=m
CONFIG_DM_MULTIPATH=m
# CONFIG_DM_MULTIPATH_QL is not set
# CONFIG_DM_MULTIPATH_ST is not set
# CONFIG_DM_DELAY is not set
CONFIG_DM_UEVENT=y
# CONFIG_DM_FLAKEY is not set
# CONFIG_TARGET_CORE is not set
CONFIG_FUSION=y
CONFIG_FUSION_SPI=m
CONFIG_FUSION_FC=m
CONFIG_FUSION_SAS=m
CONFIG_FUSION_MAX_SGE=128
CONFIG_FUSION_CTL=m
CONFIG_FUSION_LAN=m
CONFIG_FUSION_LOGGING=y

#
# IEEE 1394 (FireWire) support
#
CONFIG_FIREWIRE=m
CONFIG_FIREWIRE_OHCI=m
CONFIG_FIREWIRE_OHCI_DEBUG=y
CONFIG_FIREWIRE_SBP2=m
# CONFIG_FIREWIRE_NET is not set
# CONFIG_FIREWIRE_NOSY is not set
CONFIG_I2O=m
# CONFIG_I2O_LCT_NOTIFY_ON_CHANGES is not set
CONFIG_I2O_EXT_ADAPTEC=y
CONFIG_I2O_CONFIG=m
CONFIG_I2O_CONFIG_OLD_IOCTL=y
CONFIG_I2O_BUS=m
CONFIG_I2O_BLOCK=m
CONFIG_I2O_SCSI=m
CONFIG_I2O_PROC=m
# CONFIG_MACINTOSH_DRIVERS is not set
CONFIG_NETDEVICES=y
CONFIG_IFB=m
CONFIG_DUMMY=m
CONFIG_BONDING=m
# CONFIG_MACVLAN is not set
# CONFIG_EQUALIZER is not set
CONFIG_TUN=m
# CONFIG_VETH is not set
# CONFIG_ARCNET is not set
CONFIG_MII=m
CONFIG_PHYLIB=y

#
# MII PHY device drivers
#
CONFIG_MARVELL_PHY=m
CONFIG_DAVICOM_PHY=m
CONFIG_QSEMI_PHY=m
CONFIG_LXT_PHY=m
CONFIG_CICADA_PHY=m
CONFIG_VITESSE_PHY=m
CONFIG_SMSC_PHY=m
# CONFIG_BROADCOM_PHY is not set
# CONFIG_ICPLUS_PHY is not set
# CONFIG_REALTEK_PHY is not set
# CONFIG_NATIONAL_PHY is not set
# CONFIG_STE10XP is not set
# CONFIG_LSI_ET1011C_PHY is not set
# CONFIG_MICREL_PHY is not set
# CONFIG_FIXED_PHY is not set
# CONFIG_MDIO_BITBANG is not set
CONFIG_NET_ETHERNET=y
CONFIG_HAPPYMEAL=m
CONFIG_SUNGEM=m
CONFIG_CASSINI=m
CONFIG_NET_VENDOR_3COM=y
CONFIG_VORTEX=m
CONFIG_TYPHOON=m
# CONFIG_ETHOC is not set
# CONFIG_DNET is not set
CONFIG_NET_TULIP=y
CONFIG_DE2104X=m
CONFIG_DE2104X_DSL=0
CONFIG_TULIP=m
# CONFIG_TULIP_MWI is not set
CONFIG_TULIP_MMIO=y
# CONFIG_TULIP_NAPI is not set
CONFIG_DE4X5=m
CONFIG_WINBOND_840=m
CONFIG_DM9102=m
CONFIG_ULI526X=m
CONFIG_PCMCIA_XIRCOM=m
# CONFIG_HP100 is not set
# CONFIG_IBM_NEW_EMAC_ZMII is not set
# CONFIG_IBM_NEW_EMAC_RGMII is not set
# CONFIG_IBM_NEW_EMAC_TAH is not set
# CONFIG_IBM_NEW_EMAC_EMAC4 is not set
# CONFIG_IBM_NEW_EMAC_NO_FLOW_CTRL is not set
# CONFIG_IBM_NEW_EMAC_MAL_CLR_ICINTSTAT is not set
# CONFIG_IBM_NEW_EMAC_MAL_COMMON_ERR is not set
CONFIG_NET_PCI=y
CONFIG_PCNET32=m
CONFIG_AMD8111_ETH=m
CONFIG_ADAPTEC_STARFIRE=m
# CONFIG_KSZ884X_PCI is not set
CONFIG_B44=m
CONFIG_B44_PCI_AUTOSELECT=y
CONFIG_B44_PCICORE_AUTOSELECT=y
CONFIG_B44_PCI=y
CONFIG_FORCEDETH=m
CONFIG_E100=m
CONFIG_FEALNX=m
CONFIG_NATSEMI=m
CONFIG_NE2K_PCI=m
CONFIG_8139CP=m
CONFIG_8139TOO=m
# CONFIG_8139TOO_PIO is not set
# CONFIG_8139TOO_TUNE_TWISTER is not set
CONFIG_8139TOO_8129=y
# CONFIG_8139_OLD_RX_RESET is not set
# CONFIG_R6040 is not set
CONFIG_SIS900=m
CONFIG_EPIC100=m
# CONFIG_SMSC9420 is not set
CONFIG_SUNDANCE=m
# CONFIG_SUNDANCE_MMIO is not set
# CONFIG_TLAN is not set
# CONFIG_KS8851_MLL is not set
CONFIG_VIA_RHINE=m
CONFIG_VIA_RHINE_MMIO=y
# CONFIG_SC92031 is not set
CONFIG_NET_POCKET=y
# CONFIG_DE600 is not set
# CONFIG_DE620 is not set
# CONFIG_ATL2 is not set
# CONFIG_XILINX_EMACLITE is not set
CONFIG_NETDEV_1000=y
CONFIG_ACENIC=m
# CONFIG_ACENIC_OMIT_TIGON_I is not set
CONFIG_DL2K=m
CONFIG_E1000=m
CONFIG_E1000E=m
# CONFIG_IP1000 is not set
CONFIG_IGB=m
# CONFIG_IGBVF is not set
CONFIG_NS83820=m
# CONFIG_HAMACHI is not set
# CONFIG_YELLOWFIN is not set
CONFIG_R8169=m
CONFIG_SIS190=m
CONFIG_SKGE=m
# CONFIG_SKGE_DEBUG is not set
CONFIG_SKY2=m
# CONFIG_SKY2_DEBUG is not set
CONFIG_VIA_VELOCITY=m
CONFIG_TIGON3=m
CONFIG_BNX2=m
# CONFIG_CNIC is not set
CONFIG_FSL_PQ_MDIO=y
CONFIG_GIANFAR=y
# CONFIG_MV643XX_ETH is not set
# CONFIG_XILINX_LL_TEMAC is not set
CONFIG_QLA3XXX=m
# CONFIG_ATL1 is not set
# CONFIG_ATL1E is not set
# CONFIG_ATL1C is not set
# CONFIG_JME is not set
# CONFIG_STMMAC_ETH is not set
# CONFIG_PCH_GBE is not set
CONFIG_NETDEV_10000=y
CONFIG_MDIO=m
CONFIG_CHELSIO_T1=m
# CONFIG_CHELSIO_T1_1G is not set
CONFIG_CHELSIO_T3=m
# CONFIG_CHELSIO_T4 is not set
# CONFIG_CHELSIO_T4VF is not set
CONFIG_ENIC=m
CONFIG_IXGBE=m
# CONFIG_IXGBEVF is not set
CONFIG_IXGB=m
CONFIG_S2IO=m
# CONFIG_VXGE is not set
CONFIG_MYRI10GE=m
CONFIG_NETXEN_NIC=m
CONFIG_NIU=m
# CONFIG_MLX4_EN is not set
CONFIG_MLX4_CORE=m
CONFIG_MLX4_DEBUG=y
# CONFIG_TEHUTI is not set
CONFIG_BNX2X=m
# CONFIG_QLCNIC is not set
# CONFIG_QLGE is not set
# CONFIG_BNA is not set
# CONFIG_SFC is not set
# CONFIG_BE2NET is not set
CONFIG_TR=y
CONFIG_IBMOL=m
# CONFIG_IBMLS is not set
CONFIG_3C359=m
# CONFIG_TMS380TR is not set
CONFIG_WLAN=y
# CONFIG_PCMCIA_RAYCS is not set
# CONFIG_LIBERTAS_THINFIRM is not set
CONFIG_AIRO=m
CONFIG_ATMEL=m
CONFIG_PCI_ATMEL=m
CONFIG_PCMCIA_ATMEL=m
# CONFIG_AT76C50X_USB is not set
CONFIG_AIRO_CS=m
CONFIG_PCMCIA_WL3501=m
CONFIG_PRISM54=m
CONFIG_USB_ZD1201=m
# CONFIG_USB_NET_RNDIS_WLAN is not set
CONFIG_RTL8180=m
CONFIG_RTL8187=m
CONFIG_RTL8187_LEDS=y
# CONFIG_ADM8211 is not set
# CONFIG_MAC80211_HWSIM is not set
# CONFIG_MWL8K is not set
# CONFIG_ATH_COMMON is not set
# CONFIG_B43 is not set
# CONFIG_B43LEGACY is not set
CONFIG_HOSTAP=m
CONFIG_HOSTAP_FIRMWARE=y
CONFIG_HOSTAP_FIRMWARE_NVRAM=y
CONFIG_HOSTAP_PLX=m
CONFIG_HOSTAP_PCI=m
CONFIG_HOSTAP_CS=m
CONFIG_IPW2100=m
CONFIG_IPW2100_MONITOR=y
# CONFIG_IPW2100_DEBUG is not set
CONFIG_IPW2200=m
CONFIG_IPW2200_MONITOR=y
CONFIG_IPW2200_RADIOTAP=y
CONFIG_IPW2200_PROMISCUOUS=y
CONFIG_IPW2200_QOS=y
# CONFIG_IPW2200_DEBUG is not set
CONFIG_LIBIPW=m
# CONFIG_LIBIPW_DEBUG is not set
CONFIG_IWLAGN=m

#
# Debugging Options
#
# CONFIG_IWLWIFI_DEBUG is not set
# CONFIG_IWLWIFI_DEVICE_TRACING is not set
# CONFIG_IWLWIFI_DEVICE_SVTOOL is not set
# CONFIG_IWL_P2P is not set
CONFIG_IWLWIFI_LEGACY=m

#
# Debugging Options
#
# CONFIG_IWLWIFI_LEGACY_DEBUG is not set
# CONFIG_IWLWIFI_LEGACY_DEVICE_TRACING is not set
CONFIG_IWL4965=m
CONFIG_IWL3945=m
# CONFIG_IWM is not set
# CONFIG_LIBERTAS is not set
CONFIG_HERMES=m
# CONFIG_HERMES_PRISM is not set
CONFIG_HERMES_CACHE_FW_ON_INIT=y
CONFIG_PLX_HERMES=m
CONFIG_TMD_HERMES=m
CONFIG_NORTEL_HERMES=m
CONFIG_PCMCIA_HERMES=m
CONFIG_PCMCIA_SPECTRUM=m
# CONFIG_ORINOCO_USB is not set
# CONFIG_P54_COMMON is not set
CONFIG_RT2X00=m
CONFIG_RT2400PCI=m
CONFIG_RT2500PCI=m
CONFIG_RT61PCI=m
# CONFIG_RT2800PCI is not set
CONFIG_RT2500USB=m
CONFIG_RT73USB=m
# CONFIG_RT2800USB is not set
CONFIG_RT2X00_LIB_PCI=m
CONFIG_RT2X00_LIB_USB=m
CONFIG_RT2X00_LIB=m
CONFIG_RT2X00_LIB_FIRMWARE=y
CONFIG_RT2X00_LIB_CRYPTO=y
CONFIG_RT2X00_LIB_LEDS=y
# CONFIG_RT2X00_DEBUG is not set
# CONFIG_RTL8192CE is not set
# CONFIG_RTL8192SE is not set
# CONFIG_RTL8192CU is not set
# CONFIG_WL1251 is not set
# CONFIG_WL12XX_MENU is not set
CONFIG_ZD1211RW=m
# CONFIG_ZD1211RW_DEBUG is not set
# CONFIG_MWIFIEX is not set

#
# Enable WiMAX (Networking options) to see the WiMAX drivers
#

#
# USB Network Adapters
#
CONFIG_USB_CATC=m
CONFIG_USB_KAWETH=m
CONFIG_USB_PEGASUS=m
CONFIG_USB_RTL8150=m
CONFIG_USB_USBNET=m
CONFIG_USB_NET_AX8817X=m
CONFIG_USB_NET_CDCETHER=m
# CONFIG_USB_NET_CDC_EEM is not set
CONFIG_USB_NET_CDC_NCM=m
CONFIG_USB_NET_DM9601=m
# CONFIG_USB_NET_SMSC75XX is not set
# CONFIG_USB_NET_SMSC95XX is not set
CONFIG_USB_NET_GL620A=m
CONFIG_USB_NET_NET1080=m
CONFIG_USB_NET_PLUSB=m
# CONFIG_USB_NET_MCS7830 is not set
CONFIG_USB_NET_RNDIS_HOST=m
CONFIG_USB_NET_CDC_SUBSET=m
CONFIG_USB_ALI_M5632=y
CONFIG_USB_AN2720=y
CONFIG_USB_BELKIN=y
CONFIG_USB_ARMLINUX=y
CONFIG_USB_EPSON2888=y
# CONFIG_USB_KC2190 is not set
CONFIG_USB_NET_ZAURUS=m
# CONFIG_USB_NET_CX82310_ETH is not set
# CONFIG_USB_NET_KALMIA is not set
# CONFIG_USB_NET_INT51X1 is not set
# CONFIG_USB_IPHETH is not set
# CONFIG_USB_SIERRA_NET is not set
# CONFIG_USB_VL600 is not set
CONFIG_NET_PCMCIA=y
CONFIG_PCMCIA_3C589=m
CONFIG_PCMCIA_3C574=m
CONFIG_PCMCIA_FMVJ18X=m
CONFIG_PCMCIA_PCNET=m
CONFIG_PCMCIA_NMCLAN=m
CONFIG_PCMCIA_SMC91C92=m
CONFIG_PCMCIA_XIRC2PS=m
CONFIG_PCMCIA_AXNET=m
# CONFIG_PCMCIA_IBMTR is not set
# CONFIG_WAN is not set
CONFIG_ATM_DRIVERS=y
# CONFIG_ATM_DUMMY is not set
CONFIG_ATM_TCP=m
CONFIG_ATM_LANAI=m
CONFIG_ATM_ENI=m
# CONFIG_ATM_ENI_DEBUG is not set
# CONFIG_ATM_ENI_TUNE_BURST is not set
CONFIG_ATM_FIRESTREAM=m
# CONFIG_ATM_ZATM is not set
# CONFIG_ATM_NICSTAR is not set
CONFIG_ATM_IDT77252=m
# CONFIG_ATM_IDT77252_DEBUG is not set
# CONFIG_ATM_IDT77252_RCV_ALL is not set
CONFIG_ATM_IDT77252_USE_SUNI=y
CONFIG_ATM_AMBASSADOR=m
# CONFIG_ATM_AMBASSADOR_DEBUG is not set
CONFIG_ATM_HORIZON=m
# CONFIG_ATM_HORIZON_DEBUG is not set
# CONFIG_ATM_IA is not set
# CONFIG_ATM_FORE200E is not set
CONFIG_ATM_HE=m
# CONFIG_ATM_HE_USE_SUNI is not set
# CONFIG_ATM_SOLOS is not set

#
# CAIF transport drivers
#
CONFIG_FDDI=y
# CONFIG_DEFXX is not set
# CONFIG_SKFP is not set
# CONFIG_HIPPI is not set
# CONFIG_PLIP is not set
CONFIG_PPP=m
CONFIG_PPP_MULTILINK=y
CONFIG_PPP_FILTER=y
CONFIG_PPP_ASYNC=m
CONFIG_PPP_SYNC_TTY=m
CONFIG_PPP_DEFLATE=m
# CONFIG_PPP_BSDCOMP is not set
CONFIG_PPP_MPPE=m
CONFIG_PPPOE=m
CONFIG_PPPOATM=m
CONFIG_SLIP=m
CONFIG_SLIP_COMPRESSED=y
CONFIG_SLHC=m
CONFIG_SLIP_SMART=y
# CONFIG_SLIP_MODE_SLIP6 is not set
CONFIG_NET_FC=y
CONFIG_NETCONSOLE=m
# CONFIG_NETCONSOLE_DYNAMIC is not set
CONFIG_NETPOLL=y
CONFIG_NETPOLL_TRAP=y
CONFIG_NET_POLL_CONTROLLER=y
# CONFIG_VMXNET3 is not set
# CONFIG_ISDN is not set
# CONFIG_PHONE is not set

#
# Input device support
#
CONFIG_INPUT=y
CONFIG_INPUT_FF_MEMLESS=y
# CONFIG_INPUT_POLLDEV is not set
# CONFIG_INPUT_SPARSEKMAP is not set

#
# Userland interfaces
#
CONFIG_INPUT_MOUSEDEV=y
# CONFIG_INPUT_MOUSEDEV_PSAUX is not set
CONFIG_INPUT_MOUSEDEV_SCREEN_X=1024
CONFIG_INPUT_MOUSEDEV_SCREEN_Y=768
CONFIG_INPUT_JOYDEV=m
CONFIG_INPUT_EVDEV=y
# CONFIG_INPUT_EVBUG is not set

#
# Input Device Drivers
#
CONFIG_INPUT_KEYBOARD=y
# CONFIG_KEYBOARD_ADP5588 is not set
# CONFIG_KEYBOARD_ADP5589 is not set
CONFIG_KEYBOARD_ATKBD=y
# CONFIG_KEYBOARD_QT1070 is not set
# CONFIG_KEYBOARD_QT2160 is not set
# CONFIG_KEYBOARD_LKKBD is not set
# CONFIG_KEYBOARD_TCA6416 is not set
# CONFIG_KEYBOARD_LM8323 is not set
# CONFIG_KEYBOARD_MAX7359 is not set
# CONFIG_KEYBOARD_MCS is not set
# CONFIG_KEYBOARD_MPR121 is not set
# CONFIG_KEYBOARD_NEWTON is not set
# CONFIG_KEYBOARD_OPENCORES is not set
# CONFIG_KEYBOARD_STOWAWAY is not set
# CONFIG_KEYBOARD_SUNKBD is not set
# CONFIG_KEYBOARD_XTKBD is not set
CONFIG_INPUT_MOUSE=y
CONFIG_MOUSE_PS2=y
CONFIG_MOUSE_PS2_ALPS=y
CONFIG_MOUSE_PS2_LOGIPS2PP=y
CONFIG_MOUSE_PS2_SYNAPTICS=y
CONFIG_MOUSE_PS2_TRACKPOINT=y
# CONFIG_MOUSE_PS2_ELANTECH is not set
# CONFIG_MOUSE_PS2_SENTELIC is not set
# CONFIG_MOUSE_PS2_TOUCHKIT is not set
CONFIG_MOUSE_SERIAL=m
# CONFIG_MOUSE_APPLETOUCH is not set
# CONFIG_MOUSE_BCM5974 is not set
CONFIG_MOUSE_VSXXXAA=m
# CONFIG_MOUSE_SYNAPTICS_I2C is not set
CONFIG_INPUT_JOYSTICK=y
# CONFIG_JOYSTICK_ANALOG is not set
# CONFIG_JOYSTICK_A3D is not set
# CONFIG_JOYSTICK_ADI is not set
# CONFIG_JOYSTICK_COBRA is not set
# CONFIG_JOYSTICK_GF2K is not set
# CONFIG_JOYSTICK_GRIP is not set
# CONFIG_JOYSTICK_GRIP_MP is not set
# CONFIG_JOYSTICK_GUILLEMOT is not set
# CONFIG_JOYSTICK_INTERACT is not set
# CONFIG_JOYSTICK_SIDEWINDER is not set
# CONFIG_JOYSTICK_TMDC is not set
# CONFIG_JOYSTICK_IFORCE is not set
# CONFIG_JOYSTICK_WARRIOR is not set
# CONFIG_JOYSTICK_MAGELLAN is not set
# CONFIG_JOYSTICK_SPACEORB is not set
# CONFIG_JOYSTICK_SPACEBALL is not set
# CONFIG_JOYSTICK_STINGER is not set
CONFIG_JOYSTICK_TWIDJOY=m
# CONFIG_JOYSTICK_ZHENHUA is not set
# CONFIG_JOYSTICK_DB9 is not set
# CONFIG_JOYSTICK_GAMECON is not set
# CONFIG_JOYSTICK_TURBOGRAFX is not set
# CONFIG_JOYSTICK_AS5011 is not set
CONFIG_JOYSTICK_JOYDUMP=m
# CONFIG_JOYSTICK_XPAD is not set
# CONFIG_INPUT_TABLET is not set
CONFIG_INPUT_TOUCHSCREEN=y
# CONFIG_TOUCHSCREEN_AD7879 is not set
# CONFIG_TOUCHSCREEN_ATMEL_MXT is not set
# CONFIG_TOUCHSCREEN_BU21013 is not set
# CONFIG_TOUCHSCREEN_DYNAPRO is not set
# CONFIG_TOUCHSCREEN_HAMPSHIRE is not set
# CONFIG_TOUCHSCREEN_EETI is not set
# CONFIG_TOUCHSCREEN_FUJITSU is not set
CONFIG_TOUCHSCREEN_GUNZE=m
CONFIG_TOUCHSCREEN_ELO=m
# CONFIG_TOUCHSCREEN_WACOM_W8001 is not set
# CONFIG_TOUCHSCREEN_MAX11801 is not set
# CONFIG_TOUCHSCREEN_MCS5000 is not set
CONFIG_TOUCHSCREEN_MTOUCH=m
# CONFIG_TOUCHSCREEN_INEXIO is not set
CONFIG_TOUCHSCREEN_MK712=m
# CONFIG_TOUCHSCREEN_PENMOUNT is not set
# CONFIG_TOUCHSCREEN_TOUCHRIGHT is not set
# CONFIG_TOUCHSCREEN_TOUCHWIN is not set
# CONFIG_TOUCHSCREEN_USB_COMPOSITE is not set
# CONFIG_TOUCHSCREEN_TOUCHIT213 is not set
# CONFIG_TOUCHSCREEN_TSC2007 is not set
# CONFIG_TOUCHSCREEN_ST1232 is not set
# CONFIG_TOUCHSCREEN_TPS6507X is not set
CONFIG_INPUT_MISC=y
# CONFIG_INPUT_AD714X is not set
# CONFIG_INPUT_ATI_REMOTE is not set
# CONFIG_INPUT_ATI_REMOTE2 is not set
# CONFIG_INPUT_KEYSPAN_REMOTE is not set
# CONFIG_INPUT_POWERMATE is not set
# CONFIG_INPUT_YEALINK is not set
# CONFIG_INPUT_CM109 is not set
CONFIG_INPUT_UINPUT=m
# CONFIG_INPUT_PCF8574 is not set
# CONFIG_INPUT_ADXL34X is not set
# CONFIG_INPUT_CMA3000 is not set

#
# Hardware I/O ports
#
CONFIG_SERIO=y
CONFIG_SERIO_I8042=y
CONFIG_SERIO_SERPORT=y
# CONFIG_SERIO_PARKBD is not set
# CONFIG_SERIO_PCIPS2 is not set
CONFIG_SERIO_LIBPS2=y
CONFIG_SERIO_RAW=m
# CONFIG_SERIO_XILINX_XPS_PS2 is not set
# CONFIG_SERIO_ALTERA_PS2 is not set
# CONFIG_SERIO_PS2MULT is not set
CONFIG_GAMEPORT=m
CONFIG_GAMEPORT_NS558=m
CONFIG_GAMEPORT_L4=m
CONFIG_GAMEPORT_EMU10K1=m
CONFIG_GAMEPORT_FM801=m

#
# Character devices
#
CONFIG_VT=y
CONFIG_CONSOLE_TRANSLATIONS=y
CONFIG_VT_CONSOLE=y
CONFIG_HW_CONSOLE=y
CONFIG_VT_HW_CONSOLE_BINDING=y
CONFIG_UNIX98_PTYS=y
# CONFIG_DEVPTS_MULTIPLE_INSTANCES is not set
# CONFIG_LEGACY_PTYS is not set
CONFIG_SERIAL_NONSTANDARD=y
# CONFIG_ROCKETPORT is not set
CONFIG_CYCLADES=m
# CONFIG_CYZ_INTR is not set
# CONFIG_MOXA_INTELLIO is not set
# CONFIG_MOXA_SMARTIO is not set
CONFIG_SYNCLINK=m
CONFIG_SYNCLINKMP=m
CONFIG_SYNCLINK_GT=m
# CONFIG_NOZOMI is not set
# CONFIG_ISI is not set
CONFIG_N_HDLC=m
# CONFIG_N_GSM is not set
# CONFIG_TRACE_SINK is not set
CONFIG_DEVKMEM=y
# CONFIG_STALDRV is not set

#
# Serial drivers
#
CONFIG_SERIAL_8250=y
CONFIG_SERIAL_8250_CONSOLE=y
CONFIG_SERIAL_8250_PCI=y
CONFIG_SERIAL_8250_CS=m
CONFIG_SERIAL_8250_NR_UARTS=32
CONFIG_SERIAL_8250_RUNTIME_UARTS=4
CONFIG_SERIAL_8250_EXTENDED=y
CONFIG_SERIAL_8250_MANY_PORTS=y
CONFIG_SERIAL_8250_SHARE_IRQ=y
CONFIG_SERIAL_8250_DETECT_IRQ=y
CONFIG_SERIAL_8250_RSA=y

#
# Non-8250 serial port support
#
# CONFIG_SERIAL_MFD_HSU is not set
# CONFIG_SERIAL_UARTLITE is not set
CONFIG_SERIAL_CORE=y
CONFIG_SERIAL_CORE_CONSOLE=y
CONFIG_SERIAL_JSM=m
# CONFIG_SERIAL_OF_PLATFORM is not set
# CONFIG_SERIAL_TIMBERDALE is not set
# CONFIG_SERIAL_ALTERA_JTAGUART is not set
# CONFIG_SERIAL_ALTERA_UART is not set
# CONFIG_SERIAL_PCH_UART is not set
# CONFIG_SERIAL_XILINX_PS_UART is not set
CONFIG_PRINTER=m
CONFIG_LP_CONSOLE=y
CONFIG_PPDEV=m
# CONFIG_HVC_UDBG is not set
CONFIG_IPMI_HANDLER=m
CONFIG_IPMI_PANIC_EVENT=y
CONFIG_IPMI_PANIC_STRING=y
CONFIG_IPMI_DEVICE_INTERFACE=m
CONFIG_IPMI_SI=m
CONFIG_IPMI_WATCHDOG=m
CONFIG_IPMI_POWEROFF=m
CONFIG_HW_RANDOM=y
# CONFIG_HW_RANDOM_TIMERIOMEM is not set
CONFIG_NVRAM=y
# CONFIG_GEN_RTC is not set
# CONFIG_R3964 is not set
# CONFIG_APPLICOM is not set

#
# PCMCIA character devices
#
# CONFIG_SYNCLINK_CS is not set
CONFIG_CARDMAN_4000=m
CONFIG_CARDMAN_4040=m
# CONFIG_IPWIRELESS is not set
CONFIG_RAW_DRIVER=y
CONFIG_MAX_RAW_DEVS=8192
CONFIG_TCG_TPM=m
CONFIG_TCG_TIS=m
CONFIG_TCG_NSC=m
CONFIG_TCG_ATMEL=m
CONFIG_DEVPORT=y
# CONFIG_RAMOOPS is not set
CONFIG_I2C=m
CONFIG_I2C_BOARDINFO=y
CONFIG_I2C_COMPAT=y
CONFIG_I2C_CHARDEV=m
# CONFIG_I2C_MUX is not set
CONFIG_I2C_HELPER_AUTO=y
CONFIG_I2C_SMBUS=m
CONFIG_I2C_ALGOBIT=m

#
# I2C Hardware Bus support
#

#
# PC SMBus host controller drivers
#
# CONFIG_I2C_ALI1535 is not set
# CONFIG_I2C_ALI1563 is not set
# CONFIG_I2C_ALI15X3 is not set
CONFIG_I2C_AMD756=m
CONFIG_I2C_AMD8111=m
CONFIG_I2C_I801=m
# CONFIG_I2C_ISCH is not set
CONFIG_I2C_PIIX4=m
CONFIG_I2C_NFORCE2=m
# CONFIG_I2C_SIS5595 is not set
# CONFIG_I2C_SIS630 is not set
CONFIG_I2C_SIS96X=m
CONFIG_I2C_VIA=m
CONFIG_I2C_VIAPRO=m

#
# I2C system bus drivers (mostly embedded / system-on-chip)
#
# CONFIG_I2C_INTEL_MID is not set
# CONFIG_I2C_MPC is not set
# CONFIG_I2C_OCORES is not set
# CONFIG_I2C_PCA_PLATFORM is not set
# CONFIG_I2C_PXA_PCI is not set
# CONFIG_I2C_SIMTEC is not set
# CONFIG_I2C_XILINX is not set
# CONFIG_I2C_EG20T is not set

#
# External I2C/SMBus adapter drivers
#
# CONFIG_I2C_DIOLAN_U2C is not set
CONFIG_I2C_PARPORT=m
CONFIG_I2C_PARPORT_LIGHT=m
# CONFIG_I2C_TAOS_EVM is not set
# CONFIG_I2C_TINY_USB is not set

#
# Other I2C/SMBus bus drivers
#
CONFIG_I2C_STUB=m
# CONFIG_I2C_DEBUG_CORE is not set
# CONFIG_I2C_DEBUG_ALGO is not set
# CONFIG_I2C_DEBUG_BUS is not set
# CONFIG_SPI is not set

#
# PPS support
#
# CONFIG_PPS is not set

#
# PPS generators support
#

#
# PTP clock support
#

#
# Enable Device Drivers -> PPS to see the PTP clock options.
#
CONFIG_ARCH_WANT_OPTIONAL_GPIOLIB=y
# CONFIG_GPIOLIB is not set
# CONFIG_W1 is not set
CONFIG_POWER_SUPPLY=m
# CONFIG_POWER_SUPPLY_DEBUG is not set
# CONFIG_PDA_POWER is not set
# CONFIG_TEST_POWER is not set
# CONFIG_BATTERY_DS2780 is not set
# CONFIG_BATTERY_DS2782 is not set
# CONFIG_BATTERY_BQ20Z75 is not set
# CONFIG_BATTERY_BQ27x00 is not set
# CONFIG_BATTERY_MAX17040 is not set
# CONFIG_BATTERY_MAX17042 is not set
# CONFIG_CHARGER_MAX8903 is not set
CONFIG_HWMON=m
CONFIG_HWMON_VID=m
# CONFIG_HWMON_DEBUG_CHIP is not set

#
# Native drivers
#
# CONFIG_SENSORS_AD7414 is not set
# CONFIG_SENSORS_AD7418 is not set
CONFIG_SENSORS_ADM1021=m
CONFIG_SENSORS_ADM1025=m
CONFIG_SENSORS_ADM1026=m
# CONFIG_SENSORS_ADM1029 is not set
CONFIG_SENSORS_ADM1031=m
CONFIG_SENSORS_ADM9240=m
# CONFIG_SENSORS_ADT7411 is not set
# CONFIG_SENSORS_ADT7462 is not set
# CONFIG_SENSORS_ADT7470 is not set
# CONFIG_SENSORS_ADT7475 is not set
# CONFIG_SENSORS_ASC7621 is not set
CONFIG_SENSORS_ATXP1=m
# CONFIG_SENSORS_DS620 is not set
CONFIG_SENSORS_DS1621=m
# CONFIG_SENSORS_I5K_AMB is not set
CONFIG_SENSORS_F71805F=m
# CONFIG_SENSORS_F71882FG is not set
# CONFIG_SENSORS_F75375S is not set
# CONFIG_SENSORS_G760A is not set
CONFIG_SENSORS_GL518SM=m
CONFIG_SENSORS_GL520SM=m
# CONFIG_SENSORS_IBMAEM is not set
# CONFIG_SENSORS_IBMPEX is not set
CONFIG_SENSORS_IT87=m
# CONFIG_SENSORS_JC42 is not set
# CONFIG_SENSORS_LINEAGE is not set
CONFIG_SENSORS_LM63=m
# CONFIG_SENSORS_LM73 is not set
CONFIG_SENSORS_LM75=m
CONFIG_SENSORS_LM77=m
CONFIG_SENSORS_LM78=m
CONFIG_SENSORS_LM80=m
CONFIG_SENSORS_LM83=m
CONFIG_SENSORS_LM85=m
CONFIG_SENSORS_LM87=m
CONFIG_SENSORS_LM90=m
CONFIG_SENSORS_LM92=m
# CONFIG_SENSORS_LM93 is not set
# CONFIG_SENSORS_LTC4151 is not set
# CONFIG_SENSORS_LTC4215 is not set
# CONFIG_SENSORS_LTC4245 is not set
# CONFIG_SENSORS_LTC4261 is not set
# CONFIG_SENSORS_LM95241 is not set
# CONFIG_SENSORS_MAX16065 is not set
CONFIG_SENSORS_MAX1619=m
# CONFIG_SENSORS_MAX6639 is not set
# CONFIG_SENSORS_MAX6642 is not set
# CONFIG_SENSORS_MAX6650 is not set
CONFIG_SENSORS_PC87360=m
# CONFIG_SENSORS_PC87427 is not set
CONFIG_SENSORS_PCF8591=m
# CONFIG_PMBUS is not set
# CONFIG_SENSORS_SHT21 is not set
CONFIG_SENSORS_SIS5595=m
# CONFIG_SENSORS_SMM665 is not set
# CONFIG_SENSORS_DME1737 is not set
# CONFIG_SENSORS_EMC1403 is not set
# CONFIG_SENSORS_EMC2103 is not set
# CONFIG_SENSORS_EMC6W201 is not set
CONFIG_SENSORS_SMSC47M1=m
CONFIG_SENSORS_SMSC47M192=m
CONFIG_SENSORS_SMSC47B397=m
# CONFIG_SENSORS_SCH5627 is not set
# CONFIG_SENSORS_ADS1015 is not set
# CONFIG_SENSORS_ADS7828 is not set
# CONFIG_SENSORS_AMC6821 is not set
# CONFIG_SENSORS_THMC50 is not set
# CONFIG_SENSORS_TMP102 is not set
# CONFIG_SENSORS_TMP401 is not set
# CONFIG_SENSORS_TMP421 is not set
CONFIG_SENSORS_VIA686A=m
# CONFIG_SENSORS_VT1211 is not set
CONFIG_SENSORS_VT8231=m
CONFIG_SENSORS_W83781D=m
CONFIG_SENSORS_W83791D=m
CONFIG_SENSORS_W83792D=m
# CONFIG_SENSORS_W83793 is not set
# CONFIG_SENSORS_W83795 is not set
CONFIG_SENSORS_W83L785TS=m
# CONFIG_SENSORS_W83L786NG is not set
CONFIG_SENSORS_W83627HF=m
CONFIG_SENSORS_W83627EHF=m
# CONFIG_THERMAL is not set
CONFIG_WATCHDOG=y
# CONFIG_WATCHDOG_NOWAYOUT is not set

#
# Watchdog Device Drivers
#
CONFIG_SOFT_WATCHDOG=m
CONFIG_ALIM7101_WDT=m
# CONFIG_BOOKE_WDT is not set

#
# PCI-based Watchdog Cards
#
CONFIG_PCIPCWATCHDOG=m
CONFIG_WDTPCI=m

#
# USB-based Watchdog Cards
#
CONFIG_USBPCWATCHDOG=m
CONFIG_SSB_POSSIBLE=y

#
# Sonics Silicon Backplane
#
CONFIG_SSB=m
CONFIG_SSB_SPROM=y
CONFIG_SSB_PCIHOST_POSSIBLE=y
CONFIG_SSB_PCIHOST=y
# CONFIG_SSB_B43_PCI_BRIDGE is not set
CONFIG_SSB_PCMCIAHOST_POSSIBLE=y
# CONFIG_SSB_PCMCIAHOST is not set
CONFIG_SSB_SDIOHOST_POSSIBLE=y
# CONFIG_SSB_SDIOHOST is not set
# CONFIG_SSB_DEBUG is not set
CONFIG_SSB_DRIVER_PCICORE_POSSIBLE=y
CONFIG_SSB_DRIVER_PCICORE=y
CONFIG_BCMA_POSSIBLE=y

#
# Broadcom specific AMBA
#
# CONFIG_BCMA is not set
CONFIG_MFD_SUPPORT=y
# CONFIG_MFD_CORE is not set
# CONFIG_MFD_SM501 is not set
# CONFIG_HTC_PASIC3 is not set
# CONFIG_TPS6105X is not set
# CONFIG_TPS6507X is not set
# CONFIG_MFD_TMIO is not set
# CONFIG_MFD_WM8400 is not set
# CONFIG_MFD_PCF50633 is not set
# CONFIG_ABX500_CORE is not set
# CONFIG_LPC_SCH is not set
# CONFIG_MFD_RDC321X is not set
# CONFIG_MFD_JANZ_CMODIO is not set
# CONFIG_MFD_VX855 is not set
# CONFIG_MFD_WL1273_CORE is not set
# CONFIG_REGULATOR is not set
# CONFIG_MEDIA_SUPPORT is not set

#
# Graphics support
#
CONFIG_AGP=y
CONFIG_VGA_ARB=y
CONFIG_VGA_ARB_MAX_GPUS=16
CONFIG_DRM=m
CONFIG_DRM_KMS_HELPER=m
CONFIG_DRM_TTM=m
# CONFIG_DRM_TDFX is not set
CONFIG_DRM_R128=m
CONFIG_DRM_RADEON=m
# CONFIG_DRM_RADEON_KMS is not set
CONFIG_DRM_MGA=m
# CONFIG_DRM_SIS is not set
CONFIG_DRM_VIA=m
CONFIG_DRM_SAVAGE=m
# CONFIG_STUB_POULSBO is not set
CONFIG_VGASTATE=m
# CONFIG_VIDEO_OUTPUT_CONTROL is not set
CONFIG_FB=y
# CONFIG_FIRMWARE_EDID is not set
CONFIG_FB_DDC=m
# CONFIG_FB_BOOT_VESA_SUPPORT is not set
CONFIG_FB_CFB_FILLRECT=m
CONFIG_FB_CFB_COPYAREA=m
CONFIG_FB_CFB_IMAGEBLIT=m
# CONFIG_FB_CFB_REV_PIXELS_IN_BYTE is not set
# CONFIG_FB_SYS_FILLRECT is not set
# CONFIG_FB_SYS_COPYAREA is not set
# CONFIG_FB_SYS_IMAGEBLIT is not set
# CONFIG_FB_FOREIGN_ENDIAN is not set
# CONFIG_FB_SYS_FOPS is not set
# CONFIG_FB_WMT_GE_ROPS is not set
# CONFIG_FB_SVGALIB is not set
# CONFIG_FB_MACMODES is not set
CONFIG_FB_BACKLIGHT=y
CONFIG_FB_MODE_HELPERS=y
CONFIG_FB_TILEBLITTING=y

#
# Frame buffer hardware drivers
#
CONFIG_FB_CIRRUS=m
# CONFIG_FB_PM2 is not set
# CONFIG_FB_CYBER2000 is not set
# CONFIG_FB_OF is not set
# CONFIG_FB_CT65550 is not set
# CONFIG_FB_ASILIANT is not set
# CONFIG_FB_IMSTT is not set
CONFIG_FB_VGA16=m
# CONFIG_FB_UVESA is not set
# CONFIG_FB_S1D13XXX is not set
CONFIG_FB_NVIDIA=m
CONFIG_FB_NVIDIA_I2C=y
# CONFIG_FB_NVIDIA_DEBUG is not set
CONFIG_FB_NVIDIA_BACKLIGHT=y
CONFIG_FB_RIVA=m
# CONFIG_FB_RIVA_I2C is not set
# CONFIG_FB_RIVA_DEBUG is not set
CONFIG_FB_RIVA_BACKLIGHT=y
# CONFIG_FB_MATROX is not set
# CONFIG_FB_RADEON is not set
# CONFIG_FB_ATY128 is not set
# CONFIG_FB_ATY is not set
# CONFIG_FB_S3 is not set
CONFIG_FB_SAVAGE=m
CONFIG_FB_SAVAGE_I2C=y
CONFIG_FB_SAVAGE_ACCEL=y
# CONFIG_FB_SIS is not set
# CONFIG_FB_NEOMAGIC is not set
CONFIG_FB_KYRO=m
# CONFIG_FB_3DFX is not set
# CONFIG_FB_VOODOO1 is not set
# CONFIG_FB_VT8623 is not set
# CONFIG_FB_TRIDENT is not set
# CONFIG_FB_ARK is not set
# CONFIG_FB_PM3 is not set
# CONFIG_FB_CARMINE is not set
# CONFIG_FB_FSL_DIU is not set
# CONFIG_FB_UDL is not set
# CONFIG_FB_IBM_GXT4500 is not set
# CONFIG_FB_VIRTUAL is not set
# CONFIG_FB_METRONOME is not set
# CONFIG_FB_MB862XX is not set
# CONFIG_FB_BROADSHEET is not set
CONFIG_BACKLIGHT_LCD_SUPPORT=y
CONFIG_LCD_CLASS_DEVICE=m
# CONFIG_LCD_PLATFORM is not set
CONFIG_BACKLIGHT_CLASS_DEVICE=y
CONFIG_BACKLIGHT_GENERIC=y
# CONFIG_BACKLIGHT_ADP8860 is not set
# CONFIG_BACKLIGHT_ADP8870 is not set

#
# Display device support
#
# CONFIG_DISPLAY_SUPPORT is not set

#
# Console display driver support
#
CONFIG_VGA_CONSOLE=y
CONFIG_VGACON_SOFT_SCROLLBACK=y
CONFIG_VGACON_SOFT_SCROLLBACK_SIZE=64
CONFIG_DUMMY_CONSOLE=y
CONFIG_FRAMEBUFFER_CONSOLE=y
CONFIG_FRAMEBUFFER_CONSOLE_DETECT_PRIMARY=y
CONFIG_FRAMEBUFFER_CONSOLE_ROTATION=y
# CONFIG_FONTS is not set
CONFIG_FONT_8x8=y
CONFIG_FONT_8x16=y
CONFIG_LOGO=y
# CONFIG_LOGO_LINUX_MONO is not set
# CONFIG_LOGO_LINUX_VGA16 is not set
CONFIG_LOGO_LINUX_CLUT224=y
# CONFIG_SOUND is not set
# CONFIG_HID_SUPPORT is not set
CONFIG_USB_SUPPORT=y
CONFIG_USB_ARCH_HAS_HCD=y
CONFIG_USB_ARCH_HAS_OHCI=y
CONFIG_USB_ARCH_HAS_EHCI=y
CONFIG_USB=y
# CONFIG_USB_DEBUG is not set
# CONFIG_USB_ANNOUNCE_NEW_DEVICES is not set

#
# Miscellaneous USB options
#
CONFIG_USB_DEVICEFS=y
CONFIG_USB_DEVICE_CLASS=y
# CONFIG_USB_DYNAMIC_MINORS is not set
CONFIG_USB_MON=y
# CONFIG_USB_WUSB is not set
# CONFIG_USB_WUSB_CBAF is not set

#
# USB Host Controller Drivers
#
# CONFIG_USB_C67X00_HCD is not set
# CONFIG_USB_XHCI_HCD is not set
CONFIG_USB_EHCI_HCD=m
CONFIG_USB_EHCI_ROOT_HUB_TT=y
CONFIG_USB_EHCI_TT_NEWSCHED=y
# CONFIG_XPS_USB_HCD_XILINX is not set
# CONFIG_USB_EHCI_FSL is not set
CONFIG_USB_EHCI_HCD_PPC_OF=y
# CONFIG_USB_OXU210HP_HCD is not set
CONFIG_USB_ISP116X_HCD=m
# CONFIG_USB_ISP1760_HCD is not set
# CONFIG_USB_ISP1362_HCD is not set
CONFIG_USB_OHCI_HCD=m
# CONFIG_USB_OHCI_HCD_PPC_OF_BE is not set
# CONFIG_USB_OHCI_HCD_PPC_OF_LE is not set
# CONFIG_USB_OHCI_HCD_PPC_OF is not set
# CONFIG_USB_OHCI_HCD_SSB is not set
# CONFIG_USB_OHCI_BIG_ENDIAN_DESC is not set
# CONFIG_USB_OHCI_BIG_ENDIAN_MMIO is not set
CONFIG_USB_OHCI_LITTLE_ENDIAN=y
CONFIG_USB_UHCI_HCD=m
CONFIG_USB_SL811_HCD=m
# CONFIG_USB_SL811_HCD_ISO is not set
CONFIG_USB_SL811_CS=m
# CONFIG_USB_R8A66597_HCD is not set
# CONFIG_USB_WHCI_HCD is not set
# CONFIG_USB_HWA_HCD is not set

#
# USB Device Class drivers
#
CONFIG_USB_ACM=m
CONFIG_USB_PRINTER=m
# CONFIG_USB_WDM is not set
# CONFIG_USB_TMC is not set

#
# NOTE: USB_STORAGE depends on SCSI but BLK_DEV_SD may
#

#
# also be needed; see USB_STORAGE Help for more info
#
CONFIG_USB_STORAGE=m
# CONFIG_USB_STORAGE_DEBUG is not set
# CONFIG_USB_STORAGE_REALTEK is not set
CONFIG_USB_STORAGE_DATAFAB=m
CONFIG_USB_STORAGE_FREECOM=m
CONFIG_USB_STORAGE_ISD200=m
CONFIG_USB_STORAGE_USBAT=m
CONFIG_USB_STORAGE_SDDR09=m
CONFIG_USB_STORAGE_SDDR55=m
CONFIG_USB_STORAGE_JUMPSHOT=m
CONFIG_USB_STORAGE_ALAUDA=m
# CONFIG_USB_STORAGE_ONETOUCH is not set
# CONFIG_USB_STORAGE_KARMA is not set
# CONFIG_USB_STORAGE_CYPRESS_ATACB is not set
# CONFIG_USB_STORAGE_ENE_UB6250 is not set
# CONFIG_USB_UAS is not set
# CONFIG_USB_LIBUSUAL is not set

#
# USB Imaging devices
#
CONFIG_USB_MDC800=m
CONFIG_USB_MICROTEK=m

#
# USB port drivers
#
CONFIG_USB_USS720=m
# CONFIG_USB_SERIAL is not set

#
# USB Miscellaneous drivers
#
CONFIG_USB_EMI62=m
CONFIG_USB_EMI26=m
# CONFIG_USB_ADUTUX is not set
# CONFIG_USB_SEVSEG is not set
CONFIG_USB_RIO500=m
CONFIG_USB_LEGOTOWER=m
CONFIG_USB_LCD=m
CONFIG_USB_LED=m
# CONFIG_USB_CYPRESS_CY7C63 is not set
# CONFIG_USB_CYTHERM is not set
CONFIG_USB_IDMOUSE=m
# CONFIG_USB_FTDI_ELAN is not set
CONFIG_USB_APPLEDISPLAY=m
CONFIG_USB_SISUSBVGA=m
CONFIG_USB_SISUSBVGA_CON=y
CONFIG_USB_LD=m
# CONFIG_USB_TRANCEVIBRATOR is not set
# CONFIG_USB_IOWARRIOR is not set
CONFIG_USB_TEST=m
# CONFIG_USB_ISIGHTFW is not set
# CONFIG_USB_YUREX is not set
CONFIG_USB_ATM=m
CONFIG_USB_SPEEDTOUCH=m
CONFIG_USB_CXACRU=m
CONFIG_USB_UEAGLEATM=m
CONFIG_USB_XUSBATM=m
# CONFIG_USB_GADGET is not set

#
# OTG and related infrastructure
#
# CONFIG_NOP_USB_XCEIV is not set
# CONFIG_UWB is not set
CONFIG_MMC=m
# CONFIG_MMC_DEBUG is not set
# CONFIG_MMC_UNSAFE_RESUME is not set
# CONFIG_MMC_CLKGATE is not set

#
# MMC/SD/SDIO Card Drivers
#
CONFIG_MMC_BLOCK=m
CONFIG_MMC_BLOCK_MINORS=8
CONFIG_MMC_BLOCK_BOUNCE=y
# CONFIG_SDIO_UART is not set
# CONFIG_MMC_TEST is not set

#
# MMC/SD/SDIO Host Controller Drivers
#
CONFIG_MMC_SDHCI=m
# CONFIG_MMC_SDHCI_PCI is not set
# CONFIG_MMC_SDHCI_OF is not set
# CONFIG_MMC_SDHCI_PLTFM is not set
CONFIG_MMC_WBSD=m
# CONFIG_MMC_TIFM_SD is not set
# CONFIG_MMC_SDRICOH_CS is not set
# CONFIG_MMC_CB710 is not set
# CONFIG_MMC_VIA_SDMMC is not set
# CONFIG_MMC_VUB300 is not set
# CONFIG_MMC_USHC is not set
# CONFIG_MEMSTICK is not set
CONFIG_NEW_LEDS=y
CONFIG_LEDS_CLASS=y

#
# LED drivers
#
# CONFIG_LEDS_LM3530 is not set
# CONFIG_LEDS_PCA9532 is not set
# CONFIG_LEDS_LP3944 is not set
# CONFIG_LEDS_LP5521 is not set
# CONFIG_LEDS_LP5523 is not set
# CONFIG_LEDS_PCA955X is not set
# CONFIG_LEDS_BD2802 is not set
CONFIG_LEDS_TRIGGERS=y

#
# LED Triggers
#
CONFIG_LEDS_TRIGGER_TIMER=m
CONFIG_LEDS_TRIGGER_IDE_DISK=y
CONFIG_LEDS_TRIGGER_HEARTBEAT=m
# CONFIG_LEDS_TRIGGER_BACKLIGHT is not set
# CONFIG_LEDS_TRIGGER_DEFAULT_ON is not set

#
# iptables trigger is under Netfilter config (LED target)
#
# CONFIG_NFC_DEVICES is not set
# CONFIG_ACCESSIBILITY is not set
CONFIG_INFINIBAND=m
CONFIG_INFINIBAND_USER_MAD=m
CONFIG_INFINIBAND_USER_ACCESS=m
CONFIG_INFINIBAND_USER_MEM=y
CONFIG_INFINIBAND_ADDR_TRANS=y
CONFIG_INFINIBAND_MTHCA=m
CONFIG_INFINIBAND_MTHCA_DEBUG=y
CONFIG_INFINIBAND_AMSO1100=m
# CONFIG_INFINIBAND_AMSO1100_DEBUG is not set
CONFIG_INFINIBAND_CXGB3=m
# CONFIG_INFINIBAND_CXGB3_DEBUG is not set
CONFIG_MLX4_INFINIBAND=m
CONFIG_INFINIBAND_NES=m
# CONFIG_INFINIBAND_NES_DEBUG is not set
CONFIG_INFINIBAND_IPOIB=m
CONFIG_INFINIBAND_IPOIB_CM=y
CONFIG_INFINIBAND_IPOIB_DEBUG=y
# CONFIG_INFINIBAND_IPOIB_DEBUG_DATA is not set
CONFIG_INFINIBAND_SRP=m
CONFIG_INFINIBAND_ISER=m
CONFIG_EDAC=y

#
# Reporting subsystems
#
# CONFIG_EDAC_DEBUG is not set
CONFIG_EDAC_MM_EDAC=m
# CONFIG_EDAC_MPC85XX is not set
# CONFIG_RTC_CLASS is not set
# CONFIG_DMADEVICES is not set
# CONFIG_AUXDISPLAY is not set
# CONFIG_UIO is not set
# CONFIG_STAGING is not set

#
# File systems
#
CONFIG_EXT2_FS=y
CONFIG_EXT2_FS_XATTR=y
CONFIG_EXT2_FS_POSIX_ACL=y
CONFIG_EXT2_FS_SECURITY=y
CONFIG_EXT2_FS_XIP=y
CONFIG_EXT3_FS=m
CONFIG_EXT3_DEFAULTS_TO_ORDERED=y
CONFIG_EXT3_FS_XATTR=y
CONFIG_EXT3_FS_POSIX_ACL=y
CONFIG_EXT3_FS_SECURITY=y
# CONFIG_EXT4_FS is not set
CONFIG_FS_XIP=y
CONFIG_JBD=m
# CONFIG_JBD_DEBUG is not set
CONFIG_FS_MBCACHE=y
# CONFIG_REISERFS_FS is not set
# CONFIG_JFS_FS is not set
# CONFIG_XFS_FS is not set
CONFIG_GFS2_FS=m
# CONFIG_GFS2_FS_LOCKING_DLM is not set
# CONFIG_OCFS2_FS is not set
# CONFIG_BTRFS_FS is not set
# CONFIG_NILFS2_FS is not set
CONFIG_FS_POSIX_ACL=y
CONFIG_EXPORTFS=m
CONFIG_FILE_LOCKING=y
CONFIG_FSNOTIFY=y
CONFIG_DNOTIFY=y
CONFIG_INOTIFY_USER=y
# CONFIG_FANOTIFY is not set
CONFIG_QUOTA=y
# CONFIG_QUOTA_NETLINK_INTERFACE is not set
CONFIG_PRINT_QUOTA_WARNING=y
# CONFIG_QUOTA_DEBUG is not set
CONFIG_QUOTA_TREE=y
# CONFIG_QFMT_V1 is not set
CONFIG_QFMT_V2=y
CONFIG_QUOTACTL=y
CONFIG_AUTOFS4_FS=m
# CONFIG_FUSE_FS is not set

#
# Caches
#
CONFIG_FSCACHE=m
# CONFIG_FSCACHE_STATS is not set
# CONFIG_FSCACHE_HISTOGRAM is not set
# CONFIG_FSCACHE_DEBUG is not set
# CONFIG_FSCACHE_OBJECT_LIST is not set
CONFIG_CACHEFILES=m
CONFIG_CACHEFILES_DEBUG=y
# CONFIG_CACHEFILES_HISTOGRAM is not set

#
# CD-ROM/DVD Filesystems
#
CONFIG_ISO9660_FS=y
CONFIG_JOLIET=y
CONFIG_ZISOFS=y
CONFIG_UDF_FS=m
CONFIG_UDF_NLS=y

#
# DOS/FAT/NT Filesystems
#
CONFIG_FAT_FS=m
CONFIG_MSDOS_FS=m
CONFIG_VFAT_FS=m
CONFIG_FAT_DEFAULT_CODEPAGE=437
CONFIG_FAT_DEFAULT_IOCHARSET="ascii"
# CONFIG_NTFS_FS is not set

#
# Pseudo filesystems
#
CONFIG_PROC_FS=y
CONFIG_PROC_KCORE=y
CONFIG_PROC_VMCORE=y
CONFIG_PROC_SYSCTL=y
CONFIG_PROC_PAGE_MONITOR=y
CONFIG_SYSFS=y
CONFIG_TMPFS=y
# CONFIG_TMPFS_POSIX_ACL is not set
# CONFIG_TMPFS_XATTR is not set
# CONFIG_HUGETLB_PAGE is not set
CONFIG_CONFIGFS_FS=m
CONFIG_MISC_FILESYSTEMS=y
# CONFIG_ADFS_FS is not set
# CONFIG_AFFS_FS is not set
CONFIG_ECRYPT_FS=m
CONFIG_HFS_FS=m
CONFIG_HFSPLUS_FS=m
# CONFIG_BEFS_FS is not set
# CONFIG_BFS_FS is not set
# CONFIG_EFS_FS is not set
CONFIG_JFFS2_FS=m
CONFIG_JFFS2_FS_DEBUG=0
CONFIG_JFFS2_FS_WRITEBUFFER=y
# CONFIG_JFFS2_FS_WBUF_VERIFY is not set
CONFIG_JFFS2_SUMMARY=y
# CONFIG_JFFS2_FS_XATTR is not set
# CONFIG_JFFS2_COMPRESSION_OPTIONS is not set
CONFIG_JFFS2_ZLIB=y
# CONFIG_JFFS2_LZO is not set
CONFIG_JFFS2_RTIME=y
# CONFIG_JFFS2_RUBIN is not set
# CONFIG_LOGFS is not set
CONFIG_CRAMFS=m
CONFIG_SQUASHFS=m
# CONFIG_SQUASHFS_XATTR is not set
# CONFIG_SQUASHFS_LZO is not set
# CONFIG_SQUASHFS_XZ is not set
# CONFIG_SQUASHFS_EMBEDDED is not set
CONFIG_SQUASHFS_FRAGMENT_CACHE_SIZE=3
CONFIG_VXFS_FS=m
# CONFIG_MINIX_FS is not set
# CONFIG_OMFS_FS is not set
# CONFIG_HPFS_FS is not set
# CONFIG_QNX4FS_FS is not set
# CONFIG_ROMFS_FS is not set
# CONFIG_PSTORE is not set
# CONFIG_SYSV_FS is not set
# CONFIG_UFS_FS is not set
CONFIG_NETWORK_FILESYSTEMS=y
CONFIG_NFS_FS=y
CONFIG_NFS_V3=y
CONFIG_NFS_V3_ACL=y
CONFIG_NFS_V4=y
# CONFIG_NFS_V4_1 is not set
CONFIG_ROOT_NFS=y
# CONFIG_NFS_USE_LEGACY_DNS is not set
CONFIG_NFS_USE_KERNEL_DNS=y
# CONFIG_NFS_USE_NEW_IDMAPPER is not set
CONFIG_NFSD=m
CONFIG_NFSD_DEPRECATED=y
CONFIG_NFSD_V2_ACL=y
CONFIG_NFSD_V3=y
CONFIG_NFSD_V3_ACL=y
CONFIG_NFSD_V4=y
CONFIG_LOCKD=y
CONFIG_LOCKD_V4=y
CONFIG_NFS_ACL_SUPPORT=y
CONFIG_NFS_COMMON=y
CONFIG_SUNRPC=y
CONFIG_SUNRPC_GSS=y
CONFIG_SUNRPC_XPRT_RDMA=m
# CONFIG_CEPH_FS is not set
CONFIG_CIFS=m
# CONFIG_CIFS_STATS is not set
CONFIG_CIFS_WEAK_PW_HASH=y
CONFIG_CIFS_UPCALL=y
CONFIG_CIFS_XATTR=y
CONFIG_CIFS_POSIX=y
# CONFIG_CIFS_DEBUG2 is not set
CONFIG_CIFS_DFS_UPCALL=y
# CONFIG_CIFS_FSCACHE is not set
# CONFIG_CIFS_ACL is not set
# CONFIG_NCP_FS is not set
# CONFIG_CODA_FS is not set
# CONFIG_AFS_FS is not set

#
# Partition Types
#
CONFIG_PARTITION_ADVANCED=y
# CONFIG_ACORN_PARTITION is not set
CONFIG_OSF_PARTITION=y
CONFIG_AMIGA_PARTITION=y
# CONFIG_ATARI_PARTITION is not set
CONFIG_MAC_PARTITION=y
CONFIG_MSDOS_PARTITION=y
CONFIG_BSD_DISKLABEL=y
CONFIG_MINIX_SUBPARTITION=y
CONFIG_SOLARIS_X86_PARTITION=y
CONFIG_UNIXWARE_DISKLABEL=y
# CONFIG_LDM_PARTITION is not set
CONFIG_SGI_PARTITION=y
# CONFIG_ULTRIX_PARTITION is not set
CONFIG_SUN_PARTITION=y
CONFIG_KARMA_PARTITION=y
CONFIG_EFI_PARTITION=y
# CONFIG_SYSV68_PARTITION is not set
CONFIG_NLS=y
CONFIG_NLS_DEFAULT="utf8"
CONFIG_NLS_CODEPAGE_437=y
CONFIG_NLS_CODEPAGE_737=m
CONFIG_NLS_CODEPAGE_775=m
CONFIG_NLS_CODEPAGE_850=m
CONFIG_NLS_CODEPAGE_852=m
CONFIG_NLS_CODEPAGE_855=m
CONFIG_NLS_CODEPAGE_857=m
CONFIG_NLS_CODEPAGE_860=m
CONFIG_NLS_CODEPAGE_861=m
CONFIG_NLS_CODEPAGE_862=m
CONFIG_NLS_CODEPAGE_863=m
CONFIG_NLS_CODEPAGE_864=m
CONFIG_NLS_CODEPAGE_865=m
CONFIG_NLS_CODEPAGE_866=m
CONFIG_NLS_CODEPAGE_869=m
CONFIG_NLS_CODEPAGE_936=m
CONFIG_NLS_CODEPAGE_950=m
CONFIG_NLS_CODEPAGE_932=m
CONFIG_NLS_CODEPAGE_949=m
CONFIG_NLS_CODEPAGE_874=m
CONFIG_NLS_ISO8859_8=m
CONFIG_NLS_CODEPAGE_1250=m
CONFIG_NLS_CODEPAGE_1251=m
CONFIG_NLS_ASCII=y
CONFIG_NLS_ISO8859_1=m
CONFIG_NLS_ISO8859_2=m
CONFIG_NLS_ISO8859_3=m
CONFIG_NLS_ISO8859_4=m
CONFIG_NLS_ISO8859_5=m
CONFIG_NLS_ISO8859_6=m
CONFIG_NLS_ISO8859_7=m
CONFIG_NLS_ISO8859_9=m
CONFIG_NLS_ISO8859_13=m
CONFIG_NLS_ISO8859_14=m
CONFIG_NLS_ISO8859_15=m
CONFIG_NLS_KOI8_R=m
CONFIG_NLS_KOI8_U=m
CONFIG_NLS_UTF8=m
CONFIG_DLM=m
CONFIG_DLM_DEBUG=y
CONFIG_BINARY_PRINTF=y

#
# Library routines
#
CONFIG_RAID6_PQ=m
CONFIG_BITREVERSE=y
CONFIG_CRC_CCITT=m
CONFIG_CRC16=m
# CONFIG_CRC_T10DIF is not set
CONFIG_CRC_ITU_T=m
CONFIG_CRC32=y
# CONFIG_CRC7 is not set
CONFIG_LIBCRC32C=y
CONFIG_ZLIB_INFLATE=y
CONFIG_ZLIB_DEFLATE=m
CONFIG_LZO_DECOMPRESS=y
CONFIG_XZ_DEC=y
CONFIG_XZ_DEC_X86=y
CONFIG_XZ_DEC_POWERPC=y
CONFIG_XZ_DEC_IA64=y
CONFIG_XZ_DEC_ARM=y
CONFIG_XZ_DEC_ARMTHUMB=y
CONFIG_XZ_DEC_SPARC=y
CONFIG_XZ_DEC_BCJ=y
# CONFIG_XZ_DEC_TEST is not set
CONFIG_DECOMPRESS_GZIP=y
CONFIG_DECOMPRESS_BZIP2=y
CONFIG_DECOMPRESS_LZMA=y
CONFIG_DECOMPRESS_XZ=y
CONFIG_DECOMPRESS_LZO=y
CONFIG_GENERIC_ALLOCATOR=y
CONFIG_REED_SOLOMON=m
CONFIG_REED_SOLOMON_DEC16=y
CONFIG_TEXTSEARCH=y
CONFIG_TEXTSEARCH_KMP=m
CONFIG_TEXTSEARCH_BM=m
CONFIG_TEXTSEARCH_FSM=m
CONFIG_HAS_IOMEM=y
CONFIG_HAS_IOPORT=y
CONFIG_HAS_DMA=y
CONFIG_CPU_RMAP=y
CONFIG_NLATTR=y
CONFIG_GENERIC_ATOMIC64=y
CONFIG_AVERAGE=y

#
# Kernel hacking
#
# CONFIG_PRINTK_TIME is not set
CONFIG_DEFAULT_MESSAGE_LOGLEVEL=4
CONFIG_ENABLE_WARN_DEPRECATED=y
CONFIG_ENABLE_MUST_CHECK=y
CONFIG_FRAME_WARN=1024
CONFIG_MAGIC_SYSRQ=y
# CONFIG_STRIP_ASM_SYMS is not set
# CONFIG_UNUSED_SYMBOLS is not set
CONFIG_DEBUG_FS=y
# CONFIG_HEADERS_CHECK is not set
# CONFIG_DEBUG_SECTION_MISMATCH is not set
CONFIG_DEBUG_KERNEL=y
# CONFIG_DEBUG_SHIRQ is not set
# CONFIG_LOCKUP_DETECTOR is not set
# CONFIG_HARDLOCKUP_DETECTOR is not set
# CONFIG_DETECT_HUNG_TASK is not set
CONFIG_SCHED_DEBUG=y
CONFIG_SCHEDSTATS=y
# CONFIG_TIMER_STATS is not set
# CONFIG_DEBUG_OBJECTS is not set
# CONFIG_DEBUG_SLAB is not set
# CONFIG_DEBUG_KMEMLEAK is not set
# CONFIG_DEBUG_RT_MUTEXES is not set
# CONFIG_RT_MUTEX_TESTER is not set
# CONFIG_DEBUG_SPINLOCK is not set
# CONFIG_DEBUG_MUTEXES is not set
# CONFIG_DEBUG_LOCK_ALLOC is not set
# CONFIG_PROVE_LOCKING is not set
# CONFIG_SPARSE_RCU_POINTER is not set
# CONFIG_LOCK_STAT is not set
# CONFIG_DEBUG_SPINLOCK_SLEEP is not set
# CONFIG_DEBUG_LOCKING_API_SELFTESTS is not set
CONFIG_STACKTRACE=y
# CONFIG_DEBUG_STACK_USAGE is not set
# CONFIG_DEBUG_KOBJECT is not set
CONFIG_DEBUG_BUGVERBOSE=y
CONFIG_DEBUG_INFO=y
# CONFIG_DEBUG_INFO_REDUCED is not set
# CONFIG_DEBUG_VM is not set
# CONFIG_DEBUG_WRITECOUNT is not set
CONFIG_DEBUG_MEMORY_INIT=y
CONFIG_DEBUG_LIST=y
# CONFIG_TEST_LIST_SORT is not set
# CONFIG_DEBUG_SG is not set
# CONFIG_DEBUG_NOTIFIERS is not set
# CONFIG_DEBUG_CREDENTIALS is not set
CONFIG_FRAME_POINTER=y
# CONFIG_RCU_TORTURE_TEST is not set
CONFIG_RCU_CPU_STALL_TIMEOUT=60
# CONFIG_KPROBES_SANITY_TEST is not set
# CONFIG_BACKTRACE_SELF_TEST is not set
# CONFIG_DEBUG_BLOCK_EXT_DEVT is not set
# CONFIG_DEBUG_FORCE_WEAK_PER_CPU is not set
# CONFIG_DEBUG_PER_CPU_MAPS is not set
# CONFIG_LKDTM is not set
# CONFIG_FAULT_INJECTION is not set
# CONFIG_LATENCYTOP is not set
# CONFIG_SYSCTL_SYSCALL_CHECK is not set
# CONFIG_DEBUG_PAGEALLOC is not set
CONFIG_NOP_TRACER=y
CONFIG_HAVE_FUNCTION_TRACER=y
CONFIG_HAVE_FUNCTION_GRAPH_TRACER=y
CONFIG_HAVE_DYNAMIC_FTRACE=y
CONFIG_HAVE_FTRACE_MCOUNT_RECORD=y
CONFIG_HAVE_SYSCALL_TRACEPOINTS=y
CONFIG_RING_BUFFER=y
CONFIG_EVENT_TRACING=y
CONFIG_EVENT_POWER_TRACING_DEPRECATED=y
CONFIG_CONTEXT_SWITCH_TRACER=y
CONFIG_RING_BUFFER_ALLOW_SWAP=y
CONFIG_TRACING=y
CONFIG_GENERIC_TRACER=y
CONFIG_TRACING_SUPPORT=y
CONFIG_FTRACE=y
CONFIG_FUNCTION_TRACER=y
CONFIG_FUNCTION_GRAPH_TRACER=y
# CONFIG_IRQSOFF_TRACER is not set
# CONFIG_SCHED_TRACER is not set
# CONFIG_FTRACE_SYSCALLS is not set
CONFIG_BRANCH_PROFILE_NONE=y
# CONFIG_PROFILE_ANNOTATED_BRANCHES is not set
# CONFIG_PROFILE_ALL_BRANCHES is not set
# CONFIG_STACK_TRACER is not set
CONFIG_BLK_DEV_IO_TRACE=y
CONFIG_KPROBE_EVENT=y
CONFIG_DYNAMIC_FTRACE=y
# CONFIG_FUNCTION_PROFILER is not set
CONFIG_FTRACE_MCOUNT_RECORD=y
# CONFIG_FTRACE_STARTUP_TEST is not set
# CONFIG_RING_BUFFER_BENCHMARK is not set
# CONFIG_FIREWIRE_OHCI_REMOTE_DMA is not set
# CONFIG_DYNAMIC_DEBUG is not set
# CONFIG_DMA_API_DEBUG is not set
# CONFIG_ATOMIC64_SELFTEST is not set
# CONFIG_ASYNC_RAID6_TEST is not set
# CONFIG_SAMPLES is not set
CONFIG_HAVE_ARCH_KGDB=y
# CONFIG_KGDB is not set
# CONFIG_TEST_KSTRTOX is not set
# CONFIG_PPC_DISABLE_WERROR is not set
CONFIG_PPC_WERROR=y
CONFIG_PRINT_STACK_DEPTH=64
CONFIG_DEBUG_STACKOVERFLOW=y
# CONFIG_PPC_EMULATED_STATS is not set
# CONFIG_CODE_PATCHING_SELFTEST is not set
# CONFIG_FTR_FIXUP_SELFTEST is not set
# CONFIG_MSI_BITMAP_SELFTEST is not set
# CONFIG_XMON is not set
# CONFIG_VIRQ_DEBUG is not set
# CONFIG_BDI_SWITCH is not set
# CONFIG_PPC_EARLY_DEBUG is not set

#
# Security options
#
CONFIG_KEYS=y
# CONFIG_TRUSTED_KEYS is not set
# CONFIG_KEYS_DEBUG_PROC_KEYS is not set
# CONFIG_SECURITY_DMESG_RESTRICT is not set
# CONFIG_SECURITY is not set
CONFIG_SECURITYFS=y
CONFIG_DEFAULT_SECURITY_DAC=y
CONFIG_DEFAULT_SECURITY=""
CONFIG_XOR_BLOCKS=m
CONFIG_ASYNC_CORE=m
CONFIG_ASYNC_MEMCPY=m
CONFIG_ASYNC_XOR=m
CONFIG_ASYNC_PQ=m
CONFIG_ASYNC_RAID6_RECOV=m
CONFIG_CRYPTO=y

#
# Crypto core or helper
#
CONFIG_CRYPTO_ALGAPI=y
CONFIG_CRYPTO_ALGAPI2=y
CONFIG_CRYPTO_AEAD=m
CONFIG_CRYPTO_AEAD2=y
CONFIG_CRYPTO_BLKCIPHER=m
CONFIG_CRYPTO_BLKCIPHER2=y
CONFIG_CRYPTO_HASH=y
CONFIG_CRYPTO_HASH2=y
CONFIG_CRYPTO_RNG=m
CONFIG_CRYPTO_RNG2=y
CONFIG_CRYPTO_PCOMP2=y
CONFIG_CRYPTO_MANAGER=y
CONFIG_CRYPTO_MANAGER2=y
CONFIG_CRYPTO_MANAGER_DISABLE_TESTS=y
# CONFIG_CRYPTO_GF128MUL is not set
CONFIG_CRYPTO_NULL=m
# CONFIG_CRYPTO_PCRYPT is not set
CONFIG_CRYPTO_WORKQUEUE=y
# CONFIG_CRYPTO_CRYPTD is not set
CONFIG_CRYPTO_AUTHENC=m
CONFIG_CRYPTO_TEST=m

#
# Authenticated Encryption with Associated Data
#
CONFIG_CRYPTO_CCM=m
# CONFIG_CRYPTO_GCM is not set
CONFIG_CRYPTO_SEQIV=m

#
# Block modes
#
CONFIG_CRYPTO_CBC=m
CONFIG_CRYPTO_CTR=m
# CONFIG_CRYPTO_CTS is not set
CONFIG_CRYPTO_ECB=m
# CONFIG_CRYPTO_LRW is not set
# CONFIG_CRYPTO_PCBC is not set
# CONFIG_CRYPTO_XTS is not set

#
# Hash modes
#
CONFIG_CRYPTO_HMAC=y
CONFIG_CRYPTO_XCBC=m
# CONFIG_CRYPTO_VMAC is not set

#
# Digest
#
CONFIG_CRYPTO_CRC32C=y
# CONFIG_CRYPTO_GHASH is not set
CONFIG_CRYPTO_MD4=m
CONFIG_CRYPTO_MD5=m
CONFIG_CRYPTO_MICHAEL_MIC=m
# CONFIG_CRYPTO_RMD128 is not set
# CONFIG_CRYPTO_RMD160 is not set
# CONFIG_CRYPTO_RMD256 is not set
# CONFIG_CRYPTO_RMD320 is not set
CONFIG_CRYPTO_SHA1=y
CONFIG_CRYPTO_SHA256=m
CONFIG_CRYPTO_SHA512=m
CONFIG_CRYPTO_TGR192=m
CONFIG_CRYPTO_WP512=m

#
# Ciphers
#
CONFIG_CRYPTO_AES=m
CONFIG_CRYPTO_ANUBIS=m
CONFIG_CRYPTO_ARC4=m
CONFIG_CRYPTO_BLOWFISH=m
# CONFIG_CRYPTO_CAMELLIA is not set
CONFIG_CRYPTO_CAST5=m
CONFIG_CRYPTO_CAST6=m
CONFIG_CRYPTO_DES=m
# CONFIG_CRYPTO_FCRYPT is not set
CONFIG_CRYPTO_KHAZAD=m
# CONFIG_CRYPTO_SALSA20 is not set
# CONFIG_CRYPTO_SEED is not set
CONFIG_CRYPTO_SERPENT=m
CONFIG_CRYPTO_TEA=m
CONFIG_CRYPTO_TWOFISH=m
CONFIG_CRYPTO_TWOFISH_COMMON=m

#
# Compression
#
CONFIG_CRYPTO_DEFLATE=m
# CONFIG_CRYPTO_ZLIB is not set
# CONFIG_CRYPTO_LZO is not set

#
# Random Number Generation
#
CONFIG_CRYPTO_ANSI_CPRNG=m
# CONFIG_CRYPTO_USER_API_HASH is not set
# CONFIG_CRYPTO_USER_API_SKCIPHER is not set
CONFIG_CRYPTO_HW=y
# CONFIG_CRYPTO_DEV_HIFN_795X is not set
# CONFIG_CRYPTO_DEV_FSL_CAAM is not set
# CONFIG_CRYPTO_DEV_TALITOS is not set
# CONFIG_PPC_CLOCK is not set
# CONFIG_VIRTUALIZATION is not set

^ permalink raw reply	[flat|nested] 138+ messages in thread

* Re: [PATCH 1/1] Fixup write permission of TLB on powerpc e500 core
  2011-07-18  7:36                         ` Benjamin Herrenschmidt
@ 2011-07-19  3:30                           ` Shan Hai
  -1 siblings, 0 replies; 138+ messages in thread
From: Shan Hai @ 2011-07-19  3:30 UTC (permalink / raw)
  To: Benjamin Herrenschmidt
  Cc: Peter Zijlstra, Peter Zijlstra, paulus, tglx, walken, dhowells,
	cmetcalf, tony.luck, akpm, linuxppc-dev, linux-kernel

On 07/18/2011 03:36 PM, Benjamin Herrenschmidt wrote:
> On Mon, 2011-07-18 at 15:26 +0800, Shan Hai wrote:
>> I am sorry I hadn't tried your newer patch, I tried it but it still
>> could not work in my test environment, I will dig into and tell you
>> why that failed later.
> Ok, please let me know what you find !
>

I have not yet found out why it failed.
I tried the following, based on your code:
(1)
diff --git a/kernel/futex.c b/kernel/futex.c
index fe28dc2..820556d 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -353,10 +353,11 @@ static int fault_in_user_writeable(u32 __user *uaddr)
  {
         struct mm_struct *mm = current->mm;
         int ret;
+       int flags = FOLL_TOUCH | FOLL_GET | FOLL_WRITE | FOLL_FIXFAULT;

         down_read(&mm->mmap_sem);
-       ret = get_user_pages(current, mm, (unsigned long)uaddr,
-                            1, 1, 0, NULL, NULL);
+       ret = __get_user_pages(current, mm, (unsigned long)uaddr, 1,
+                              flags, NULL, NULL, NULL);
         up_read(&mm->mmap_sem);

         return ret < 0 ? ret : 0;

(2)
diff --git a/mm/memory.c b/mm/memory.c
index 9b8a01d..f7ba26e 100644
--- a/mm/memory.c
+++ b/mm/memory.c
...
+
+       if ((flags & (FOLL_WRITE | FOLL_FIXFAULT)) && !pte_dirty(pte))
+               handle_pte_sw_young_dirty(vma, address, ptep,
+                                          FAULT_FLAG_WRITE);
...

And everything looks good, but it still doesn't work; more
investigation is needed.

>> Yep, I know holding lots of ifdef's everywhere is not so good,
>> but if we have some other way(I don't know how till now) to
>> figure out the arch has the need to fixup up the write permission
>> we could eradicate the ugly ifdef's here.
>>
>> I think the handle_mm_fault could do all dirty/young tracking,
>> because the purpose of making follow_page return NULL to
>> its caller is that want to the handle_mm_fault to be called
>> on write permission protection fault.
> I see your point. Rather than factoring the fixup code out, we could
> force gup to call handle_mm_fault()... that makes sense.
>
> However, I don't think we should special case archs. There's plenty of
> cases where we don't care about this fixup even on archs that do SW
> tracking of dirty and young. For example when gup is using for
> subsequent DMA.
>
> Only the (rare ?) cases where it's used as a mean to fixup a failing
> "atomic" user access are relevant.
>
> So I believe we should still pass an explicit flag to __get_user_pages()
> as I propose to activate that behaviour.
>

How about the following one?
The write permission fixup behaviour is triggered explicitly by
the troublesome callers, such as futex, as you suggested.

In this way, follow_page() mimics exactly how the MMU
faults on atomic access to the user pages, and we can handle
the fault with the already existing handle_mm_fault(), which also does
the dirty/young tracking properly.


diff --git a/include/linux/mm.h b/include/linux/mm.h
index 9670f71..8a76694 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1546,6 +1546,7 @@ struct page *follow_page(struct vm_area_struct *, 
unsigned long address,
  #define FOLL_MLOCK    0x40    /* mark page as mlocked */
  #define FOLL_SPLIT    0x80    /* don't return transhuge pages, split 
them */
  #define FOLL_HWPOISON    0x100    /* check page is hwpoisoned */
+#define FOLL_FIXFAULT    0x200    /* fixup after a fault (PTE 
dirty/young upd) */

  typedef int (*pte_fn_t)(pte_t *pte, pgtable_t token, unsigned long addr,
              void *data);
diff --git a/kernel/futex.c b/kernel/futex.c
index fe28dc2..820556d 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -353,10 +353,11 @@ static int fault_in_user_writeable(u32 __user *uaddr)
  {
      struct mm_struct *mm = current->mm;
      int ret;
+    int flags = FOLL_TOUCH | FOLL_GET | FOLL_WRITE | FOLL_FIXFAULT;

      down_read(&mm->mmap_sem);
-    ret = get_user_pages(current, mm, (unsigned long)uaddr,
-                 1, 1, 0, NULL, NULL);
+    ret = __get_user_pages(current, mm, (unsigned long)uaddr, 1,
+                   flags, NULL, NULL, NULL);
      up_read(&mm->mmap_sem);

      return ret < 0 ? ret : 0;
diff --git a/mm/memory.c b/mm/memory.c
index 9b8a01d..5682501 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1442,6 +1442,7 @@ struct page *follow_page(struct vm_area_struct 
*vma, unsigned long address,
      spinlock_t *ptl;
      struct page *page;
      struct mm_struct *mm = vma->vm_mm;
+    int fix_write_permission = 0;

      page = follow_huge_addr(mm, address, flags & FOLL_WRITE);
      if (!IS_ERR(page)) {
@@ -1519,6 +1520,9 @@ split_fallthrough:
          if ((flags & FOLL_WRITE) &&
              !pte_dirty(pte) && !PageDirty(page))
              set_page_dirty(page);
+
+        if ((flags & (FOLL_WRITE | FOLL_FIXFAULT)) && !pte_dirty(pte))
+            fix_write_permission = 1;
          /*
           * pte_mkyoung() would be more correct here, but atomic care
           * is needed to avoid losing the dirty bit: it is easier to use
@@ -1551,7 +1555,7 @@ split_fallthrough:
  unlock:
      pte_unmap_unlock(ptep, ptl);
  out:
-    return page;
+    return (fix_write_permission) ? NULL : page;

  bad_page:
      pte_unmap_unlock(ptep, ptl);

> At this point, since we have isolated the special case callers, I think
> we are pretty much in a situation where there's no point trying to
> optimize the x86 case more, it's a fairly slow path anyway, and so no
> ifdef should be needed (and x86 already #define out the TLB flush for
> spurious faults in handle_pte_fault today).
>
> We don't even need to change follow_page()... we just don't call it the
> first time around.
>
> I'll cook up another patch later but first we need to find out why the
> one you have doesn't work. There might be another problem lurking (or I
> just made a stupid mistake).
>
> BTW. Can you give me some details about how you reproduce the problem ?
> I should setup something on a booke machine here to verify things.
>
> Cheers,
> Ben.
>


^ permalink raw reply	[flat|nested] 138+ messages in thread

* Re: [PATCH 1/1] Fixup write permission of TLB on powerpc e500 core
@ 2011-07-19  3:30                           ` Shan Hai
  0 siblings, 0 replies; 138+ messages in thread
From: Shan Hai @ 2011-07-19  3:30 UTC (permalink / raw)
  To: Benjamin Herrenschmidt
  Cc: tony.luck, Peter Zijlstra, Peter Zijlstra, linux-kernel,
	cmetcalf, dhowells, paulus, tglx, walken, linuxppc-dev, akpm

On 07/18/2011 03:36 PM, Benjamin Herrenschmidt wrote:
> On Mon, 2011-07-18 at 15:26 +0800, Shan Hai wrote:
>> I am sorry I hadn't tried your newer patch, I tried it but it still
>> could not work in my test environment, I will dig into and tell you
>> why that failed later.
> Ok, please let me know what you find !
>

I have not yet found out why it failed.
I tried the following, based on your code:
(1)
diff --git a/kernel/futex.c b/kernel/futex.c
index fe28dc2..820556d 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -353,10 +353,11 @@ static int fault_in_user_writeable(u32 __user *uaddr)
  {
         struct mm_struct *mm = current->mm;
         int ret;
+       int flags = FOLL_TOUCH | FOLL_GET | FOLL_WRITE | FOLL_FIXFAULT;

         down_read(&mm->mmap_sem);
-       ret = get_user_pages(current, mm, (unsigned long)uaddr,
-                            1, 1, 0, NULL, NULL);
+       ret = __get_user_pages(current, mm, (unsigned long)uaddr, 1,
+                              flags, NULL, NULL, NULL);
         up_read(&mm->mmap_sem);

         return ret < 0 ? ret : 0;

(2)
diff --git a/mm/memory.c b/mm/memory.c
index 9b8a01d..f7ba26e 100644
--- a/mm/memory.c
+++ b/mm/memory.c
...
+
+       if ((flags & (FOLL_WRITE | FOLL_FIXFAULT)) && !pte_dirty(pte))
+               handle_pte_sw_young_dirty(vma, address, ptep,
+                                          FAULT_FLAG_WRITE);
...

And everything looks good, but it still doesn't work; more
investigation is needed.

>> Yep, I know holding lots of ifdef's everywhere is not so good,
>> but if we have some other way(I don't know how till now) to
>> figure out the arch has the need to fixup up the write permission
>> we could eradicate the ugly ifdef's here.
>>
>> I think the handle_mm_fault could do all dirty/young tracking,
>> because the purpose of making follow_page return NULL to
>> its caller is that want to the handle_mm_fault to be called
>> on write permission protection fault.
> I see your point. Rather than factoring the fixup code out, we could
> force gup to call handle_mm_fault()... that makes sense.
>
> However, I don't think we should special case archs. There's plenty of
> cases where we don't care about this fixup even on archs that do SW
> tracking of dirty and young. For example when gup is using for
> subsequent DMA.
>
> Only the (rare ?) cases where it's used as a mean to fixup a failing
> "atomic" user access are relevant.
>
> So I believe we should still pass an explicit flag to __get_user_pages()
> as I propose to activate that behaviour.
>

How about the following one?
The write permission fixup behaviour is triggered explicitly by
the troublesome callers, such as futex, as you suggested.

In this way, follow_page() mimics exactly how the MMU
faults on atomic access to the user pages, and we can handle
the fault with the already existing handle_mm_fault(), which also does
the dirty/young tracking properly.


diff --git a/include/linux/mm.h b/include/linux/mm.h
index 9670f71..8a76694 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1546,6 +1546,7 @@ struct page *follow_page(struct vm_area_struct *, 
unsigned long address,
  #define FOLL_MLOCK    0x40    /* mark page as mlocked */
  #define FOLL_SPLIT    0x80    /* don't return transhuge pages, split 
them */
  #define FOLL_HWPOISON    0x100    /* check page is hwpoisoned */
+#define FOLL_FIXFAULT    0x200    /* fixup after a fault (PTE 
dirty/young upd) */

  typedef int (*pte_fn_t)(pte_t *pte, pgtable_t token, unsigned long addr,
              void *data);
diff --git a/kernel/futex.c b/kernel/futex.c
index fe28dc2..820556d 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -353,10 +353,11 @@ static int fault_in_user_writeable(u32 __user *uaddr)
  {
      struct mm_struct *mm = current->mm;
      int ret;
+    int flags = FOLL_TOUCH | FOLL_GET | FOLL_WRITE | FOLL_FIXFAULT;

      down_read(&mm->mmap_sem);
-    ret = get_user_pages(current, mm, (unsigned long)uaddr,
-                 1, 1, 0, NULL, NULL);
+    ret = __get_user_pages(current, mm, (unsigned long)uaddr, 1,
+                   flags, NULL, NULL, NULL);
      up_read(&mm->mmap_sem);

      return ret < 0 ? ret : 0;
diff --git a/mm/memory.c b/mm/memory.c
index 9b8a01d..5682501 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1442,6 +1442,7 @@ struct page *follow_page(struct vm_area_struct 
*vma, unsigned long address,
      spinlock_t *ptl;
      struct page *page;
      struct mm_struct *mm = vma->vm_mm;
+    int fix_write_permission = 0;

      page = follow_huge_addr(mm, address, flags & FOLL_WRITE);
      if (!IS_ERR(page)) {
@@ -1519,6 +1520,9 @@ split_fallthrough:
          if ((flags & FOLL_WRITE) &&
              !pte_dirty(pte) && !PageDirty(page))
              set_page_dirty(page);
+
+        if ((flags & (FOLL_WRITE | FOLL_FIXFAULT)) && !pte_dirty(pte))
+            fix_write_permission = 1;
          /*
           * pte_mkyoung() would be more correct here, but atomic care
           * is needed to avoid losing the dirty bit: it is easier to use
@@ -1551,7 +1555,7 @@ split_fallthrough:
  unlock:
      pte_unmap_unlock(ptep, ptl);
  out:
-    return page;
+    return (fix_write_permission) ? NULL : page;

  bad_page:
      pte_unmap_unlock(ptep, ptl);

> At this point, since we have isolated the special case callers, I think
> we are pretty much in a situation where there's no point trying to
> optimize the x86 case more, it's a fairly slow path anyway, and so no
> ifdef should be needed (and x86 already #define out the TLB flush for
> spurious faults in handle_pte_fault today).
>
> We don't even need to change follow_page()... we just don't call it the
> first time around.
>
> I'll cook up another patch later but first we need to find out why the
> one you have doesn't work. There might be another problem lurking (or I
> just made a stupid mistake).
>
> BTW. Can you give me some details about how you reproduce the problem ?
> I should setup something on a booke machine here to verify things.
>
> Cheers,
> Ben.
>

^ permalink raw reply	[flat|nested] 138+ messages in thread

* Re: [PATCH 1/1] Fixup write permission of TLB on powerpc e500 core
  2011-07-19  3:30                           ` Shan Hai
@ 2011-07-19  4:20                             ` Benjamin Herrenschmidt
  -1 siblings, 0 replies; 138+ messages in thread
From: Benjamin Herrenschmidt @ 2011-07-19  4:20 UTC (permalink / raw)
  To: Shan Hai
  Cc: Peter Zijlstra, Peter Zijlstra, paulus, tglx, walken, dhowells,
	cmetcalf, tony.luck, akpm, linuxppc-dev, linux-kernel,
	Kumar Gala

On Tue, 2011-07-19 at 11:30 +0800, Shan Hai wrote:
> On 07/18/2011 03:36 PM, Benjamin Herrenschmidt wrote:
> > On Mon, 2011-07-18 at 15:26 +0800, Shan Hai wrote:
> >> I am sorry I hadn't tried your newer patch, I tried it but it still
> >> could not work in my test environment, I will dig into and tell you
> >> why that failed later.
> > Ok, please let me know what you find !
> >
> 
> Have not been finding out the reason why failed,
> I tried the following based on your code,

Ok, looks like we'll need to dig more, though the original findings
still stand, which means we might be chasing two different bugs :-)

I haven't had time to try to reproduce today and may not this week,
so I'll have to let you toy around with it until I get a chance to
try to track it down myself unless somebody else gets into it... Kumar ?
Anybody on FSL side feels like having a look ?
 
> How about the following one?
> the write permission fixup behaviour is triggered explicitly by
> the trouble making parts like futex as you suggested.
> 
> In this way, the follow_page() mimics exactly how the MMU
> faults on atomic access to the user pages, and we could handle
> the fault by already existing handle_mm_fault which also do
> the dirty/young tracking properly.

So you say this still doesn't fix your problem right ?

> diff --git a/include/linux/mm.h b/include/linux/mm.h
> index 9670f71..8a76694 100644
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -1546,6 +1546,7 @@ struct page *follow_page(struct vm_area_struct *, 
> unsigned long address,
>   #define FOLL_MLOCK    0x40    /* mark page as mlocked */
>   #define FOLL_SPLIT    0x80    /* don't return transhuge pages, split 
> them */
>   #define FOLL_HWPOISON    0x100    /* check page is hwpoisoned */
> +#define FOLL_FIXFAULT    0x200    /* fixup after a fault (PTE 
> dirty/young upd) */

Badly wrapped it seems :-) And totally whitespace damaged...

>   typedef int (*pte_fn_t)(pte_t *pte, pgtable_t token, unsigned long addr,
>               void *data);
> diff --git a/kernel/futex.c b/kernel/futex.c
> index fe28dc2..820556d 100644
> --- a/kernel/futex.c
> +++ b/kernel/futex.c
> @@ -353,10 +353,11 @@ static int fault_in_user_writeable(u32 __user *uaddr)
>   {
>       struct mm_struct *mm = current->mm;
>       int ret;
> +    int flags = FOLL_TOUCH | FOLL_GET | FOLL_WRITE | FOLL_FIXFAULT;

You don't want TOUCH -and- FIXFAULT do you ? Also you don't want GET
since you aren't passing a page array or vma array anyway.

>       down_read(&mm->mmap_sem);
> -    ret = get_user_pages(current, mm, (unsigned long)uaddr,
> -                 1, 1, 0, NULL, NULL);
> +    ret = __get_user_pages(current, mm, (unsigned long)uaddr, 1,
> +                   flags, NULL, NULL, NULL);
>       up_read(&mm->mmap_sem);
> 
>       return ret < 0 ? ret : 0;
> diff --git a/mm/memory.c b/mm/memory.c
> index 9b8a01d..5682501 100644
> --- a/mm/memory.c
> +++ b/mm/memory.c
> @@ -1442,6 +1442,7 @@ struct page *follow_page(struct vm_area_struct 
> *vma, unsigned long address,
>       spinlock_t *ptl;
>       struct page *page;
>       struct mm_struct *mm = vma->vm_mm;
> +    int fix_write_permission = 0;

Don't do that.

>       page = follow_huge_addr(mm, address, flags & FOLL_WRITE);
>       if (!IS_ERR(page)) {
> @@ -1519,6 +1520,9 @@ split_fallthrough:
>           if ((flags & FOLL_WRITE) &&
>               !pte_dirty(pte) && !PageDirty(page))
>               set_page_dirty(page);
> +
> +        if ((flags & (FOLL_WRITE | FOLL_FIXFAULT)) && !pte_dirty(pte))
> +            fix_write_permission = 1;

No, you missed my point completely. If FOLL_FIXFAULT is set, you don't
even need to call follow_page() to begin with... you -always- want to
force a call to handle_mm_fault (and only one, no loop), regardless
of whether the PTE is dirty or not, since you need to also address
the lack of a young bit.

(That might explain why your patch doesn't work if your problem is
caused by a missing young bit).

What about the patch in my next email...

Ben.




^ permalink raw reply	[flat|nested] 138+ messages in thread

* Re: [PATCH 1/1] Fixup write permission of TLB on powerpc e500 core
@ 2011-07-19  4:20                             ` Benjamin Herrenschmidt
  0 siblings, 0 replies; 138+ messages in thread
From: Benjamin Herrenschmidt @ 2011-07-19  4:20 UTC (permalink / raw)
  To: Shan Hai
  Cc: tony.luck, Peter Zijlstra, Peter Zijlstra, linux-kernel,
	cmetcalf, dhowells, paulus, tglx, walken, linuxppc-dev, akpm

On Tue, 2011-07-19 at 11:30 +0800, Shan Hai wrote:
> On 07/18/2011 03:36 PM, Benjamin Herrenschmidt wrote:
> > On Mon, 2011-07-18 at 15:26 +0800, Shan Hai wrote:
> >> I am sorry I hadn't tried your newer patch, I tried it but it still
> >> could not work in my test environment, I will dig into and tell you
> >> why that failed later.
> > Ok, please let me know what you find !
> >
> 
> Have not been finding out the reason why failed,
> I tried the following based on your code,

Ok, looks like we'll need to dig more, though the original findings
still stand, which means we might be chasing two different bugs :-)

I haven't had time to try to reproduce today and may not this week,
so I'll have to let you toy around with it until I get a chance to
try to track it down myself unless somebody else gets into it... Kumar ?
Anybody on FSL side feels like having a look ?
 
> How about the following one?
> the write permission fixup behaviour is triggered explicitly by
> the trouble making parts like futex as you suggested.
> 
> In this way, the follow_page() mimics exactly how the MMU
> faults on atomic access to the user pages, and we could handle
> the fault by already existing handle_mm_fault which also do
> the dirty/young tracking properly.

So you say this still doesn't fix your problem right ?

> diff --git a/include/linux/mm.h b/include/linux/mm.h
> index 9670f71..8a76694 100644
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -1546,6 +1546,7 @@ struct page *follow_page(struct vm_area_struct *, 
> unsigned long address,
>   #define FOLL_MLOCK    0x40    /* mark page as mlocked */
>   #define FOLL_SPLIT    0x80    /* don't return transhuge pages, split 
> them */
>   #define FOLL_HWPOISON    0x100    /* check page is hwpoisoned */
> +#define FOLL_FIXFAULT    0x200    /* fixup after a fault (PTE 
> dirty/young upd) */

Badly wrapped it seems :-) And totally whitespace damaged...

>   typedef int (*pte_fn_t)(pte_t *pte, pgtable_t token, unsigned long addr,
>               void *data);
> diff --git a/kernel/futex.c b/kernel/futex.c
> index fe28dc2..820556d 100644
> --- a/kernel/futex.c
> +++ b/kernel/futex.c
> @@ -353,10 +353,11 @@ static int fault_in_user_writeable(u32 __user *uaddr)
>   {
>       struct mm_struct *mm = current->mm;
>       int ret;
> +    int flags = FOLL_TOUCH | FOLL_GET | FOLL_WRITE | FOLL_FIXFAULT;

You don't want TOUCH -and- FIXFAULT do you ? Also you don't want GET
since you aren't passing a page array or vma array anyway.

>       down_read(&mm->mmap_sem);
> -    ret = get_user_pages(current, mm, (unsigned long)uaddr,
> -                 1, 1, 0, NULL, NULL);
> +    ret = __get_user_pages(current, mm, (unsigned long)uaddr, 1,
> +                   flags, NULL, NULL, NULL);
>       up_read(&mm->mmap_sem);
> 
>       return ret < 0 ? ret : 0;
> diff --git a/mm/memory.c b/mm/memory.c
> index 9b8a01d..5682501 100644
> --- a/mm/memory.c
> +++ b/mm/memory.c
> @@ -1442,6 +1442,7 @@ struct page *follow_page(struct vm_area_struct 
> *vma, unsigned long address,
>       spinlock_t *ptl;
>       struct page *page;
>       struct mm_struct *mm = vma->vm_mm;
> +    int fix_write_permission = 0;

Don't do that.

>       page = follow_huge_addr(mm, address, flags & FOLL_WRITE);
>       if (!IS_ERR(page)) {
> @@ -1519,6 +1520,9 @@ split_fallthrough:
>           if ((flags & FOLL_WRITE) &&
>               !pte_dirty(pte) && !PageDirty(page))
>               set_page_dirty(page);
> +
> +        if ((flags & (FOLL_WRITE | FOLL_FIXFAULT)) && !pte_dirty(pte))
> +            fix_write_permission = 1;

No, you missed my point completely. If FOLL_FIXFAULT is set, you don't
even need to call follow_page() to begin with... you -always- want to
force a call to handle_mm_fault (and only one, no loop), regardless
of whether the PTE is dirty or not, since you need to also address
the lack of a young bit.

(That might explain why your patch doesn't work if your problem is
caused by a missing young bit).

What about the patch in my next email...

Ben.

^ permalink raw reply	[flat|nested] 138+ messages in thread

* [RFC/PATCH] mm/futex: Fix futex writes on archs with SW tracking of dirty & young
  2011-07-19  3:30                           ` Shan Hai
@ 2011-07-19  4:29                             ` Benjamin Herrenschmidt
  -1 siblings, 0 replies; 138+ messages in thread
From: Benjamin Herrenschmidt @ 2011-07-19  4:29 UTC (permalink / raw)
  To: Shan Hai
  Cc: Peter Zijlstra, Peter Zijlstra, paulus, tglx, walken, dhowells,
	cmetcalf, tony.luck, akpm, linuxppc-dev, linux-kernel

The futex code currently attempts to write to user memory within
a pagefault disabled section, and if that fails, tries to fix it
up using get_user_pages().

This doesn't work on archs where the dirty and young bits are
maintained by software, since they will gate access permission
in the TLB, and will not be updated by gup().

In addition, there's an expectation on some archs that a
spurious write fault triggers a local TLB flush, and that is
missing from the picture as well.

I decided that adding those "features" to gup() would be too much
for this already too complex function, and instead added a new
simpler fixup_user_fault() which is essentially a wrapper around
handle_mm_fault() which the futex code can call.

Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
---

Shan, can you test this ? It might not fix the problem since I'm
starting to have the nasty feeling that you are hitting what is
somewhat a subtly different issue or my previous patch should
have worked (but then I might have done a stupid mistake as well)
but let us know anyway.

Cheers,
Ben.

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 9670f71..1036614 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -985,6 +985,8 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
 int get_user_pages_fast(unsigned long start, int nr_pages, int write,
 			struct page **pages);
 struct page *get_dump_page(unsigned long addr);
+extern int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm,
+			    unsigned long address, unsigned int fault_flags);
 
 extern int try_to_release_page(struct page * page, gfp_t gfp_mask);
 extern void do_invalidatepage(struct page *page, unsigned long offset);
diff --git a/kernel/futex.c b/kernel/futex.c
index fe28dc2..7a0a4ed 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -355,8 +355,8 @@ static int fault_in_user_writeable(u32 __user *uaddr)
 	int ret;
 
 	down_read(&mm->mmap_sem);
-	ret = get_user_pages(current, mm, (unsigned long)uaddr,
-			     1, 1, 0, NULL, NULL);
+	ret = fixup_user_fault(current, mm, (unsigned long)uaddr,
+			       FAULT_FLAG_WRITE);
 	up_read(&mm->mmap_sem);
 
 	return ret < 0 ? ret : 0;
diff --git a/mm/memory.c b/mm/memory.c
index 40b7531..b967fb0 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1815,7 +1815,64 @@ next_page:
 }
 EXPORT_SYMBOL(__get_user_pages);
 
-/**
+/*
+ * fixup_user_fault() - manually resolve a user page  fault
+ * @tsk:	the task_struct to use for page fault accounting, or
+ *		NULL if faults are not to be recorded.
+ * @mm:		mm_struct of target mm
+ * @address:	user address
+ * @fault_flags:flags to pass down to handle_mm_fault()
+ *
+ * This is meant to be called in the specific scenario where for
+ * locking reasons we try to access user memory in atomic context
+ * (within a pagefault_disable() section), this returns -EFAULT,
+ * and we want to resolve the user fault before trying again.
+ *
+ * Typically this is meant to be used by the futex code.
+ *
+ * The main difference with get_user_pages() is that this function
+ * will unconditionally call handle_mm_fault() which will in turn
+ * perform all the necessary SW fixup of the dirty and young bits
+ * in the PTE, while get_user_pages() only guarantees to update
+ * these in the struct page.
+ *
+ * This is important for some architectures where those bits also
+ * gate the access permission to the page because they are
+ * maintained in software. On such architecture, gup() will not
+ * be enough to make a subsequent access succeed.
+ *
+ * This should be called with the mmap_sem held for read.
+ */
+int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm,
+		     unsigned long address, unsigned int fault_flags)
+{
+	struct vm_area_struct *vma;
+	int ret;
+
+	vma = find_extend_vma(mm, address);
+	if (!vma || address < vma->vm_start)
+		return -EFAULT;
+	
+	ret = handle_mm_fault(mm, vma, address, fault_flags);
+	if (ret & VM_FAULT_ERROR) {
+		if (ret & VM_FAULT_OOM)
+			return -ENOMEM;
+		if (ret & (VM_FAULT_HWPOISON | VM_FAULT_HWPOISON_LARGE))
+			return -EHWPOISON;
+		if (ret & VM_FAULT_SIGBUS)
+			return -EFAULT;
+		BUG();
+	}
+	if (tsk) {
+		if (ret & VM_FAULT_MAJOR)
+			tsk->maj_flt++;
+		else
+			tsk->min_flt++;
+	}
+	return 0;
+}
+
+/*
  * get_user_pages() - pin user pages in memory
  * @tsk:	the task_struct to use for page fault accounting, or
  *		NULL if faults are not to be recorded.



^ permalink raw reply	[flat|nested] 138+ messages in thread

* [RFC/PATCH] mm/futex: Fix futex writes on archs with SW tracking of dirty & young
@ 2011-07-19  4:29                             ` Benjamin Herrenschmidt
  0 siblings, 0 replies; 138+ messages in thread
From: Benjamin Herrenschmidt @ 2011-07-19  4:29 UTC (permalink / raw)
  To: Shan Hai
  Cc: tony.luck, Peter Zijlstra, Peter Zijlstra, linux-kernel,
	cmetcalf, dhowells, paulus, tglx, walken, linuxppc-dev, akpm

The futex code currently attempts to write to user memory within
a pagefault disabled section, and if that fails, tries to fix it
up using get_user_pages().

This doesn't work on archs where the dirty and young bits are
maintained by software, since they will gate access permission
in the TLB, and will not be updated by gup().

In addition, there's an expectation on some archs that a
spurious write fault triggers a local TLB flush, and that is
missing from the picture as well.

I decided that adding those "features" to gup() would be too much
for this already too complex function, and instead added a new
simpler fixup_user_fault() which is essentially a wrapper around
handle_mm_fault() which the futex code can call.

Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
---

Shan, can you test this ? It might not fix the problem since I'm
starting to have the nasty feeling that you are hitting what is
somewhat a subtly different issue or my previous patch should
have worked (but then I might have done a stupid mistake as well)
but let us know anyway.

Cheers,
Ben.

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 9670f71..1036614 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -985,6 +985,8 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
 int get_user_pages_fast(unsigned long start, int nr_pages, int write,
 			struct page **pages);
 struct page *get_dump_page(unsigned long addr);
+extern int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm,
+			    unsigned long address, unsigned int fault_flags);
 
 extern int try_to_release_page(struct page * page, gfp_t gfp_mask);
 extern void do_invalidatepage(struct page *page, unsigned long offset);
diff --git a/kernel/futex.c b/kernel/futex.c
index fe28dc2..7a0a4ed 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -355,8 +355,8 @@ static int fault_in_user_writeable(u32 __user *uaddr)
 	int ret;
 
 	down_read(&mm->mmap_sem);
-	ret = get_user_pages(current, mm, (unsigned long)uaddr,
-			     1, 1, 0, NULL, NULL);
+	ret = fixup_user_fault(current, mm, (unsigned long)uaddr,
+			       FAULT_FLAG_WRITE);
 	up_read(&mm->mmap_sem);
 
 	return ret < 0 ? ret : 0;
diff --git a/mm/memory.c b/mm/memory.c
index 40b7531..b967fb0 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1815,7 +1815,64 @@ next_page:
 }
 EXPORT_SYMBOL(__get_user_pages);
 
-/**
+/*
+ * fixup_user_fault() - manually resolve a user page  fault
+ * @tsk:	the task_struct to use for page fault accounting, or
+ *		NULL if faults are not to be recorded.
+ * @mm:		mm_struct of target mm
+ * @address:	user address
+ * @fault_flags:flags to pass down to handle_mm_fault()
+ *
+ * This is meant to be called in the specific scenario where for
+ * locking reasons we try to access user memory in atomic context
+ * (within a pagefault_disable() section), this returns -EFAULT,
+ * and we want to resolve the user fault before trying again.
+ *
+ * Typically this is meant to be used by the futex code.
+ *
+ * The main difference with get_user_pages() is that this function
+ * will unconditionally call handle_mm_fault() which will in turn
+ * perform all the necessary SW fixup of the dirty and young bits
+ * in the PTE, while get_user_pages() only guarantees to update
+ * these in the struct page.
+ *
+ * This is important for some architectures where those bits also
+ * gate the access permission to the page because they are
+ * maintained in software. On such architecture, gup() will not
+ * be enough to make a subsequent access succeed.
+ *
+ * This should be called with the mmap_sem held for read.
+ */
+int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm,
+		     unsigned long address, unsigned int fault_flags)
+{
+	struct vm_area_struct *vma;
+	int ret;
+
+	vma = find_extend_vma(mm, address);
+	if (!vma || address < vma->vm_start)
+		return -EFAULT;
+	
+	ret = handle_mm_fault(mm, vma, address, fault_flags);
+	if (ret & VM_FAULT_ERROR) {
+		if (ret & VM_FAULT_OOM)
+			return -ENOMEM;
+		if (ret & (VM_FAULT_HWPOISON | VM_FAULT_HWPOISON_LARGE))
+			return -EHWPOISON;
+		if (ret & VM_FAULT_SIGBUS)
+			return -EFAULT;
+		BUG();
+	}
+	if (tsk) {
+		if (ret & VM_FAULT_MAJOR)
+			tsk->maj_flt++;
+		else
+			tsk->min_flt++;
+	}
+	return 0;
+}
+
+/*
  * get_user_pages() - pin user pages in memory
  * @tsk:	the task_struct to use for page fault accounting, or
  *		NULL if faults are not to be recorded.

^ permalink raw reply	[flat|nested] 138+ messages in thread

* Re: [RFC/PATCH] mm/futex: Fix futex writes on archs with SW tracking of dirty & young
  2011-07-19  4:29                             ` Benjamin Herrenschmidt
@ 2011-07-19  4:55                               ` Shan Hai
  -1 siblings, 0 replies; 138+ messages in thread
From: Shan Hai @ 2011-07-19  4:55 UTC (permalink / raw)
  To: Benjamin Herrenschmidt
  Cc: Peter Zijlstra, Peter Zijlstra, paulus, tglx, walken, dhowells,
	cmetcalf, tony.luck, akpm, linuxppc-dev, linux-kernel

On 07/19/2011 12:29 PM, Benjamin Herrenschmidt wrote:
> The futex code currently attempts to write to user memory within
> a pagefault disabled section, and if that fails, tries to fix it
> up using get_user_pages().
>
> This doesn't work on archs where the dirty and young bits are
> maintained by software, since they will gate access permission
> in the TLB, and will not be updated by gup().
>
> In addition, there's an expectation on some archs that a
> spurious write fault triggers a local TLB flush, and that is
> missing from the picture as well.
>
> I decided that adding those "features" to gup() would be too much
> for this already too complex function, and instead added a new
> simpler fixup_user_fault() which is essentially a wrapper around
> handle_mm_fault() which the futex code can call.
>
> Signed-off-by: Benjamin Herrenschmidt<benh@kernel.crashing.org>
> ---
>
> Shan, can you test this ? It might not fix the problem since I'm
> starting to have the nasty feeling that you are hitting what is
> somewhat a subtly different issue or my previous patch should
> have worked (but then I might have done a stupid mistake as well)
> but let us know anyway.
>

Ok, I will test the patch, I think this should work, because
it's similar to my first posted patch, the difference is that
I tried to do it in the futex_atomic_cmpxchg_inatomic() in
the ppc specific path, lower level than yours as in
fault_in_user_writeable() :-)

Anyway, I will notify you on the test result.

Thanks
Shan Hai

> Cheers,
> Ben.
>
> diff --git a/include/linux/mm.h b/include/linux/mm.h
> index 9670f71..1036614 100644
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -985,6 +985,8 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
>   int get_user_pages_fast(unsigned long start, int nr_pages, int write,
>   			struct page **pages);
>   struct page *get_dump_page(unsigned long addr);
> +extern int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm,
> +			    unsigned long address, unsigned int fault_flags);
>
>   extern int try_to_release_page(struct page * page, gfp_t gfp_mask);
>   extern void do_invalidatepage(struct page *page, unsigned long offset);
> diff --git a/kernel/futex.c b/kernel/futex.c
> index fe28dc2..7a0a4ed 100644
> --- a/kernel/futex.c
> +++ b/kernel/futex.c
> @@ -355,8 +355,8 @@ static int fault_in_user_writeable(u32 __user *uaddr)
>   	int ret;
>
>   	down_read(&mm->mmap_sem);
> -	ret = get_user_pages(current, mm, (unsigned long)uaddr,
> -			     1, 1, 0, NULL, NULL);
> +	ret = fixup_user_fault(current, mm, (unsigned long)uaddr,
> +			       FAULT_FLAG_WRITE);
>   	up_read(&mm->mmap_sem);
>
>   	return ret<  0 ? ret : 0;
> diff --git a/mm/memory.c b/mm/memory.c
> index 40b7531..b967fb0 100644
> --- a/mm/memory.c
> +++ b/mm/memory.c
> @@ -1815,7 +1815,64 @@ next_page:
>   }
>   EXPORT_SYMBOL(__get_user_pages);
>
> -/**
> +/*
> + * fixup_user_fault() - manually resolve a user page  fault
> + * @tsk:	the task_struct to use for page fault accounting, or
> + *		NULL if faults are not to be recorded.
> + * @mm:		mm_struct of target mm
> + * @address:	user address
> + * @fault_flags:flags to pass down to handle_mm_fault()
> + *
> + * This is meant to be called in the specific scenario where for
> + * locking reasons we try to access user memory in atomic context
> + * (within a pagefault_disable() section), this returns -EFAULT,
> + * and we want to resolve the user fault before trying again.
> + *
> + * Typically this is meant to be used by the futex code.
> + *
> + * The main difference with get_user_pages() is that this function
> + * will unconditionally call handle_mm_fault() which will in turn
> + * perform all the necessary SW fixup of the dirty and young bits
> + * in the PTE, while handle_mm_fault() only guarantees to update
> + * these in the struct page.
> + *
> + * This is important for some architectures where those bits also
> + * gate the access permission to the page because their are
> + * maintained in software. On such architecture, gup() will not
> + * be enough to make a subsequent access succeed.
> + *
> + * This should be called with the mm_sem held for read.
> + */
> +int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm,
> +		     unsigned long address, unsigned int fault_flags)
> +{
> +	struct vm_area_struct *vma;
> +	int ret;
> +
> +	vma = find_extend_vma(mm, address);
> +	if (!vma || address<  vma->vm_start)
> +		return -EFAULT;
> +	
> +	ret = handle_mm_fault(mm, vma, address, fault_flags);
> +	if (ret&  VM_FAULT_ERROR) {
> +		if (ret&  VM_FAULT_OOM)
> +			return -ENOMEM;
> +		if (ret&  (VM_FAULT_HWPOISON | VM_FAULT_HWPOISON_LARGE))
> +			return -EHWPOISON;
> +		if (ret&  VM_FAULT_SIGBUS)
> +			return -EFAULT;
> +		BUG();
> +	}
> +	if (tsk) {
> +		if (ret&  VM_FAULT_MAJOR)
> +			tsk->maj_flt++;
> +		else
> +			tsk->min_flt++;
> +	}
> +	return 0;
> +}
> +
> +/*
>    * get_user_pages() - pin user pages in memory
>    * @tsk:	the task_struct to use for page fault accounting, or
>    *		NULL if faults are not to be recorded.
>
>


^ permalink raw reply	[flat|nested] 138+ messages in thread

* Re: [RFC/PATCH] mm/futex: Fix futex writes on archs with SW tracking of dirty & young
@ 2011-07-19  4:55                               ` Shan Hai
  0 siblings, 0 replies; 138+ messages in thread
From: Shan Hai @ 2011-07-19  4:55 UTC (permalink / raw)
  To: Benjamin Herrenschmidt
  Cc: tony.luck, Peter Zijlstra, Peter Zijlstra, linux-kernel,
	cmetcalf, dhowells, paulus, tglx, walken, linuxppc-dev, akpm

On 07/19/2011 12:29 PM, Benjamin Herrenschmidt wrote:
> The futex code currently attempts to write to user memory within
> a pagefault disabled section, and if that fails, tries to fix it
> up using get_user_pages().
>
> This doesn't work on archs where the dirty and young bits are
> maintained by software, since they will gate access permission
> in the TLB, and will not be updated by gup().
>
> In addition, there's an expectation on some archs that a
> spurious write fault triggers a local TLB flush, and that is
> missing from the picture as well.
>
> I decided that adding those "features" to gup() would be too much
> for this already too complex function, and instead added a new
> simpler fixup_user_fault() which is essentially a wrapper around
> handle_mm_fault() which the futex code can call.
>
> Signed-off-by: Benjamin Herrenschmidt<benh@kernel.crashing.org>
> ---
>
> Shan, can you test this ? It might not fix the problem since I'm
> starting to have the nasty feeling that you are hitting what is
> somewhat a subtly different issue or my previous patch should
> have worked (but then I might have done a stupid mistake as well)
> but let us know anyway.
>

Ok, I will test the patch, I think this should work, because
it's similar to my first posted patch, the difference is that
I tried to do it in the futex_atomic_cmpxchg_inatomic() in
the ppc specific path, lower level than yours as in
fault_in_user_writeable() :-)

Anyway, I will notify you on the test result.

Thanks
Shan Hai

> Cheers,
> Ben.
>
> diff --git a/include/linux/mm.h b/include/linux/mm.h
> index 9670f71..1036614 100644
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -985,6 +985,8 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
>   int get_user_pages_fast(unsigned long start, int nr_pages, int write,
>   			struct page **pages);
>   struct page *get_dump_page(unsigned long addr);
> +extern int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm,
> +			    unsigned long address, unsigned int fault_flags);
>
>   extern int try_to_release_page(struct page * page, gfp_t gfp_mask);
>   extern void do_invalidatepage(struct page *page, unsigned long offset);
> diff --git a/kernel/futex.c b/kernel/futex.c
> index fe28dc2..7a0a4ed 100644
> --- a/kernel/futex.c
> +++ b/kernel/futex.c
> @@ -355,8 +355,8 @@ static int fault_in_user_writeable(u32 __user *uaddr)
>   	int ret;
>
>   	down_read(&mm->mmap_sem);
> -	ret = get_user_pages(current, mm, (unsigned long)uaddr,
> -			     1, 1, 0, NULL, NULL);
> +	ret = fixup_user_fault(current, mm, (unsigned long)uaddr,
> +			       FAULT_FLAG_WRITE);
>   	up_read(&mm->mmap_sem);
>
>   	return ret<  0 ? ret : 0;
> diff --git a/mm/memory.c b/mm/memory.c
> index 40b7531..b967fb0 100644
> --- a/mm/memory.c
> +++ b/mm/memory.c
> @@ -1815,7 +1815,64 @@ next_page:
>   }
>   EXPORT_SYMBOL(__get_user_pages);
>
> -/**
> +/*
> + * fixup_user_fault() - manually resolve a user page  fault
> + * @tsk:	the task_struct to use for page fault accounting, or
> + *		NULL if faults are not to be recorded.
> + * @mm:		mm_struct of target mm
> + * @address:	user address
> + * @fault_flags:flags to pass down to handle_mm_fault()
> + *
> + * This is meant to be called in the specific scenario where for
> + * locking reasons we try to access user memory in atomic context
> + * (within a pagefault_disable() section), this returns -EFAULT,
> + * and we want to resolve the user fault before trying again.
> + *
> + * Typically this is meant to be used by the futex code.
> + *
> + * The main difference with get_user_pages() is that this function
> + * will unconditionally call handle_mm_fault() which will in turn
> + * perform all the necessary SW fixup of the dirty and young bits
> + * in the PTE, while handle_mm_fault() only guarantees to update
> + * these in the struct page.
> + *
> + * This is important for some architectures where those bits also
> + * gate the access permission to the page because their are
> + * maintained in software. On such architecture, gup() will not
> + * be enough to make a subsequent access succeed.
> + *
> + * This should be called with the mm_sem held for read.
> + */
> +int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm,
> +		     unsigned long address, unsigned int fault_flags)
> +{
> +	struct vm_area_struct *vma;
> +	int ret;
> +
> +	vma = find_extend_vma(mm, address);
> +	if (!vma || address<  vma->vm_start)
> +		return -EFAULT;
> +	
> +	ret = handle_mm_fault(mm, vma, address, fault_flags);
> +	if (ret&  VM_FAULT_ERROR) {
> +		if (ret&  VM_FAULT_OOM)
> +			return -ENOMEM;
> +		if (ret&  (VM_FAULT_HWPOISON | VM_FAULT_HWPOISON_LARGE))
> +			return -EHWPOISON;
> +		if (ret&  VM_FAULT_SIGBUS)
> +			return -EFAULT;
> +		BUG();
> +	}
> +	if (tsk) {
> +		if (ret&  VM_FAULT_MAJOR)
> +			tsk->maj_flt++;
> +		else
> +			tsk->min_flt++;
> +	}
> +	return 0;
> +}
> +
> +/*
>    * get_user_pages() - pin user pages in memory
>    * @tsk:	the task_struct to use for page fault accounting, or
>    *		NULL if faults are not to be recorded.
>
>

^ permalink raw reply	[flat|nested] 138+ messages in thread

* Re: [RFC/PATCH] mm/futex: Fix futex writes on archs with SW tracking of dirty & young
  2011-07-19  4:29                             ` Benjamin Herrenschmidt
@ 2011-07-19  5:17                               ` Shan Hai
  -1 siblings, 0 replies; 138+ messages in thread
From: Shan Hai @ 2011-07-19  5:17 UTC (permalink / raw)
  To: Benjamin Herrenschmidt
  Cc: Peter Zijlstra, Peter Zijlstra, paulus, tglx, walken, dhowells,
	cmetcalf, tony.luck, akpm, linuxppc-dev, linux-kernel

On 07/19/2011 12:29 PM, Benjamin Herrenschmidt wrote:
> The futex code currently attempts to write to user memory within
> a pagefault disabled section, and if that fails, tries to fix it
> up using get_user_pages().
>
> This doesn't work on archs where the dirty and young bits are
> maintained by software, since they will gate access permission
> in the TLB, and will not be updated by gup().
>
> In addition, there's an expectation on some archs that a
> spurious write fault triggers a local TLB flush, and that is
> missing from the picture as well.
>
> I decided that adding those "features" to gup() would be too much
> for this already too complex function, and instead added a new
> simpler fixup_user_fault() which is essentially a wrapper around
> handle_mm_fault() which the futex code can call.
>
> Signed-off-by: Benjamin Herrenschmidt<benh@kernel.crashing.org>
> ---
>
> Shan, can you test this ? It might not fix the problem since I'm
> starting to have the nasty feeling that you are hitting what is
> somewhat a subtly different issue or my previous patch should
> have worked (but then I might have done a stupid mistake as well)
> but let us know anyway.
>

The patch works, but I have a couple of questions:
- Do we want to call handle_mm_fault() on each futex_lock_pi
     even though in most cases no write permission
     fixup is needed?
- How about letting the archs do their own write permission
     fixup, as I did in my original
     "[PATCH 1/1] Fixup write permission of TLB on powerpc e500 core"?
     (I will fix the stupid errors in my original patch if the concept
     is acceptable.)
     In this way we could decrease the overhead of handle_mm_fault()
     in the path which does not need write permission fixup.

Thanks
Shan Hai
> Cheers,
> Ben.
>
> diff --git a/include/linux/mm.h b/include/linux/mm.h
> index 9670f71..1036614 100644
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -985,6 +985,8 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
>   int get_user_pages_fast(unsigned long start, int nr_pages, int write,
>   			struct page **pages);
>   struct page *get_dump_page(unsigned long addr);
> +extern int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm,
> +			    unsigned long address, unsigned int fault_flags);
>
>   extern int try_to_release_page(struct page * page, gfp_t gfp_mask);
>   extern void do_invalidatepage(struct page *page, unsigned long offset);
> diff --git a/kernel/futex.c b/kernel/futex.c
> index fe28dc2..7a0a4ed 100644
> --- a/kernel/futex.c
> +++ b/kernel/futex.c
> @@ -355,8 +355,8 @@ static int fault_in_user_writeable(u32 __user *uaddr)
>   	int ret;
>
>   	down_read(&mm->mmap_sem);
> -	ret = get_user_pages(current, mm, (unsigned long)uaddr,
> -			     1, 1, 0, NULL, NULL);
> +	ret = fixup_user_fault(current, mm, (unsigned long)uaddr,
> +			       FAULT_FLAG_WRITE);
>   	up_read(&mm->mmap_sem);
>
>   	return ret<  0 ? ret : 0;
> diff --git a/mm/memory.c b/mm/memory.c
> index 40b7531..b967fb0 100644
> --- a/mm/memory.c
> +++ b/mm/memory.c
> @@ -1815,7 +1815,64 @@ next_page:
>   }
>   EXPORT_SYMBOL(__get_user_pages);
>
> -/**
> +/*
> + * fixup_user_fault() - manually resolve a user page  fault
> + * @tsk:	the task_struct to use for page fault accounting, or
> + *		NULL if faults are not to be recorded.
> + * @mm:		mm_struct of target mm
> + * @address:	user address
> + * @fault_flags:flags to pass down to handle_mm_fault()
> + *
> + * This is meant to be called in the specific scenario where for
> + * locking reasons we try to access user memory in atomic context
> + * (within a pagefault_disable() section), this returns -EFAULT,
> + * and we want to resolve the user fault before trying again.
> + *
> + * Typically this is meant to be used by the futex code.
> + *
> + * The main difference with get_user_pages() is that this function
> + * will unconditionally call handle_mm_fault() which will in turn
> + * perform all the necessary SW fixup of the dirty and young bits
> + * in the PTE, while handle_mm_fault() only guarantees to update
> + * these in the struct page.
> + *
> + * This is important for some architectures where those bits also
> + * gate the access permission to the page because their are
> + * maintained in software. On such architecture, gup() will not
> + * be enough to make a subsequent access succeed.
> + *
> + * This should be called with the mm_sem held for read.
> + */
> +int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm,
> +		     unsigned long address, unsigned int fault_flags)
> +{
> +	struct vm_area_struct *vma;
> +	int ret;
> +
> +	vma = find_extend_vma(mm, address);
> +	if (!vma || address<  vma->vm_start)
> +		return -EFAULT;
> +	
> +	ret = handle_mm_fault(mm, vma, address, fault_flags);
> +	if (ret&  VM_FAULT_ERROR) {
> +		if (ret&  VM_FAULT_OOM)
> +			return -ENOMEM;
> +		if (ret&  (VM_FAULT_HWPOISON | VM_FAULT_HWPOISON_LARGE))
> +			return -EHWPOISON;
> +		if (ret&  VM_FAULT_SIGBUS)
> +			return -EFAULT;
> +		BUG();
> +	}
> +	if (tsk) {
> +		if (ret&  VM_FAULT_MAJOR)
> +			tsk->maj_flt++;
> +		else
> +			tsk->min_flt++;
> +	}
> +	return 0;
> +}
> +
> +/*
>    * get_user_pages() - pin user pages in memory
>    * @tsk:	the task_struct to use for page fault accounting, or
>    *		NULL if faults are not to be recorded.
>
>


^ permalink raw reply	[flat|nested] 138+ messages in thread

* Re: [RFC/PATCH] mm/futex: Fix futex writes on archs with SW tracking of dirty & young
@ 2011-07-19  5:17                               ` Shan Hai
  0 siblings, 0 replies; 138+ messages in thread
From: Shan Hai @ 2011-07-19  5:17 UTC (permalink / raw)
  To: Benjamin Herrenschmidt
  Cc: tony.luck, Peter Zijlstra, Peter Zijlstra, linux-kernel,
	cmetcalf, dhowells, paulus, tglx, walken, linuxppc-dev, akpm

On 07/19/2011 12:29 PM, Benjamin Herrenschmidt wrote:
> The futex code currently attempts to write to user memory within
> a pagefault disabled section, and if that fails, tries to fix it
> up using get_user_pages().
>
> This doesn't work on archs where the dirty and young bits are
> maintained by software, since they will gate access permission
> in the TLB, and will not be updated by gup().
>
> In addition, there's an expectation on some archs that a
> spurious write fault triggers a local TLB flush, and that is
> missing from the picture as well.
>
> I decided that adding those "features" to gup() would be too much
> for this already too complex function, and instead added a new
> simpler fixup_user_fault() which is essentially a wrapper around
> handle_mm_fault() which the futex code can call.
>
> Signed-off-by: Benjamin Herrenschmidt<benh@kernel.crashing.org>
> ---
>
> Shan, can you test this ? It might not fix the problem since I'm
> starting to have the nasty feeling that you are hitting what is
> somewhat a subtly different issue or my previous patch should
> have worked (but then I might have done a stupid mistake as well)
> but let us know anyway.
>

The patch works, but a few things are unclear to me:
- Do we want to call handle_mm_fault() on each futex_lock_pi()
     even though in most cases no write permission
     fixup is needed?
- How about letting the archs do their own write permission
     fixup, as I did in my original
     "[PATCH 1/1] Fixup write permission of TLB on powerpc e500 core"?
     (I will fix the stupid errors in my original patch if the concept
is acceptable.)
     That way we could avoid the overhead of handle_mm_fault()
     in the path which does not need a write permission fixup.

Thanks
Shan Hai
> Cheers,
> Ben.
>
> diff --git a/include/linux/mm.h b/include/linux/mm.h
> index 9670f71..1036614 100644
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -985,6 +985,8 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
>   int get_user_pages_fast(unsigned long start, int nr_pages, int write,
>   			struct page **pages);
>   struct page *get_dump_page(unsigned long addr);
> +extern int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm,
> +			    unsigned long address, unsigned int fault_flags);
>
>   extern int try_to_release_page(struct page * page, gfp_t gfp_mask);
>   extern void do_invalidatepage(struct page *page, unsigned long offset);
> diff --git a/kernel/futex.c b/kernel/futex.c
> index fe28dc2..7a0a4ed 100644
> --- a/kernel/futex.c
> +++ b/kernel/futex.c
> @@ -355,8 +355,8 @@ static int fault_in_user_writeable(u32 __user *uaddr)
>   	int ret;
>
>   	down_read(&mm->mmap_sem);
> -	ret = get_user_pages(current, mm, (unsigned long)uaddr,
> -			     1, 1, 0, NULL, NULL);
> +	ret = fixup_user_fault(current, mm, (unsigned long)uaddr,
> +			       FAULT_FLAG_WRITE);
>   	up_read(&mm->mmap_sem);
>
>   	return ret<  0 ? ret : 0;
> diff --git a/mm/memory.c b/mm/memory.c
> index 40b7531..b967fb0 100644
> --- a/mm/memory.c
> +++ b/mm/memory.c
> @@ -1815,7 +1815,64 @@ next_page:
>   }
>   EXPORT_SYMBOL(__get_user_pages);
>
> -/**
> +/*
> + * fixup_user_fault() - manually resolve a user page  fault
> + * @tsk:	the task_struct to use for page fault accounting, or
> + *		NULL if faults are not to be recorded.
> + * @mm:		mm_struct of target mm
> + * @address:	user address
> + * @fault_flags:flags to pass down to handle_mm_fault()
> + *
> + * This is meant to be called in the specific scenario where for
> + * locking reasons we try to access user memory in atomic context
> + * (within a pagefault_disable() section), this returns -EFAULT,
> + * and we want to resolve the user fault before trying again.
> + *
> + * Typically this is meant to be used by the futex code.
> + *
> + * The main difference with get_user_pages() is that this function
> + * will unconditionally call handle_mm_fault() which will in turn
> + * perform all the necessary SW fixup of the dirty and young bits
> + * in the PTE, while handle_mm_fault() only guarantees to update
> + * these in the struct page.
> + *
> + * This is important for some architectures where those bits also
> + * gate the access permission to the page because their are
> + * maintained in software. On such architecture, gup() will not
> + * be enough to make a subsequent access succeed.
> + *
> + * This should be called with the mm_sem held for read.
> + */
> +int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm,
> +		     unsigned long address, unsigned int fault_flags)
> +{
> +	struct vm_area_struct *vma;
> +	int ret;
> +
> +	vma = find_extend_vma(mm, address);
> +	if (!vma || address<  vma->vm_start)
> +		return -EFAULT;
> +	
> +	ret = handle_mm_fault(mm, vma, address, fault_flags);
> +	if (ret&  VM_FAULT_ERROR) {
> +		if (ret&  VM_FAULT_OOM)
> +			return -ENOMEM;
> +		if (ret&  (VM_FAULT_HWPOISON | VM_FAULT_HWPOISON_LARGE))
> +			return -EHWPOISON;
> +		if (ret&  VM_FAULT_SIGBUS)
> +			return -EFAULT;
> +		BUG();
> +	}
> +	if (tsk) {
> +		if (ret&  VM_FAULT_MAJOR)
> +			tsk->maj_flt++;
> +		else
> +			tsk->min_flt++;
> +	}
> +	return 0;
> +}
> +
> +/*
>    * get_user_pages() - pin user pages in memory
>    * @tsk:	the task_struct to use for page fault accounting, or
>    *		NULL if faults are not to be recorded.
>
>

^ permalink raw reply	[flat|nested] 138+ messages in thread

* Re: [RFC/PATCH] mm/futex: Fix futex writes on archs with SW tracking of dirty & young
  2011-07-19  5:17                               ` Shan Hai
@ 2011-07-19  5:24                                 ` Benjamin Herrenschmidt
  -1 siblings, 0 replies; 138+ messages in thread
From: Benjamin Herrenschmidt @ 2011-07-19  5:24 UTC (permalink / raw)
  To: Shan Hai
  Cc: Peter Zijlstra, Peter Zijlstra, paulus, tglx, walken, dhowells,
	cmetcalf, tony.luck, akpm, linuxppc-dev, linux-kernel

On Tue, 2011-07-19 at 13:17 +0800, Shan Hai wrote:

> The patch works, but I have certain confusions,
> - Do we want to handle_mm_fault on each futex_lock_pi
>      even though in most cases there is no write permission
>      fixup's needed?

Don't we only ever call this when futex_atomic_op_inuser() failed ?
Which means a fixup -is- needed .... The fast path is still there.

> - How about let the archs do their own write permission
>      fixup as what I did in my original

Why ? This is generic and will fix all archs at once with generic code
which is a significant improvement in my book and a lot more
maintainable :-)

>      "[PATCH 1/1] Fixup write permission of TLB on powerpc e500 core"?
>      (I will fix the stupid errors in my original patch if the concept 
> is acceptable)
>      in this way we could decrease the overhead of handle_mm_fault
>      in the path which does not need write permission fixup.

Which overhead ? gup does handle_mm_fault() as well if needed.

What I do is I replace what is arguably an abuse of gup() in the case
where a fixup -is- needed with a dedicated function designed to perform
the said fixup ... and do it properly which gup() didn't :-)

Cheers,
Ben.

> Thanks
> Shan Hai
> > Cheers,
> > Ben.
> >
> > diff --git a/include/linux/mm.h b/include/linux/mm.h
> > index 9670f71..1036614 100644
> > --- a/include/linux/mm.h
> > +++ b/include/linux/mm.h
> > @@ -985,6 +985,8 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
> >   int get_user_pages_fast(unsigned long start, int nr_pages, int write,
> >   			struct page **pages);
> >   struct page *get_dump_page(unsigned long addr);
> > +extern int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm,
> > +			    unsigned long address, unsigned int fault_flags);
> >
> >   extern int try_to_release_page(struct page * page, gfp_t gfp_mask);
> >   extern void do_invalidatepage(struct page *page, unsigned long offset);
> > diff --git a/kernel/futex.c b/kernel/futex.c
> > index fe28dc2..7a0a4ed 100644
> > --- a/kernel/futex.c
> > +++ b/kernel/futex.c
> > @@ -355,8 +355,8 @@ static int fault_in_user_writeable(u32 __user *uaddr)
> >   	int ret;
> >
> >   	down_read(&mm->mmap_sem);
> > -	ret = get_user_pages(current, mm, (unsigned long)uaddr,
> > -			     1, 1, 0, NULL, NULL);
> > +	ret = fixup_user_fault(current, mm, (unsigned long)uaddr,
> > +			       FAULT_FLAG_WRITE);
> >   	up_read(&mm->mmap_sem);
> >
> >   	return ret<  0 ? ret : 0;
> > diff --git a/mm/memory.c b/mm/memory.c
> > index 40b7531..b967fb0 100644
> > --- a/mm/memory.c
> > +++ b/mm/memory.c
> > @@ -1815,7 +1815,64 @@ next_page:
> >   }
> >   EXPORT_SYMBOL(__get_user_pages);
> >
> > -/**
> > +/*
> > + * fixup_user_fault() - manually resolve a user page  fault
> > + * @tsk:	the task_struct to use for page fault accounting, or
> > + *		NULL if faults are not to be recorded.
> > + * @mm:		mm_struct of target mm
> > + * @address:	user address
> > + * @fault_flags:flags to pass down to handle_mm_fault()
> > + *
> > + * This is meant to be called in the specific scenario where for
> > + * locking reasons we try to access user memory in atomic context
> > + * (within a pagefault_disable() section), this returns -EFAULT,
> > + * and we want to resolve the user fault before trying again.
> > + *
> > + * Typically this is meant to be used by the futex code.
> > + *
> > + * The main difference with get_user_pages() is that this function
> > + * will unconditionally call handle_mm_fault() which will in turn
> > + * perform all the necessary SW fixup of the dirty and young bits
> > + * in the PTE, while handle_mm_fault() only guarantees to update
> > + * these in the struct page.
> > + *
> > + * This is important for some architectures where those bits also
> > + * gate the access permission to the page because their are
> > + * maintained in software. On such architecture, gup() will not
> > + * be enough to make a subsequent access succeed.
> > + *
> > + * This should be called with the mm_sem held for read.
> > + */
> > +int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm,
> > +		     unsigned long address, unsigned int fault_flags)
> > +{
> > +	struct vm_area_struct *vma;
> > +	int ret;
> > +
> > +	vma = find_extend_vma(mm, address);
> > +	if (!vma || address<  vma->vm_start)
> > +		return -EFAULT;
> > +	
> > +	ret = handle_mm_fault(mm, vma, address, fault_flags);
> > +	if (ret&  VM_FAULT_ERROR) {
> > +		if (ret&  VM_FAULT_OOM)
> > +			return -ENOMEM;
> > +		if (ret&  (VM_FAULT_HWPOISON | VM_FAULT_HWPOISON_LARGE))
> > +			return -EHWPOISON;
> > +		if (ret&  VM_FAULT_SIGBUS)
> > +			return -EFAULT;
> > +		BUG();
> > +	}
> > +	if (tsk) {
> > +		if (ret&  VM_FAULT_MAJOR)
> > +			tsk->maj_flt++;
> > +		else
> > +			tsk->min_flt++;
> > +	}
> > +	return 0;
> > +}
> > +
> > +/*
> >    * get_user_pages() - pin user pages in memory
> >    * @tsk:	the task_struct to use for page fault accounting, or
> >    *		NULL if faults are not to be recorded.
> >
> >
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at  http://www.tux.org/lkml/



^ permalink raw reply	[flat|nested] 138+ messages in thread

* Re: [RFC/PATCH] mm/futex: Fix futex writes on archs with SW tracking of dirty & young
@ 2011-07-19  5:24                                 ` Benjamin Herrenschmidt
  0 siblings, 0 replies; 138+ messages in thread
From: Benjamin Herrenschmidt @ 2011-07-19  5:24 UTC (permalink / raw)
  To: Shan Hai
  Cc: tony.luck, Peter Zijlstra, Peter Zijlstra, linux-kernel,
	cmetcalf, dhowells, paulus, tglx, walken, linuxppc-dev, akpm

On Tue, 2011-07-19 at 13:17 +0800, Shan Hai wrote:

> The patch works, but I have certain confusions,
> - Do we want to handle_mm_fault on each futex_lock_pi
>      even though in most cases there is no write permission
>      fixup's needed?

Don't we only ever call this when futex_atomic_op_inuser() failed ?
Which means a fixup -is- needed .... The fast path is still there.

> - How about let the archs do their own write permission
>      fixup as what I did in my original

Why ? This is generic and will fix all archs at once with generic code
which is a significant improvement in my book and a lot more
maintainable :-)

>      "[PATCH 1/1] Fixup write permission of TLB on powerpc e500 core"?
>      (I will fix the stupid errors in my original patch if the concept 
> is acceptable)
>      in this way we could decrease the overhead of handle_mm_fault
>      in the path which does not need write permission fixup.

Which overhead ? gup does handle_mm_fault() as well if needed.

What I do is I replace what is arguably an abuse of gup() in the case
where a fixup -is- needed with a dedicated function designed to perform
the said fixup ... and do it properly which gup() didn't :-)

Cheers,
Ben.

> Thanks
> Shan Hai
> > Cheers,
> > Ben.
> >
> > diff --git a/include/linux/mm.h b/include/linux/mm.h
> > index 9670f71..1036614 100644
> > --- a/include/linux/mm.h
> > +++ b/include/linux/mm.h
> > @@ -985,6 +985,8 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
> >   int get_user_pages_fast(unsigned long start, int nr_pages, int write,
> >   			struct page **pages);
> >   struct page *get_dump_page(unsigned long addr);
> > +extern int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm,
> > +			    unsigned long address, unsigned int fault_flags);
> >
> >   extern int try_to_release_page(struct page * page, gfp_t gfp_mask);
> >   extern void do_invalidatepage(struct page *page, unsigned long offset);
> > diff --git a/kernel/futex.c b/kernel/futex.c
> > index fe28dc2..7a0a4ed 100644
> > --- a/kernel/futex.c
> > +++ b/kernel/futex.c
> > @@ -355,8 +355,8 @@ static int fault_in_user_writeable(u32 __user *uaddr)
> >   	int ret;
> >
> >   	down_read(&mm->mmap_sem);
> > -	ret = get_user_pages(current, mm, (unsigned long)uaddr,
> > -			     1, 1, 0, NULL, NULL);
> > +	ret = fixup_user_fault(current, mm, (unsigned long)uaddr,
> > +			       FAULT_FLAG_WRITE);
> >   	up_read(&mm->mmap_sem);
> >
> >   	return ret<  0 ? ret : 0;
> > diff --git a/mm/memory.c b/mm/memory.c
> > index 40b7531..b967fb0 100644
> > --- a/mm/memory.c
> > +++ b/mm/memory.c
> > @@ -1815,7 +1815,64 @@ next_page:
> >   }
> >   EXPORT_SYMBOL(__get_user_pages);
> >
> > -/**
> > +/*
> > + * fixup_user_fault() - manually resolve a user page  fault
> > + * @tsk:	the task_struct to use for page fault accounting, or
> > + *		NULL if faults are not to be recorded.
> > + * @mm:		mm_struct of target mm
> > + * @address:	user address
> > + * @fault_flags:flags to pass down to handle_mm_fault()
> > + *
> > + * This is meant to be called in the specific scenario where for
> > + * locking reasons we try to access user memory in atomic context
> > + * (within a pagefault_disable() section), this returns -EFAULT,
> > + * and we want to resolve the user fault before trying again.
> > + *
> > + * Typically this is meant to be used by the futex code.
> > + *
> > + * The main difference with get_user_pages() is that this function
> > + * will unconditionally call handle_mm_fault() which will in turn
> > + * perform all the necessary SW fixup of the dirty and young bits
> > + * in the PTE, while handle_mm_fault() only guarantees to update
> > + * these in the struct page.
> > + *
> > + * This is important for some architectures where those bits also
> > + * gate the access permission to the page because their are
> > + * maintained in software. On such architecture, gup() will not
> > + * be enough to make a subsequent access succeed.
> > + *
> > + * This should be called with the mm_sem held for read.
> > + */
> > +int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm,
> > +		     unsigned long address, unsigned int fault_flags)
> > +{
> > +	struct vm_area_struct *vma;
> > +	int ret;
> > +
> > +	vma = find_extend_vma(mm, address);
> > +	if (!vma || address<  vma->vm_start)
> > +		return -EFAULT;
> > +	
> > +	ret = handle_mm_fault(mm, vma, address, fault_flags);
> > +	if (ret&  VM_FAULT_ERROR) {
> > +		if (ret&  VM_FAULT_OOM)
> > +			return -ENOMEM;
> > +		if (ret&  (VM_FAULT_HWPOISON | VM_FAULT_HWPOISON_LARGE))
> > +			return -EHWPOISON;
> > +		if (ret&  VM_FAULT_SIGBUS)
> > +			return -EFAULT;
> > +		BUG();
> > +	}
> > +	if (tsk) {
> > +		if (ret&  VM_FAULT_MAJOR)
> > +			tsk->maj_flt++;
> > +		else
> > +			tsk->min_flt++;
> > +	}
> > +	return 0;
> > +}
> > +
> > +/*
> >    * get_user_pages() - pin user pages in memory
> >    * @tsk:	the task_struct to use for page fault accounting, or
> >    *		NULL if faults are not to be recorded.
> >
> >
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at  http://www.tux.org/lkml/

^ permalink raw reply	[flat|nested] 138+ messages in thread

* Re: [RFC/PATCH] mm/futex: Fix futex writes on archs with SW tracking of dirty & young
  2011-07-19  5:24                                 ` Benjamin Herrenschmidt
@ 2011-07-19  5:38                                   ` Shan Hai
  -1 siblings, 0 replies; 138+ messages in thread
From: Shan Hai @ 2011-07-19  5:38 UTC (permalink / raw)
  To: Benjamin Herrenschmidt
  Cc: Peter Zijlstra, Peter Zijlstra, paulus, tglx, walken, dhowells,
	cmetcalf, tony.luck, akpm, linuxppc-dev, linux-kernel

On 07/19/2011 01:24 PM, Benjamin Herrenschmidt wrote:
> On Tue, 2011-07-19 at 13:17 +0800, Shan Hai wrote:
>
>> The patch works, but I have certain confusions,
>> - Do we want to handle_mm_fault on each futex_lock_pi
>>       even though in most cases there is no write permission
>>       fixup's needed?
> Don't we only ever call this when futex_atomic_op_inuser() failed ?
> Which means a fixup -is- needed .... The fast path is still there.
>

What you describe is a different path, namely futex_wake_op(),
but what about futex_lock_pi(), in which my test case failed?
Your patch will call handle_mm_fault() on every futex contention
in the futex_lock_pi() path.

futex_lock_pi()
     ret = futex_lock_pi_atomic(uaddr, hb, &q.key, &q.pi_state, current, 0);
         case -EFAULT:
                         goto uaddr_faulted;

     ...
uaddr_faulted:
     ret = fault_in_user_writeable(uaddr);


>> - How about let the archs do their own write permission
>>       fixup as what I did in my original
> Why ? This is generic and will fix all archs at once with generic code
> which is a significant improvement in my book and a lot more
> maintainable :-)
>

If the overhead in the futex_lock_pi() path is not considerable, then yes,
fixing it up generically is nice :-)

>>       "[PATCH 1/1] Fixup write permission of TLB on powerpc e500 core"?
>>       (I will fix the stupid errors in my original patch if the concept
>> is acceptable)
>>       in this way we could decrease the overhead of handle_mm_fault
>>       in the path which does not need write permission fixup.
> Which overhead ? gup does handle_mm_fault() as well if needed.

it does it *if needed*, and this requirement is rare in my opinion.


Thanks
Shan Hai

> What I do is I replace what is arguably an abuse of gup() in the case
> where a fixup -is- needed with a dedicated function designed to perform
> the said fixup ... and do it properly which gup() didn't :-)
>
> Cheers,
> Ben.
>
>> Thanks
>> Shan Hai
>>> Cheers,
>>> Ben.
>>>
>>> diff --git a/include/linux/mm.h b/include/linux/mm.h
>>> index 9670f71..1036614 100644
>>> --- a/include/linux/mm.h
>>> +++ b/include/linux/mm.h
>>> @@ -985,6 +985,8 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
>>>    int get_user_pages_fast(unsigned long start, int nr_pages, int write,
>>>    			struct page **pages);
>>>    struct page *get_dump_page(unsigned long addr);
>>> +extern int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm,
>>> +			    unsigned long address, unsigned int fault_flags);
>>>
>>>    extern int try_to_release_page(struct page * page, gfp_t gfp_mask);
>>>    extern void do_invalidatepage(struct page *page, unsigned long offset);
>>> diff --git a/kernel/futex.c b/kernel/futex.c
>>> index fe28dc2..7a0a4ed 100644
>>> --- a/kernel/futex.c
>>> +++ b/kernel/futex.c
>>> @@ -355,8 +355,8 @@ static int fault_in_user_writeable(u32 __user *uaddr)
>>>    	int ret;
>>>
>>>    	down_read(&mm->mmap_sem);
>>> -	ret = get_user_pages(current, mm, (unsigned long)uaddr,
>>> -			     1, 1, 0, NULL, NULL);
>>> +	ret = fixup_user_fault(current, mm, (unsigned long)uaddr,
>>> +			       FAULT_FLAG_WRITE);
>>>    	up_read(&mm->mmap_sem);
>>>
>>>    	return ret<   0 ? ret : 0;
>>> diff --git a/mm/memory.c b/mm/memory.c
>>> index 40b7531..b967fb0 100644
>>> --- a/mm/memory.c
>>> +++ b/mm/memory.c
>>> @@ -1815,7 +1815,64 @@ next_page:
>>>    }
>>>    EXPORT_SYMBOL(__get_user_pages);
>>>
>>> -/**
>>> +/*
>>> + * fixup_user_fault() - manually resolve a user page  fault
>>> + * @tsk:	the task_struct to use for page fault accounting, or
>>> + *		NULL if faults are not to be recorded.
>>> + * @mm:		mm_struct of target mm
>>> + * @address:	user address
>>> + * @fault_flags:flags to pass down to handle_mm_fault()
>>> + *
>>> + * This is meant to be called in the specific scenario where for
>>> + * locking reasons we try to access user memory in atomic context
>>> + * (within a pagefault_disable() section), this returns -EFAULT,
>>> + * and we want to resolve the user fault before trying again.
>>> + *
>>> + * Typically this is meant to be used by the futex code.
>>> + *
>>> + * The main difference with get_user_pages() is that this function
>>> + * will unconditionally call handle_mm_fault() which will in turn
>>> + * perform all the necessary SW fixup of the dirty and young bits
>>> + * in the PTE, while handle_mm_fault() only guarantees to update
>>> + * these in the struct page.
>>> + *
>>> + * This is important for some architectures where those bits also
>>> + * gate the access permission to the page because their are
>>> + * maintained in software. On such architecture, gup() will not
>>> + * be enough to make a subsequent access succeed.
>>> + *
>>> + * This should be called with the mm_sem held for read.
>>> + */
>>> +int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm,
>>> +		     unsigned long address, unsigned int fault_flags)
>>> +{
>>> +	struct vm_area_struct *vma;
>>> +	int ret;
>>> +
>>> +	vma = find_extend_vma(mm, address);
>>> +	if (!vma || address<   vma->vm_start)
>>> +		return -EFAULT;
>>> +	
>>> +	ret = handle_mm_fault(mm, vma, address, fault_flags);
>>> +	if (ret&   VM_FAULT_ERROR) {
>>> +		if (ret&   VM_FAULT_OOM)
>>> +			return -ENOMEM;
>>> +		if (ret&   (VM_FAULT_HWPOISON | VM_FAULT_HWPOISON_LARGE))
>>> +			return -EHWPOISON;
>>> +		if (ret&   VM_FAULT_SIGBUS)
>>> +			return -EFAULT;
>>> +		BUG();
>>> +	}
>>> +	if (tsk) {
>>> +		if (ret&   VM_FAULT_MAJOR)
>>> +			tsk->maj_flt++;
>>> +		else
>>> +			tsk->min_flt++;
>>> +	}
>>> +	return 0;
>>> +}
>>> +
>>> +/*
>>>     * get_user_pages() - pin user pages in memory
>>>     * @tsk:	the task_struct to use for page fault accounting, or
>>>     *		NULL if faults are not to be recorded.
>>>
>>>
>> --
>> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
>> the body of a message to majordomo@vger.kernel.org
>> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>> Please read the FAQ at  http://www.tux.org/lkml/
>


^ permalink raw reply	[flat|nested] 138+ messages in thread

* Re: [RFC/PATCH] mm/futex: Fix futex writes on archs with SW tracking of dirty & young
@ 2011-07-19  5:38                                   ` Shan Hai
  0 siblings, 0 replies; 138+ messages in thread
From: Shan Hai @ 2011-07-19  5:38 UTC (permalink / raw)
  To: Benjamin Herrenschmidt
  Cc: tony.luck, Peter Zijlstra, Peter Zijlstra, linux-kernel,
	cmetcalf, dhowells, paulus, tglx, walken, linuxppc-dev, akpm

On 07/19/2011 01:24 PM, Benjamin Herrenschmidt wrote:
> On Tue, 2011-07-19 at 13:17 +0800, Shan Hai wrote:
>
>> The patch works, but I have certain confusions,
>> - Do we want to handle_mm_fault on each futex_lock_pi
>>       even though in most cases there is no write permission
>>       fixup's needed?
> Don't we only ever call this when futex_atomic_op_inuser() failed ?
> Which means a fixup -is- needed .... The fast path is still there.
>

What you describe is a different path, namely futex_wake_op(),
but what about futex_lock_pi(), in which my test case failed?
Your patch will call handle_mm_fault() on every futex contention
in the futex_lock_pi() path.

futex_lock_pi()
     ret = futex_lock_pi_atomic(uaddr, hb, &q.key, &q.pi_state, current, 0);
         case -EFAULT:
                         goto uaddr_faulted;

     ...
uaddr_faulted:
     ret = fault_in_user_writeable(uaddr);


>> - How about let the archs do their own write permission
>>       fixup as what I did in my original
> Why ? This is generic and will fix all archs at once with generic code
> which is a significant improvement in my book and a lot more
> maintainable :-)
>

If the overhead in the futex_lock_pi() path is not considerable, then yes,
fixing it up generically is nice :-)

>>       "[PATCH 1/1] Fixup write permission of TLB on powerpc e500 core"?
>>       (I will fix the stupid errors in my original patch if the concept
>> is acceptable)
>>       in this way we could decrease the overhead of handle_mm_fault
>>       in the path which does not need write permission fixup.
> Which overhead ? gup does handle_mm_fault() as well if needed.

it does it *if needed*, and this requirement is rare in my opinion.


Thanks
Shan Hai

> What I do is I replace what is arguably an abuse of gup() in the case
> where a fixup -is- needed with a dedicated function designed to perform
> the said fixup ... and do it properly which gup() didn't :-)
>
> Cheers,
> Ben.
>
>> Thanks
>> Shan Hai
>>> Cheers,
>>> Ben.
>>>
>>> diff --git a/include/linux/mm.h b/include/linux/mm.h
>>> index 9670f71..1036614 100644
>>> --- a/include/linux/mm.h
>>> +++ b/include/linux/mm.h
>>> @@ -985,6 +985,8 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
>>>    int get_user_pages_fast(unsigned long start, int nr_pages, int write,
>>>    			struct page **pages);
>>>    struct page *get_dump_page(unsigned long addr);
>>> +extern int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm,
>>> +			    unsigned long address, unsigned int fault_flags);
>>>
>>>    extern int try_to_release_page(struct page * page, gfp_t gfp_mask);
>>>    extern void do_invalidatepage(struct page *page, unsigned long offset);
>>> diff --git a/kernel/futex.c b/kernel/futex.c
>>> index fe28dc2..7a0a4ed 100644
>>> --- a/kernel/futex.c
>>> +++ b/kernel/futex.c
>>> @@ -355,8 +355,8 @@ static int fault_in_user_writeable(u32 __user *uaddr)
>>>    	int ret;
>>>
>>>    	down_read(&mm->mmap_sem);
>>> -	ret = get_user_pages(current, mm, (unsigned long)uaddr,
>>> -			     1, 1, 0, NULL, NULL);
>>> +	ret = fixup_user_fault(current, mm, (unsigned long)uaddr,
>>> +			       FAULT_FLAG_WRITE);
>>>    	up_read(&mm->mmap_sem);
>>>
>>>    	return ret<   0 ? ret : 0;
>>> diff --git a/mm/memory.c b/mm/memory.c
>>> index 40b7531..b967fb0 100644
>>> --- a/mm/memory.c
>>> +++ b/mm/memory.c
>>> @@ -1815,7 +1815,64 @@ next_page:
>>>    }
>>>    EXPORT_SYMBOL(__get_user_pages);
>>>
>>> -/**
>>> +/*
>>> + * fixup_user_fault() - manually resolve a user page  fault
>>> + * @tsk:	the task_struct to use for page fault accounting, or
>>> + *		NULL if faults are not to be recorded.
>>> + * @mm:		mm_struct of target mm
>>> + * @address:	user address
>>> + * @fault_flags:flags to pass down to handle_mm_fault()
>>> + *
>>> + * This is meant to be called in the specific scenario where for
>>> + * locking reasons we try to access user memory in atomic context
>>> + * (within a pagefault_disable() section), this returns -EFAULT,
>>> + * and we want to resolve the user fault before trying again.
>>> + *
>>> + * Typically this is meant to be used by the futex code.
>>> + *
>>> + * The main difference with get_user_pages() is that this function
>>> + * will unconditionally call handle_mm_fault() which will in turn
>>> + * perform all the necessary SW fixup of the dirty and young bits
>>> + * in the PTE, while handle_mm_fault() only guarantees to update
>>> + * these in the struct page.
>>> + *
>>> + * This is important for some architectures where those bits also
>>> + * gate the access permission to the page because their are
>>> + * maintained in software. On such architecture, gup() will not
>>> + * be enough to make a subsequent access succeed.
>>> + *
>>> + * This should be called with the mm_sem held for read.
>>> + */
>>> +int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm,
>>> +		     unsigned long address, unsigned int fault_flags)
>>> +{
>>> +	struct vm_area_struct *vma;
>>> +	int ret;
>>> +
>>> +	vma = find_extend_vma(mm, address);
>>> +	if (!vma || address<   vma->vm_start)
>>> +		return -EFAULT;
>>> +	
>>> +	ret = handle_mm_fault(mm, vma, address, fault_flags);
>>> +	if (ret&   VM_FAULT_ERROR) {
>>> +		if (ret&   VM_FAULT_OOM)
>>> +			return -ENOMEM;
>>> +		if (ret&   (VM_FAULT_HWPOISON | VM_FAULT_HWPOISON_LARGE))
>>> +			return -EHWPOISON;
>>> +		if (ret&   VM_FAULT_SIGBUS)
>>> +			return -EFAULT;
>>> +		BUG();
>>> +	}
>>> +	if (tsk) {
>>> +		if (ret&   VM_FAULT_MAJOR)
>>> +			tsk->maj_flt++;
>>> +		else
>>> +			tsk->min_flt++;
>>> +	}
>>> +	return 0;
>>> +}
>>> +
>>> +/*
>>>     * get_user_pages() - pin user pages in memory
>>>     * @tsk:	the task_struct to use for page fault accounting, or
>>>     *		NULL if faults are not to be recorded.
>>>
>>>
>> --
>> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
>> the body of a message to majordomo@vger.kernel.org
>> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>> Please read the FAQ at  http://www.tux.org/lkml/
>

^ permalink raw reply	[flat|nested] 138+ messages in thread

* Re: [RFC/PATCH] mm/futex: Fix futex writes on archs with SW tracking of dirty & young
  2011-07-19  5:38                                   ` Shan Hai
@ 2011-07-19  7:46                                     ` Benjamin Herrenschmidt
  -1 siblings, 0 replies; 138+ messages in thread
From: Benjamin Herrenschmidt @ 2011-07-19  7:46 UTC (permalink / raw)
  To: Shan Hai
  Cc: Peter Zijlstra, Peter Zijlstra, paulus, tglx, walken, dhowells,
	cmetcalf, tony.luck, akpm, linuxppc-dev, linux-kernel

On Tue, 2011-07-19 at 13:38 +0800, Shan Hai wrote:

> What you said is another path, that is futex_wake_op(),
> but what about futex_lock_pi in which my test case failed?
> your patch will call handle_mm_fault on every futex contention
> in the futex_lock_pi path.
> 
> futex_lock_pi()
>      ret = futex_lock_pi_atomic(uaddr, hb, &q.key, &q.pi_state, current, 0);
>          case -EFAULT:
>                          goto uaddr_faulted;
> 
>      ...
> uaddr_faulted:
>      ret = fault_in_user_writeable(uaddr);

Euh ... and how do we get to uaddr_faulted ? You may have missed the
return statement right before it :-)

From what I can tell we only get there as a result of -EFAULT from
futex_lock_pi_atomic() which is exactly the case we are trying to
cover. 

 .../...
> >>       "[PATCH 1/1] Fixup write permission of TLB on powerpc e500 core"?
> >>       (I will fix the stupid errors in my original patch if the concept
> >> is acceptable)
> >>       in this way we could decrease the overhead of handle_mm_fault
> >>       in the path which does not need write permission fixup.
> > Which overhead ? gup does handle_mm_fault() as well if needed.
> 
> it does it *if needed*, and this requirement is rare in my opinion.

And how does gup figure out if it's needed ? By walking down the page
tables in follow_page... what does handle_mm_fault do ? walk down the
page tables...

The main (if not the only) relevant difference here, is going to be the
spurious fault TLB invalidate for writes ... which is a nop on x86....
and needed in all the cases we care about (and if it's not needed, then
it's up to the arch to nop it out, we should probably do it on powerpc
too ...  but that's an unrelated discussion).

Cheers,
Ben.

> Thanks
> Shan Hai
> 
> > What I do is I replace what is arguably an abuse of gup() in the case
> > where a fixup -is- needed with a dedicated function designed to perform
> > the said fixup ... and do it properly which gup() didn't :-)
> >
> > Cheers,
> > Ben.
> >
> >> Thanks
> >> Shan Hai
> >>> Cheers,
> >>> Ben.
> >>>
> >>> diff --git a/include/linux/mm.h b/include/linux/mm.h
> >>> index 9670f71..1036614 100644
> >>> --- a/include/linux/mm.h
> >>> +++ b/include/linux/mm.h
> >>> @@ -985,6 +985,8 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
> >>>    int get_user_pages_fast(unsigned long start, int nr_pages, int write,
> >>>    			struct page **pages);
> >>>    struct page *get_dump_page(unsigned long addr);
> >>> +extern int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm,
> >>> +			    unsigned long address, unsigned int fault_flags);
> >>>
> >>>    extern int try_to_release_page(struct page * page, gfp_t gfp_mask);
> >>>    extern void do_invalidatepage(struct page *page, unsigned long offset);
> >>> diff --git a/kernel/futex.c b/kernel/futex.c
> >>> index fe28dc2..7a0a4ed 100644
> >>> --- a/kernel/futex.c
> >>> +++ b/kernel/futex.c
> >>> @@ -355,8 +355,8 @@ static int fault_in_user_writeable(u32 __user *uaddr)
> >>>    	int ret;
> >>>
> >>>    	down_read(&mm->mmap_sem);
> >>> -	ret = get_user_pages(current, mm, (unsigned long)uaddr,
> >>> -			     1, 1, 0, NULL, NULL);
> >>> +	ret = fixup_user_fault(current, mm, (unsigned long)uaddr,
> >>> +			       FAULT_FLAG_WRITE);
> >>>    	up_read(&mm->mmap_sem);
> >>>
> >>>    	return ret<   0 ? ret : 0;
> >>> diff --git a/mm/memory.c b/mm/memory.c
> >>> index 40b7531..b967fb0 100644
> >>> --- a/mm/memory.c
> >>> +++ b/mm/memory.c
> >>> @@ -1815,7 +1815,64 @@ next_page:
> >>>    }
> >>>    EXPORT_SYMBOL(__get_user_pages);
> >>>
> >>> -/**
> >>> +/*
> >>> + * fixup_user_fault() - manually resolve a user page  fault
> >>> + * @tsk:	the task_struct to use for page fault accounting, or
> >>> + *		NULL if faults are not to be recorded.
> >>> + * @mm:		mm_struct of target mm
> >>> + * @address:	user address
> >>> + * @fault_flags:flags to pass down to handle_mm_fault()
> >>> + *
> >>> + * This is meant to be called in the specific scenario where for
> >>> + * locking reasons we try to access user memory in atomic context
> >>> + * (within a pagefault_disable() section), this returns -EFAULT,
> >>> + * and we want to resolve the user fault before trying again.
> >>> + *
> >>> + * Typically this is meant to be used by the futex code.
> >>> + *
> >>> + * The main difference with get_user_pages() is that this function
> >>> + * will unconditionally call handle_mm_fault() which will in turn
> >>> + * perform all the necessary SW fixup of the dirty and young bits
> >>> + * in the PTE, while get_user_pages() only guarantees to update
> >>> + * these in the struct page.
> >>> + *
> >>> + * This is important for some architectures where those bits also
> >>> + * gate the access permission to the page because they are
> >>> + * maintained in software. On such architecture, gup() will not
> >>> + * be enough to make a subsequent access succeed.
> >>> + *
> >>> + * This should be called with the mm_sem held for read.
> >>> + */
> >>> +int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm,
> >>> +		     unsigned long address, unsigned int fault_flags)
> >>> +{
> >>> +	struct vm_area_struct *vma;
> >>> +	int ret;
> >>> +
> >>> +	vma = find_extend_vma(mm, address);
> >>> +	if (!vma || address<   vma->vm_start)
> >>> +		return -EFAULT;
> >>> +	
> >>> +	ret = handle_mm_fault(mm, vma, address, fault_flags);
> >>> +	if (ret&   VM_FAULT_ERROR) {
> >>> +		if (ret&   VM_FAULT_OOM)
> >>> +			return -ENOMEM;
> >>> +		if (ret&   (VM_FAULT_HWPOISON | VM_FAULT_HWPOISON_LARGE))
> >>> +			return -EHWPOISON;
> >>> +		if (ret&   VM_FAULT_SIGBUS)
> >>> +			return -EFAULT;
> >>> +		BUG();
> >>> +	}
> >>> +	if (tsk) {
> >>> +		if (ret&   VM_FAULT_MAJOR)
> >>> +			tsk->maj_flt++;
> >>> +		else
> >>> +			tsk->min_flt++;
> >>> +	}
> >>> +	return 0;
> >>> +}
> >>> +
> >>> +/*
> >>>     * get_user_pages() - pin user pages in memory
> >>>     * @tsk:	the task_struct to use for page fault accounting, or
> >>>     *		NULL if faults are not to be recorded.
> >>>
> >>>
> >> --
> >> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> >> the body of a message to majordomo@vger.kernel.org
> >> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> >> Please read the FAQ at  http://www.tux.org/lkml/
> >
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at  http://www.tux.org/lkml/



^ permalink raw reply	[flat|nested] 138+ messages in thread

* Re: [RFC/PATCH] mm/futex: Fix futex writes on archs with SW tracking of dirty & young
@ 2011-07-19  7:46                                     ` Benjamin Herrenschmidt
  0 siblings, 0 replies; 138+ messages in thread
From: Benjamin Herrenschmidt @ 2011-07-19  7:46 UTC (permalink / raw)
  To: Shan Hai
  Cc: tony.luck, Peter Zijlstra, Peter Zijlstra, linux-kernel,
	cmetcalf, dhowells, paulus, tglx, walken, linuxppc-dev, akpm

On Tue, 2011-07-19 at 13:38 +0800, Shan Hai wrote:

> What you said is another path, that is futex_wake_op(),
> but what about futex_lock_pi in which my test case failed?
> your patch will call handle_mm_fault on every futex contention
> in the futex_lock_pi path.
> 
> futex_lock_pi()
>      ret = futex_lock_pi_atomic(uaddr, hb, &q.key, &q.pi_state, current, 0);
>          case -EFAULT:
>                          goto uaddr_faulted;
> 
>      ...
> uaddr_faulted:
>      ret = fault_in_user_writeable(uaddr);

Euh ... and how do we get to uaddr_faulted ? You may have missed the
return statement right before it :-)

From what I can tell we only get there as a result of -EFAULT from
futex_lock_pi_atomic() which is exactly the case we are trying to
cover. 

 .../...
> >>       "[PATCH 1/1] Fixup write permission of TLB on powerpc e500 core"?
> >>       (I will fix the stupid errors in my original patch if the concept
> >> is acceptable)
> >>       in this way we could decrease the overhead of handle_mm_fault
> >>       in the path which does not need write permission fixup.
> > Which overhead ? gup does handle_mm_fault() as well if needed.
> 
> it does it *if needed*, and this requirement is rare in my opinion.

And how does gup figure out if it's needed ? By walking down the page
tables in follow_page... what does handle_mm_fault do ? walk down the
page tables...

The main (if not the only) relevant difference here, is going to be the
spurious fault TLB invalidate for writes ... which is a nop on x86....
and needed in all the cases we care about (and if it's not needed, then
it's up to the arch to nop it out, we should probably do it on powerpc
too ...  but that's an unrelated discussion).

Cheers,
Ben.

> Thanks
> Shan Hai
> 
> > What I do is I replace what is arguably an abuse of gup() in the case
> > where a fixup -is- needed with a dedicated function designed to perform
> > the said fixup ... and do it properly which gup() didn't :-)
> >
> > Cheers,
> > Ben.
> >
> >> Thanks
> >> Shan Hai
> >>> Cheers,
> >>> Ben.
> >>>
> >>> diff --git a/include/linux/mm.h b/include/linux/mm.h
> >>> index 9670f71..1036614 100644
> >>> --- a/include/linux/mm.h
> >>> +++ b/include/linux/mm.h
> >>> @@ -985,6 +985,8 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
> >>>    int get_user_pages_fast(unsigned long start, int nr_pages, int write,
> >>>    			struct page **pages);
> >>>    struct page *get_dump_page(unsigned long addr);
> >>> +extern int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm,
> >>> +			    unsigned long address, unsigned int fault_flags);
> >>>
> >>>    extern int try_to_release_page(struct page * page, gfp_t gfp_mask);
> >>>    extern void do_invalidatepage(struct page *page, unsigned long offset);
> >>> diff --git a/kernel/futex.c b/kernel/futex.c
> >>> index fe28dc2..7a0a4ed 100644
> >>> --- a/kernel/futex.c
> >>> +++ b/kernel/futex.c
> >>> @@ -355,8 +355,8 @@ static int fault_in_user_writeable(u32 __user *uaddr)
> >>>    	int ret;
> >>>
> >>>    	down_read(&mm->mmap_sem);
> >>> -	ret = get_user_pages(current, mm, (unsigned long)uaddr,
> >>> -			     1, 1, 0, NULL, NULL);
> >>> +	ret = fixup_user_fault(current, mm, (unsigned long)uaddr,
> >>> +			       FAULT_FLAG_WRITE);
> >>>    	up_read(&mm->mmap_sem);
> >>>
> >>>    	return ret<   0 ? ret : 0;
> >>> diff --git a/mm/memory.c b/mm/memory.c
> >>> index 40b7531..b967fb0 100644
> >>> --- a/mm/memory.c
> >>> +++ b/mm/memory.c
> >>> @@ -1815,7 +1815,64 @@ next_page:
> >>>    }
> >>>    EXPORT_SYMBOL(__get_user_pages);
> >>>
> >>> -/**
> >>> +/*
> >>> + * fixup_user_fault() - manually resolve a user page  fault
> >>> + * @tsk:	the task_struct to use for page fault accounting, or
> >>> + *		NULL if faults are not to be recorded.
> >>> + * @mm:		mm_struct of target mm
> >>> + * @address:	user address
> >>> + * @fault_flags:flags to pass down to handle_mm_fault()
> >>> + *
> >>> + * This is meant to be called in the specific scenario where for
> >>> + * locking reasons we try to access user memory in atomic context
> >>> + * (within a pagefault_disable() section), this returns -EFAULT,
> >>> + * and we want to resolve the user fault before trying again.
> >>> + *
> >>> + * Typically this is meant to be used by the futex code.
> >>> + *
> >>> + * The main difference with get_user_pages() is that this function
> >>> + * will unconditionally call handle_mm_fault() which will in turn
> >>> + * perform all the necessary SW fixup of the dirty and young bits
> >>> + * in the PTE, while get_user_pages() only guarantees to update
> >>> + * these in the struct page.
> >>> + *
> >>> + * This is important for some architectures where those bits also
> >>> + * gate the access permission to the page because they are
> >>> + * maintained in software. On such architecture, gup() will not
> >>> + * be enough to make a subsequent access succeed.
> >>> + *
> >>> + * This should be called with the mm_sem held for read.
> >>> + */
> >>> +int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm,
> >>> +		     unsigned long address, unsigned int fault_flags)
> >>> +{
> >>> +	struct vm_area_struct *vma;
> >>> +	int ret;
> >>> +
> >>> +	vma = find_extend_vma(mm, address);
> >>> +	if (!vma || address<   vma->vm_start)
> >>> +		return -EFAULT;
> >>> +	
> >>> +	ret = handle_mm_fault(mm, vma, address, fault_flags);
> >>> +	if (ret&   VM_FAULT_ERROR) {
> >>> +		if (ret&   VM_FAULT_OOM)
> >>> +			return -ENOMEM;
> >>> +		if (ret&   (VM_FAULT_HWPOISON | VM_FAULT_HWPOISON_LARGE))
> >>> +			return -EHWPOISON;
> >>> +		if (ret&   VM_FAULT_SIGBUS)
> >>> +			return -EFAULT;
> >>> +		BUG();
> >>> +	}
> >>> +	if (tsk) {
> >>> +		if (ret&   VM_FAULT_MAJOR)
> >>> +			tsk->maj_flt++;
> >>> +		else
> >>> +			tsk->min_flt++;
> >>> +	}
> >>> +	return 0;
> >>> +}
> >>> +
> >>> +/*
> >>>     * get_user_pages() - pin user pages in memory
> >>>     * @tsk:	the task_struct to use for page fault accounting, or
> >>>     *		NULL if faults are not to be recorded.
> >>>
> >>>
> >> --
> >> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> >> the body of a message to majordomo@vger.kernel.org
> >> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> >> Please read the FAQ at  http://www.tux.org/lkml/
> >
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at  http://www.tux.org/lkml/

^ permalink raw reply	[flat|nested] 138+ messages in thread

* Re: [RFC/PATCH] mm/futex: Fix futex writes on archs with SW tracking of dirty & young
  2011-07-19  7:46                                     ` Benjamin Herrenschmidt
@ 2011-07-19  8:24                                       ` Shan Hai
  -1 siblings, 0 replies; 138+ messages in thread
From: Shan Hai @ 2011-07-19  8:24 UTC (permalink / raw)
  To: Benjamin Herrenschmidt
  Cc: Peter Zijlstra, Peter Zijlstra, paulus, tglx, walken, dhowells,
	cmetcalf, tony.luck, akpm, linuxppc-dev, linux-kernel

On 07/19/2011 03:46 PM, Benjamin Herrenschmidt wrote:
> On Tue, 2011-07-19 at 13:38 +0800, Shan Hai wrote:
>
>> What you said is another path, that is futex_wake_op(),
>> but what about futex_lock_pi in which my test case failed?
>> your patch will call handle_mm_fault on every futex contention
>> in the futex_lock_pi path.
>>
>> futex_lock_pi()
>>       ret = futex_lock_pi_atomic(uaddr, hb,&q.key,&q.pi_state, current, 0);
>>           case -EFAULT:
>>                           goto uaddr_faulted;
>>
>>       ...
>> uaddr_faulted:
>>       ret = fault_in_user_writeable(uaddr);
> Euh ... and how do we get to uaddr_faulted ? You may have missed the
> return statement right before it :-)
>
>  From what I can tell we only get there as a result of -EFAULT from
> futex_lock_pi_atomic() which is exactly the case we are trying to
> cover.
>

Got it. If fault_in_user_writeable() is designed to catch exactly the
write permission fault problem we are discussing here, then your patch
fixes it very nicely. We should fix it up by calling handle_mm_fault()
directly, as you did, because we know for sure what just happened
(a permission violation); it's not necessary to check what happened
by calling gup-->follow_page, and furthermore follow_page failed
to report the fault :-)

Thanks
Shan Hai

>   .../...
>>>>        "[PATCH 1/1] Fixup write permission of TLB on powerpc e500 core"?
>>>>        (I will fix the stupid errors in my original patch if the concept
>>>> is acceptable)
>>>>        in this way we could decrease the overhead of handle_mm_fault
>>>>        in the path which does not need write permission fixup.
>>> Which overhead ? gup does handle_mm_fault() as well if needed.
>> it does it *if needed*, and this requirement is rare in my opinion.
> And how does gup figure out of it's needed ? By walking down the page
> tables in follow_page... what does handle_mm_fault do ? walk down the
> page tables...
>
> The main (if not the only) relevant difference here, is going to be the
> spurious fault TLB invaliate for writes ... which is a nop on x86....
> and needed in all the cases we care about (and if it's not needed, then
> it's up to the arch to nop it out, we should probably do it on powerpc
> too ...  but that's un unrelated discussion).
>
> Cheers,
> Ben.
>
>> Thanks
>> Shan Hai
>>
>>> What I do is I replace what is arguably an abuse of gup() in the case
>>> where a fixup -is- needed with a dedicated function designed to perform
>>> the said fixup ... and do it properly which gup() didn't :-)
>>>
>>> Cheers,
>>> Ben.
>>>
>>>> Thanks
>>>> Shan Hai
>>>>> Cheers,
>>>>> Ben.
>>>>>
>>>>> diff --git a/include/linux/mm.h b/include/linux/mm.h
>>>>> index 9670f71..1036614 100644
>>>>> --- a/include/linux/mm.h
>>>>> +++ b/include/linux/mm.h
>>>>> @@ -985,6 +985,8 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
>>>>>     int get_user_pages_fast(unsigned long start, int nr_pages, int write,
>>>>>     			struct page **pages);
>>>>>     struct page *get_dump_page(unsigned long addr);
>>>>> +extern int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm,
>>>>> +			    unsigned long address, unsigned int fault_flags);
>>>>>
>>>>>     extern int try_to_release_page(struct page * page, gfp_t gfp_mask);
>>>>>     extern void do_invalidatepage(struct page *page, unsigned long offset);
>>>>> diff --git a/kernel/futex.c b/kernel/futex.c
>>>>> index fe28dc2..7a0a4ed 100644
>>>>> --- a/kernel/futex.c
>>>>> +++ b/kernel/futex.c
>>>>> @@ -355,8 +355,8 @@ static int fault_in_user_writeable(u32 __user *uaddr)
>>>>>     	int ret;
>>>>>
>>>>>     	down_read(&mm->mmap_sem);
>>>>> -	ret = get_user_pages(current, mm, (unsigned long)uaddr,
>>>>> -			     1, 1, 0, NULL, NULL);
>>>>> +	ret = fixup_user_fault(current, mm, (unsigned long)uaddr,
>>>>> +			       FAULT_FLAG_WRITE);
>>>>>     	up_read(&mm->mmap_sem);
>>>>>
>>>>>     	return ret<    0 ? ret : 0;
>>>>> diff --git a/mm/memory.c b/mm/memory.c
>>>>> index 40b7531..b967fb0 100644
>>>>> --- a/mm/memory.c
>>>>> +++ b/mm/memory.c
>>>>> @@ -1815,7 +1815,64 @@ next_page:
>>>>>     }
>>>>>     EXPORT_SYMBOL(__get_user_pages);
>>>>>
>>>>> -/**
>>>>> +/*
>>>>> + * fixup_user_fault() - manually resolve a user page  fault
>>>>> + * @tsk:	the task_struct to use for page fault accounting, or
>>>>> + *		NULL if faults are not to be recorded.
>>>>> + * @mm:		mm_struct of target mm
>>>>> + * @address:	user address
>>>>> + * @fault_flags:flags to pass down to handle_mm_fault()
>>>>> + *
>>>>> + * This is meant to be called in the specific scenario where for
>>>>> + * locking reasons we try to access user memory in atomic context
>>>>> + * (within a pagefault_disable() section), this returns -EFAULT,
>>>>> + * and we want to resolve the user fault before trying again.
>>>>> + *
>>>>> + * Typically this is meant to be used by the futex code.
>>>>> + *
>>>>> + * The main difference with get_user_pages() is that this function
>>>>> + * will unconditionally call handle_mm_fault() which will in turn
>>>>> + * perform all the necessary SW fixup of the dirty and young bits
>>>>> + * in the PTE, while get_user_pages() only guarantees to update
>>>>> + * these in the struct page.
>>>>> + *
>>>>> + * This is important for some architectures where those bits also
>>>>> + * gate the access permission to the page because they are
>>>>> + * maintained in software. On such architecture, gup() will not
>>>>> + * be enough to make a subsequent access succeed.
>>>>> + *
>>>>> + * This should be called with the mm_sem held for read.
>>>>> + */
>>>>> +int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm,
>>>>> +		     unsigned long address, unsigned int fault_flags)
>>>>> +{
>>>>> +	struct vm_area_struct *vma;
>>>>> +	int ret;
>>>>> +
>>>>> +	vma = find_extend_vma(mm, address);
>>>>> +	if (!vma || address<    vma->vm_start)
>>>>> +		return -EFAULT;
>>>>> +	
>>>>> +	ret = handle_mm_fault(mm, vma, address, fault_flags);
>>>>> +	if (ret&    VM_FAULT_ERROR) {
>>>>> +		if (ret&    VM_FAULT_OOM)
>>>>> +			return -ENOMEM;
>>>>> +		if (ret&    (VM_FAULT_HWPOISON | VM_FAULT_HWPOISON_LARGE))
>>>>> +			return -EHWPOISON;
>>>>> +		if (ret&    VM_FAULT_SIGBUS)
>>>>> +			return -EFAULT;
>>>>> +		BUG();
>>>>> +	}
>>>>> +	if (tsk) {
>>>>> +		if (ret&    VM_FAULT_MAJOR)
>>>>> +			tsk->maj_flt++;
>>>>> +		else
>>>>> +			tsk->min_flt++;
>>>>> +	}
>>>>> +	return 0;
>>>>> +}
>>>>> +
>>>>> +/*
>>>>>      * get_user_pages() - pin user pages in memory
>>>>>      * @tsk:	the task_struct to use for page fault accounting, or
>>>>>      *		NULL if faults are not to be recorded.
>>>>>
>>>>>
>>>> --
>>>> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
>>>> the body of a message to majordomo@vger.kernel.org
>>>> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>>>> Please read the FAQ at  http://www.tux.org/lkml/
>> --
>> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
>> the body of a message to majordomo@vger.kernel.org
>> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>> Please read the FAQ at  http://www.tux.org/lkml/
>


^ permalink raw reply	[flat|nested] 138+ messages in thread

* Re: [RFC/PATCH] mm/futex: Fix futex writes on archs with SW tracking of dirty & young
@ 2011-07-19  8:24                                       ` Shan Hai
  0 siblings, 0 replies; 138+ messages in thread
From: Shan Hai @ 2011-07-19  8:24 UTC (permalink / raw)
  To: Benjamin Herrenschmidt
  Cc: tony.luck, Peter Zijlstra, Peter Zijlstra, linux-kernel,
	cmetcalf, dhowells, paulus, tglx, walken, linuxppc-dev, akpm

On 07/19/2011 03:46 PM, Benjamin Herrenschmidt wrote:
> On Tue, 2011-07-19 at 13:38 +0800, Shan Hai wrote:
>
>> What you said is another path, that is futex_wake_op(),
>> but what about futex_lock_pi in which my test case failed?
>> your patch will call handle_mm_fault on every futex contention
>> in the futex_lock_pi path.
>>
>> futex_lock_pi()
>>       ret = futex_lock_pi_atomic(uaddr, hb,&q.key,&q.pi_state, current, 0);
>>           case -EFAULT:
>>                           goto uaddr_faulted;
>>
>>       ...
>> uaddr_faulted:
>>       ret = fault_in_user_writeable(uaddr);
> Euh ... and how do we get to uaddr_faulted ? You may have missed the
> return statement right before it :-)
>
>  From what I can tell we only get there as a result of -EFAULT from
> futex_lock_pi_atomic() which is exactly the case we are trying to
> cover.
>

Got it. If fault_in_user_writeable() is designed to catch exactly the
write permission fault problem we are discussing here, then your patch
fixes it very nicely. We should fix it up by calling handle_mm_fault()
directly, as you did, because we know for sure what just happened
(a permission violation); it's not necessary to check what happened
by calling gup-->follow_page, and furthermore follow_page failed
to report the fault :-)

Thanks
Shan Hai

>   .../...
>>>>        "[PATCH 1/1] Fixup write permission of TLB on powerpc e500 core"?
>>>>        (I will fix the stupid errors in my original patch if the concept
>>>> is acceptable)
>>>>        in this way we could decrease the overhead of handle_mm_fault
>>>>        in the path which does not need write permission fixup.
>>> Which overhead ? gup does handle_mm_fault() as well if needed.
>> it does it *if needed*, and this requirement is rare in my opinion.
> And how does gup figure out of it's needed ? By walking down the page
> tables in follow_page... what does handle_mm_fault do ? walk down the
> page tables...
>
> The main (if not the only) relevant difference here, is going to be the
> spurious fault TLB invaliate for writes ... which is a nop on x86....
> and needed in all the cases we care about (and if it's not needed, then
> it's up to the arch to nop it out, we should probably do it on powerpc
> too ...  but that's un unrelated discussion).
>
> Cheers,
> Ben.
>
>> Thanks
>> Shan Hai
>>
>>> What I do is I replace what is arguably an abuse of gup() in the case
>>> where a fixup -is- needed with a dedicated function designed to perform
>>> the said fixup ... and do it properly which gup() didn't :-)
>>>
>>> Cheers,
>>> Ben.
>>>
>>>> Thanks
>>>> Shan Hai
>>>>> Cheers,
>>>>> Ben.
>>>>>
>>>>> diff --git a/include/linux/mm.h b/include/linux/mm.h
>>>>> index 9670f71..1036614 100644
>>>>> --- a/include/linux/mm.h
>>>>> +++ b/include/linux/mm.h
>>>>> @@ -985,6 +985,8 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
>>>>>     int get_user_pages_fast(unsigned long start, int nr_pages, int write,
>>>>>     			struct page **pages);
>>>>>     struct page *get_dump_page(unsigned long addr);
>>>>> +extern int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm,
>>>>> +			    unsigned long address, unsigned int fault_flags);
>>>>>
>>>>>     extern int try_to_release_page(struct page * page, gfp_t gfp_mask);
>>>>>     extern void do_invalidatepage(struct page *page, unsigned long offset);
>>>>> diff --git a/kernel/futex.c b/kernel/futex.c
>>>>> index fe28dc2..7a0a4ed 100644
>>>>> --- a/kernel/futex.c
>>>>> +++ b/kernel/futex.c
>>>>> @@ -355,8 +355,8 @@ static int fault_in_user_writeable(u32 __user *uaddr)
>>>>>     	int ret;
>>>>>
>>>>>     	down_read(&mm->mmap_sem);
>>>>> -	ret = get_user_pages(current, mm, (unsigned long)uaddr,
>>>>> -			     1, 1, 0, NULL, NULL);
>>>>> +	ret = fixup_user_fault(current, mm, (unsigned long)uaddr,
>>>>> +			       FAULT_FLAG_WRITE);
>>>>>     	up_read(&mm->mmap_sem);
>>>>>
>>>>>     	return ret<    0 ? ret : 0;
>>>>> diff --git a/mm/memory.c b/mm/memory.c
>>>>> index 40b7531..b967fb0 100644
>>>>> --- a/mm/memory.c
>>>>> +++ b/mm/memory.c
>>>>> @@ -1815,7 +1815,64 @@ next_page:
>>>>>     }
>>>>>     EXPORT_SYMBOL(__get_user_pages);
>>>>>
>>>>> -/**
>>>>> +/*
>>>>> + * fixup_user_fault() - manually resolve a user page  fault
>>>>> + * @tsk:	the task_struct to use for page fault accounting, or
>>>>> + *		NULL if faults are not to be recorded.
>>>>> + * @mm:		mm_struct of target mm
>>>>> + * @address:	user address
>>>>> + * @fault_flags:flags to pass down to handle_mm_fault()
>>>>> + *
>>>>> + * This is meant to be called in the specific scenario where for
>>>>> + * locking reasons we try to access user memory in atomic context
>>>>> + * (within a pagefault_disable() section), this returns -EFAULT,
>>>>> + * and we want to resolve the user fault before trying again.
>>>>> + *
>>>>> + * Typically this is meant to be used by the futex code.
>>>>> + *
>>>>> + * The main difference with get_user_pages() is that this function
>>>>> + * will unconditionally call handle_mm_fault() which will in turn
>>>>> + * perform all the necessary SW fixup of the dirty and young bits
>>>>> + * in the PTE, while get_user_pages() only guarantees to update
>>>>> + * these in the struct page.
>>>>> + *
>>>>> + * This is important for some architectures where those bits also
>>>>> + * gate the access permission to the page because they are
>>>>> + * maintained in software. On such architecture, gup() will not
>>>>> + * be enough to make a subsequent access succeed.
>>>>> + *
>>>>> + * This should be called with the mm_sem held for read.
>>>>> + */
>>>>> +int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm,
>>>>> +		     unsigned long address, unsigned int fault_flags)
>>>>> +{
>>>>> +	struct vm_area_struct *vma;
>>>>> +	int ret;
>>>>> +
>>>>> +	vma = find_extend_vma(mm, address);
>>>>> +	if (!vma || address<    vma->vm_start)
>>>>> +		return -EFAULT;
>>>>> +	
>>>>> +	ret = handle_mm_fault(mm, vma, address, fault_flags);
>>>>> +	if (ret&    VM_FAULT_ERROR) {
>>>>> +		if (ret&    VM_FAULT_OOM)
>>>>> +			return -ENOMEM;
>>>>> +		if (ret&    (VM_FAULT_HWPOISON | VM_FAULT_HWPOISON_LARGE))
>>>>> +			return -EHWPOISON;
>>>>> +		if (ret&    VM_FAULT_SIGBUS)
>>>>> +			return -EFAULT;
>>>>> +		BUG();
>>>>> +	}
>>>>> +	if (tsk) {
>>>>> +		if (ret&    VM_FAULT_MAJOR)
>>>>> +			tsk->maj_flt++;
>>>>> +		else
>>>>> +			tsk->min_flt++;
>>>>> +	}
>>>>> +	return 0;
>>>>> +}
>>>>> +
>>>>> +/*
>>>>>      * get_user_pages() - pin user pages in memory
>>>>>      * @tsk:	the task_struct to use for page fault accounting, or
>>>>>      *		NULL if faults are not to be recorded.
>>>>>
>>>>>
>>>> --
>>>> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
>>>> the body of a message to majordomo@vger.kernel.org
>>>> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>>>> Please read the FAQ at  http://www.tux.org/lkml/
>> --
>> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
>> the body of a message to majordomo@vger.kernel.org
>> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>> Please read the FAQ at  http://www.tux.org/lkml/
>

^ permalink raw reply	[flat|nested] 138+ messages in thread

* RE: [RFC/PATCH] mm/futex: Fix futex writes on archs with SW trackingof dirty & young
  2011-07-19  8:24                                       ` Shan Hai
@ 2011-07-19  8:26                                         ` David Laight
  -1 siblings, 0 replies; 138+ messages in thread
From: David Laight @ 2011-07-19  8:26 UTC (permalink / raw)
  To: Shan Hai, Benjamin Herrenschmidt
  Cc: tony.luck, Peter Zijlstra, Peter Zijlstra, linux-kernel,
	cmetcalf, dhowells, paulus, tglx, walken, linuxppc-dev, akpm

 
> Got it, if the fault_in_user_writeable() is designed to catch the
> exact same write permission fault problem we discuss here, so
> your patch fixed that very nicely, we should fixup it by directly
> calling handle_mm_fault like what you did because we are for sure
> to know what just happened(permission violation), its not necessary
> to check what's happened by calling gup-->follow_page, and
> further the follow_page failed to report the fault :-)

One thought I've had - and I don't know enough about the data
area in use to know if it is a problem - is what happens if
a different cpu faults on the same user page and has already
marked it 'valid' between the fault happening and the fault
handler looking at the page tables to find out why.
If any of the memory areas are shared, it might be that the
PTE (etc) might already show the page as writable by the
time the fault handler is looking at them - this might confuse it!

	David



^ permalink raw reply	[flat|nested] 138+ messages in thread

* RE: [RFC/PATCH] mm/futex: Fix futex writes on archs with SW trackingof dirty & young
@ 2011-07-19  8:26                                         ` David Laight
  0 siblings, 0 replies; 138+ messages in thread
From: David Laight @ 2011-07-19  8:26 UTC (permalink / raw)
  To: Shan Hai, Benjamin Herrenschmidt
  Cc: tony.luck, Peter Zijlstra, Peter Zijlstra, linux-kernel,
	cmetcalf, dhowells, paulus, tglx, walken, linuxppc-dev, akpm

=20
> Got it, if the fault_in_user_writeable() is designed to catch the
> exact same write permission fault problem we discuss here, so
> your patch fixed that very nicely, we should fixup it by directly
> calling handle_mm_fault like what you did because we are for sure
> to know what just happened(permission violation), its not necessary
> to check what's happened by calling gup-->follow_page, and
> further the follow_page failed to report the fault :-)

One thought I've had - and I don't know enough about the data
area in use to know if it is a problem - is what happens if
a different cpu faults on the same user page and has already
marked it 'valid' between the fault happening and the fault
handler looking at the page tables to find out why.
If any of the memory areas are shared, it might be that the
PTE (etc) might already show the page as writable by the
time the fault handler is looking at them - this might confuse it!

	David

^ permalink raw reply	[flat|nested] 138+ messages in thread

* RE: [RFC/PATCH] mm/futex: Fix futex writes on archs with SW trackingof dirty & young
  2011-07-19  8:26                                         ` David Laight
@ 2011-07-19  8:45                                           ` Benjamin Herrenschmidt
  -1 siblings, 0 replies; 138+ messages in thread
From: Benjamin Herrenschmidt @ 2011-07-19  8:45 UTC (permalink / raw)
  To: David Laight
  Cc: Shan Hai, tony.luck, Peter Zijlstra, Peter Zijlstra,
	linux-kernel, cmetcalf, dhowells, paulus, tglx, walken,
	linuxppc-dev, akpm

On Tue, 2011-07-19 at 09:26 +0100, David Laight wrote:
> > Got it, if the fault_in_user_writeable() is designed to catch the
> > exact same write permission fault problem we discuss here, so
> > your patch fixed that very nicely, we should fixup it by directly
> > calling handle_mm_fault like what you did because we are for sure
> > to know what just happened(permission violation), its not necessary
> > to check what's happened by calling gup-->follow_page, and
> > further the follow_page failed to report the fault :-)
> 
> One thought I've had - and I don't know enough about the data
> area in use to know if it is a problem - is what happens if
> a different cpu faults on the same user page and has already
> marked it 'valid' between the fault happening and the fault
> handler looking at the page tables to find out why.
> If any of the memory areas are shared, it might be that the
> PTE (etc) might already show the page a writable by the
> time the fault handler is looking at them - this might confuse it!

The same way handle_mm_fault() deals with two CPUs faulting on the same
page at the same time :-)

All the necessary locking is in there, handle_mm_fault() and friends
will walk the page tables, take the PTE lock, will notice it's already
been all fixed up (well that it doesn't need to do a page fault at
least), will then call ptep_set_access_flags() which will itself notice
there's nothing to do ... etc

So all you'll hit is the spurious fault TLB invalidate in the write
case, which is necessary on some archs (well, we think it is tho I don't
know which archs really :-)

Cheers,
Ben.


^ permalink raw reply	[flat|nested] 138+ messages in thread

* RE: [RFC/PATCH] mm/futex: Fix futex writes on archs with SW tracking of dirty & young
@ 2011-07-19  8:45                                           ` Benjamin Herrenschmidt
  0 siblings, 0 replies; 138+ messages in thread
From: Benjamin Herrenschmidt @ 2011-07-19  8:45 UTC (permalink / raw)
  To: David Laight
  Cc: tony.luck, Peter Zijlstra, Shan Hai, Peter Zijlstra,
	linux-kernel, cmetcalf, dhowells, paulus, tglx, walken,
	linuxppc-dev, akpm

On Tue, 2011-07-19 at 09:26 +0100, David Laight wrote:
> > Got it, if the fault_in_user_writeable() is designed to catch the
> > exact same write permission fault problem we discuss here, so
> > your patch fixed that very nicely, we should fixup it by directly
> > calling handle_mm_fault like what you did because we are for sure
> > to know what just happened(permission violation), its not necessary
> > to check what's happened by calling gup-->follow_page, and
> > further the follow_page failed to report the fault :-)
> 
> One thought I've had - and I don't know enough about the data
> area in use to know if it is a problem - is what happens if
> a different cpu faults on the same user page and has already
> marked it 'valid' between the fault happening and the fault
> handler looking at the page tables to find out why.
> If any of the memory areas are shared, it might be that the
> PTE (etc) might already show the page a writable by the
> time the fault handler is looking at them - this might confuse it!

The same way handle_mm_fault() deals with two CPUs faulting on the same
page at the same time :-)

All the necessary locking is in there, handle_mm_fault() and friends
will walk the page tables, take the PTE lock, will notice it's already
been all fixed up (well that it doesn't need to do a page fault at
least), will then call ptep_set_access_flags() which will itself notice
there's nothing to do ... etc

So all you'll hit is the spurious fault TLB invalidate in the write
case, which is necessary on some archs (well, we think it is tho I don't
know which archs really :-)

Cheers,
Ben.

^ permalink raw reply	[flat|nested] 138+ messages in thread

* Re: [RFC/PATCH] mm/futex: Fix futex writes on archs with SW tracking of dirty & young
  2011-07-19  8:26                                         ` David Laight
@ 2011-07-19  8:45                                           ` Shan Hai
  -1 siblings, 0 replies; 138+ messages in thread
From: Shan Hai @ 2011-07-19  8:45 UTC (permalink / raw)
  To: David Laight
  Cc: Benjamin Herrenschmidt, tony.luck, Peter Zijlstra,
	Peter Zijlstra, linux-kernel, cmetcalf, dhowells, paulus, tglx,
	walken, linuxppc-dev, akpm

On 07/19/2011 04:26 PM, David Laight wrote:
>
>> Got it, if the fault_in_user_writeable() is designed to catch the
>> exact same write permission fault problem we discuss here, so
>> your patch fixed that very nicely, we should fixup it by directly
>> calling handle_mm_fault like what you did because we are for sure
>> to know what just happened(permission violation), its not necessary
>> to check what's happened by calling gup-->follow_page, and
>> further the follow_page failed to report the fault :-)
> One thought I've had - and I don't know enough about the data
> area in use to know if it is a problem - is what happens if
> a different cpu faults on the same user page and has already
> marked it 'valid' between the fault happening and the fault
> handler looking at the page tables to find out why.
> If any of the memory areas are shared, it might be that the
> PTE (etc) might already show the page a writable by the
> time the fault handler is looking at them - this might confuse it!
>

There is no problem at all if by *valid* you mean present and
writable, because by the time fault_in_user_writeable() is called,
the pte for the shared page has already been set up by demand
paging with pte.present and pte.write set. The fault was taken
because of a permission violation on a present and writable user
page, which occurs on architectures that track dirty/young in
software.

Thanks
Shan Hai

> 	David
>
>


^ permalink raw reply	[flat|nested] 138+ messages in thread

* Re: [RFC/PATCH] mm/futex: Fix futex writes on archs with SW tracking of dirty & young
@ 2011-07-19  8:45                                           ` Shan Hai
  0 siblings, 0 replies; 138+ messages in thread
From: Shan Hai @ 2011-07-19  8:45 UTC (permalink / raw)
  To: David Laight
  Cc: tony.luck, Peter Zijlstra, Peter Zijlstra, linux-kernel,
	cmetcalf, dhowells, paulus, tglx, walken, linuxppc-dev, akpm

On 07/19/2011 04:26 PM, David Laight wrote:
>
>> Got it, if the fault_in_user_writeable() is designed to catch the
>> exact same write permission fault problem we discuss here, so
>> your patch fixed that very nicely, we should fixup it by directly
>> calling handle_mm_fault like what you did because we are for sure
>> to know what just happened(permission violation), its not necessary
>> to check what's happened by calling gup-->follow_page, and
>> further the follow_page failed to report the fault :-)
> One thought I've had - and I don't know enough about the data
> area in use to know if it is a problem - is what happens if
> a different cpu faults on the same user page and has already
> marked it 'valid' between the fault happening and the fault
> handler looking at the page tables to find out why.
> If any of the memory areas are shared, it might be that the
> PTE (etc) might already show the page a writable by the
> time the fault handler is looking at them - this might confuse it!
>

There is no problem at all if by *valid* you mean present and
writable, because by the time fault_in_user_writeable() is called,
the pte for the shared page has already been set up by demand
paging with pte.present and pte.write set. The fault was taken
because of a permission violation on a present and writable user
page, which occurs on architectures that track dirty/young in
software.

Thanks
Shan Hai

> 	David
>
>

^ permalink raw reply	[flat|nested] 138+ messages in thread

* Re: [RFC/PATCH] mm/futex: Fix futex writes on archs with SW tracking of dirty & young
  2011-07-19  4:29                             ` Benjamin Herrenschmidt
@ 2011-07-19 11:10                               ` Peter Zijlstra
  -1 siblings, 0 replies; 138+ messages in thread
From: Peter Zijlstra @ 2011-07-19 11:10 UTC (permalink / raw)
  To: Benjamin Herrenschmidt
  Cc: Shan Hai, paulus, tglx, walken, dhowells, cmetcalf, tony.luck,
	akpm, linuxppc-dev, linux-kernel

On Tue, 2011-07-19 at 14:29 +1000, Benjamin Herrenschmidt wrote:
> The futex code currently attempts to write to user memory within
> a pagefault disabled section, and if that fails, tries to fix it
> up using get_user_pages().
> 
> This doesn't work on archs where the dirty and young bits are
> maintained by software, since they will gate access permission
> in the TLB, and will not be updated by gup().
> 
> In addition, there's an expectation on some archs that a
> spurious write fault triggers a local TLB flush, and that is
> missing from the picture as well.
> 
> I decided that adding those "features" to gup() would be too much
> for this already too complex function, and instead added a new
> simpler fixup_user_fault() which is essentially a wrapper around
> handle_mm_fault() which the futex code can call.
> 
> Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org> 

Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>

^ permalink raw reply	[flat|nested] 138+ messages in thread

* Re: [RFC/PATCH] mm/futex: Fix futex writes on archs with SW tracking of dirty & young
@ 2011-07-19 11:10                               ` Peter Zijlstra
  0 siblings, 0 replies; 138+ messages in thread
From: Peter Zijlstra @ 2011-07-19 11:10 UTC (permalink / raw)
  To: Benjamin Herrenschmidt
  Cc: tony.luck, Shan Hai, linux-kernel, cmetcalf, dhowells, paulus,
	tglx, walken, linuxppc-dev, akpm

On Tue, 2011-07-19 at 14:29 +1000, Benjamin Herrenschmidt wrote:
> The futex code currently attempts to write to user memory within
> a pagefault disabled section, and if that fails, tries to fix it
> up using get_user_pages().
>
> This doesn't work on archs where the dirty and young bits are
> maintained by software, since they will gate access permission
> in the TLB, and will not be updated by gup().
>
> In addition, there's an expectation on some archs that a
> spurious write fault triggers a local TLB flush, and that is
> missing from the picture as well.
>
> I decided that adding those "features" to gup() would be too much
> for this already too complex function, and instead added a new
> simpler fixup_user_fault() which is essentially a wrapper around
> handle_mm_fault() which the futex code can call.
>
> Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>

Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>

^ permalink raw reply	[flat|nested] 138+ messages in thread

* Re: [RFC/PATCH] mm/futex: Fix futex writes on archs with SW tracking of dirty & young
  2011-07-19  4:29                             ` Benjamin Herrenschmidt
@ 2011-07-20 14:39                               ` Darren Hart
  -1 siblings, 0 replies; 138+ messages in thread
From: Darren Hart @ 2011-07-20 14:39 UTC (permalink / raw)
  To: Benjamin Herrenschmidt
  Cc: Shan Hai, Peter Zijlstra, Peter Zijlstra, paulus, tglx, walken,
	dhowells, cmetcalf, tony.luck, akpm, linuxppc-dev, linux-kernel

Obviously no objection from the futex side of things, looks good. Couple
nits on the function comment:

On 07/18/2011 09:29 PM, Benjamin Herrenschmidt wrote:
...
> -/**
> +/*
> + * fixup_user_fault() - manually resolve a user page  fault

s/  fault/ fault/

> + * @tsk:	the task_struct to use for page fault accounting, or
> + *		NULL if faults are not to be recorded.
> + * @mm:		mm_struct of target mm
> + * @address:	user address
> + * @fault_flags:flags to pass down to handle_mm_fault()
> + *
> + * This is meant to be called in the specific scenario where for
> + * locking reasons we try to access user memory in atomic context
> + * (within a pagefault_disable() section), this returns -EFAULT,
> + * and we want to resolve the user fault before trying again.
> + *
> + * Typically this is meant to be used by the futex code.
> + *
> + * The main difference with get_user_pages() is that this function
> + * will unconditionally call handle_mm_fault() which will in turn
> + * perform all the necessary SW fixup of the dirty and young bits
> + * in the PTE, while get_user_pages() only guarantees to update
> + * these in the struct page.
> + *
> + * This is important for some architectures where those bits also
> + * gate the access permission to the page because their are

s/their/they/

Thanks,

-- 
Darren Hart
Intel Open Source Technology Center
Yocto Project - Linux Kernel

^ permalink raw reply	[flat|nested] 138+ messages in thread

* Re: [RFC/PATCH] mm/futex: Fix futex writes on archs with SW tracking of dirty & young
@ 2011-07-20 14:39                               ` Darren Hart
  0 siblings, 0 replies; 138+ messages in thread
From: Darren Hart @ 2011-07-20 14:39 UTC (permalink / raw)
  To: Benjamin Herrenschmidt
  Cc: tony.luck, Peter Zijlstra, Shan Hai, Peter Zijlstra,
	linux-kernel, cmetcalf, dhowells, paulus, tglx, walken,
	linuxppc-dev, akpm

Obviously no objection from the futex side of things, looks good. Couple
nits on the function comment:

On 07/18/2011 09:29 PM, Benjamin Herrenschmidt wrote:
...
> -/**
> +/*
> + * fixup_user_fault() - manually resolve a user page  fault

s/  fault/ fault/

> + * @tsk:	the task_struct to use for page fault accounting, or
> + *		NULL if faults are not to be recorded.
> + * @mm:		mm_struct of target mm
> + * @address:	user address
> + * @fault_flags:flags to pass down to handle_mm_fault()
> + *
> + * This is meant to be called in the specific scenario where for
> + * locking reasons we try to access user memory in atomic context
> + * (within a pagefault_disable() section), this returns -EFAULT,
> + * and we want to resolve the user fault before trying again.
> + *
> + * Typically this is meant to be used by the futex code.
> + *
> + * The main difference with get_user_pages() is that this function
> + * will unconditionally call handle_mm_fault() which will in turn
> + * perform all the necessary SW fixup of the dirty and young bits
> + * in the PTE, while get_user_pages() only guarantees to update
> + * these in the struct page.
> + *
> + * This is important for some architectures where those bits also
> + * gate the access permission to the page because their are

s/their/they/

Thanks,

-- 
Darren Hart
Intel Open Source Technology Center
Yocto Project - Linux Kernel

^ permalink raw reply	[flat|nested] 138+ messages in thread

* Re: [RFC/PATCH] mm/futex: Fix futex writes on archs with SW tracking of dirty & young
  2011-07-19  4:29                             ` Benjamin Herrenschmidt
@ 2011-07-21 22:36                               ` Andrew Morton
  -1 siblings, 0 replies; 138+ messages in thread
From: Andrew Morton @ 2011-07-21 22:36 UTC (permalink / raw)
  To: Benjamin Herrenschmidt
  Cc: Shan Hai, Peter Zijlstra, Peter Zijlstra, paulus, tglx, walken,
	dhowells, cmetcalf, tony.luck, linuxppc-dev, linux-kernel

On Tue, 19 Jul 2011 14:29:22 +1000
Benjamin Herrenschmidt <benh@kernel.crashing.org> wrote:

> The futex code currently attempts to write to user memory within
> a pagefault disabled section, and if that fails, tries to fix it
> up using get_user_pages().
> 
> This doesn't work on archs where the dirty and young bits are
> maintained by software, since they will gate access permission
> in the TLB, and will not be updated by gup().
> 
> In addition, there's an expectation on some archs that a
> spurious write fault triggers a local TLB flush, and that is
> missing from the picture as well.
> 
> I decided that adding those "features" to gup() would be too much
> for this already too complex function, and instead added a new
> simpler fixup_user_fault() which is essentially a wrapper around
> handle_mm_fault() which the futex code can call.
> 
> Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
> ---
> 
> Shan, can you test this ? It might not fix the problem

um, what problem.  There's no description here of the user-visible
effects of the bug hence it's hard to work out what kernel version(s)
should receive this patch.

What kernel version(s) should receive this patch?

> since I'm
> starting to have the nasty feeling that you are hitting what is
> somewhat a subtly different issue or my previous patch should
> have worked (but then I might have done a stupid mistake as well)
> but let us know anyway.

I assume that Shan reported the secret problem so I added the
reported-by to the changelog.


^ permalink raw reply	[flat|nested] 138+ messages in thread

* Re: [RFC/PATCH] mm/futex: Fix futex writes on archs with SW tracking of dirty & young
@ 2011-07-21 22:36                               ` Andrew Morton
  0 siblings, 0 replies; 138+ messages in thread
From: Andrew Morton @ 2011-07-21 22:36 UTC (permalink / raw)
  To: Benjamin Herrenschmidt
  Cc: tony.luck, Peter Zijlstra, Shan Hai, Peter Zijlstra,
	linux-kernel, cmetcalf, dhowells, paulus, tglx, walken,
	linuxppc-dev

On Tue, 19 Jul 2011 14:29:22 +1000
Benjamin Herrenschmidt <benh@kernel.crashing.org> wrote:

> The futex code currently attempts to write to user memory within
> a pagefault disabled section, and if that fails, tries to fix it
> up using get_user_pages().
> 
> This doesn't work on archs where the dirty and young bits are
> maintained by software, since they will gate access permission
> in the TLB, and will not be updated by gup().
> 
> In addition, there's an expectation on some archs that a
> spurious write fault triggers a local TLB flush, and that is
> missing from the picture as well.
> 
> I decided that adding those "features" to gup() would be too much
> for this already too complex function, and instead added a new
> simpler fixup_user_fault() which is essentially a wrapper around
> handle_mm_fault() which the futex code can call.
> 
> Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
> ---
> 
> Shan, can you test this ? It might not fix the problem

um, what problem.  There's no description here of the user-visible
effects of the bug hence it's hard to work out what kernel version(s)
should receive this patch.

What kernel version(s) should receive this patch?

> since I'm
> starting to have the nasty feeling that you are hitting what is
> somewhat a subtly different issue or my previous patch should
> have worked (but then I might have done a stupid mistake as well)
> but let us know anyway.

I assume that Shan reported the secret problem so I added the
reported-by to the changelog.

^ permalink raw reply	[flat|nested] 138+ messages in thread

* Re: [RFC/PATCH] mm/futex: Fix futex writes on archs with SW tracking of dirty & young
  2011-07-21 22:36                               ` Andrew Morton
@ 2011-07-21 22:52                                 ` Benjamin Herrenschmidt
  -1 siblings, 0 replies; 138+ messages in thread
From: Benjamin Herrenschmidt @ 2011-07-21 22:52 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Shan Hai, Peter Zijlstra, Peter Zijlstra, paulus, tglx, walken,
	dhowells, cmetcalf, tony.luck, linuxppc-dev, linux-kernel

On Thu, 2011-07-21 at 15:36 -0700, Andrew Morton wrote:
> On Tue, 19 Jul 2011 14:29:22 +1000
> Benjamin Herrenschmidt <benh@kernel.crashing.org> wrote:
> 
> > The futex code currently attempts to write to user memory within
> > a pagefault disabled section, and if that fails, tries to fix it
> > up using get_user_pages().
> > 
> > This doesn't work on archs where the dirty and young bits are
> > maintained by software, since they will gate access permission
> > in the TLB, and will not be updated by gup().
> > 
> > In addition, there's an expectation on some archs that a
> > spurious write fault triggers a local TLB flush, and that is
> > missing from the picture as well.
> > 
> > I decided that adding those "features" to gup() would be too much
> > for this already too complex function, and instead added a new
> > simpler fixup_user_fault() which is essentially a wrapper around
> > handle_mm_fault() which the futex code can call.
> > 
> > Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
> > ---
> > 
> > Shan, can you test this ? It might not fix the problem
> 
> um, what problem.  There's no description here of the user-visible
> effects of the bug hence it's hard to work out what kernel version(s)
> should receive this patch.

Shan could give you an actual example (it was in the previous thread),
but basically, livelock as the kernel keeps trying and trying the
in_atomic op and never resolves it.
 
> What kernel version(s) should receive this patch?

I haven't dug. Probably anything it applies to, as far back as we've
been doing that trick of atomic + gup() for futex.

> > since I'm
> > starting to have the nasty feeling that you are hitting what is
> > somewhat a subtly different issue or my previous patch should
> > have worked (but then I might have done a stupid mistake as well)
> > but let us know anyway.
> 
> I assume that Shan reported the secret problem so I added the
> reported-by to the changelog.

He did :-) Shan, care to provide a rough explanation of what you
observed ?

Also Russell confirmed that ARM should be affected as well.

Cheers,
Ben.


^ permalink raw reply	[flat|nested] 138+ messages in thread

* Re: [RFC/PATCH] mm/futex: Fix futex writes on archs with SW tracking of dirty & young
@ 2011-07-21 22:52                                 ` Benjamin Herrenschmidt
  0 siblings, 0 replies; 138+ messages in thread
From: Benjamin Herrenschmidt @ 2011-07-21 22:52 UTC (permalink / raw)
  To: Andrew Morton
  Cc: tony.luck, Peter Zijlstra, Shan Hai, Peter Zijlstra,
	linux-kernel, cmetcalf, dhowells, paulus, tglx, walken,
	linuxppc-dev

On Thu, 2011-07-21 at 15:36 -0700, Andrew Morton wrote:
> On Tue, 19 Jul 2011 14:29:22 +1000
> Benjamin Herrenschmidt <benh@kernel.crashing.org> wrote:
> 
> > The futex code currently attempts to write to user memory within
> > a pagefault disabled section, and if that fails, tries to fix it
> > up using get_user_pages().
> > 
> > This doesn't work on archs where the dirty and young bits are
> > maintained by software, since they will gate access permission
> > in the TLB, and will not be updated by gup().
> > 
> > In addition, there's an expectation on some archs that a
> > spurious write fault triggers a local TLB flush, and that is
> > missing from the picture as well.
> > 
> > I decided that adding those "features" to gup() would be too much
> > for this already too complex function, and instead added a new
> > simpler fixup_user_fault() which is essentially a wrapper around
> > handle_mm_fault() which the futex code can call.
> > 
> > Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
> > ---
> > 
> > Shan, can you test this ? It might not fix the problem
> 
> um, what problem.  There's no description here of the user-visible
> effects of the bug hence it's hard to work out what kernel version(s)
> should receive this patch.

Shan could give you an actual example (it was in the previous thread),
but basically, livelock as the kernel keeps trying and trying the
in_atomic op and never resolves it.
 
> What kernel version(s) should receive this patch?

I haven't dug. Probably anything it applies to, as far back as we've
been doing that trick of atomic + gup() for futex.

> > since I'm
> > starting to have the nasty feeling that you are hitting what is
> > somewhat a subtly different issue or my previous patch should
> > have worked (but then I might have done a stupid mistake as well)
> > but let us know anyway.
> 
> I assume that Shan reported the secret problem so I added the
> reported-by to the changelog.

He did :-) Shan, care to provide a rough explanation of what you
observed ?

Also Russell confirmed that ARM should be affected as well.

Cheers,
Ben.

^ permalink raw reply	[flat|nested] 138+ messages in thread

* Re: [RFC/PATCH] mm/futex: Fix futex writes on archs with SW tracking of dirty & young
  2011-07-21 22:52                                 ` Benjamin Herrenschmidt
@ 2011-07-21 22:57                                   ` Benjamin Herrenschmidt
  -1 siblings, 0 replies; 138+ messages in thread
From: Benjamin Herrenschmidt @ 2011-07-21 22:57 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Shan Hai, Peter Zijlstra, Peter Zijlstra, paulus, tglx, walken,
	dhowells, cmetcalf, tony.luck, linuxppc-dev, linux-kernel

On Fri, 2011-07-22 at 08:52 +1000, Benjamin Herrenschmidt wrote:

> > um, what problem.  There's no description here of the user-visible
> > effects of the bug hence it's hard to work out what kernel version(s)
> > should receive this patch.
> 
> Shan could give you an actual example (it was in the previous thread),
> but basically, livelock as the kernel keeps trying and trying the
> in_atomic op and never resolves it.
>  
> > What kernel version(s) should receive this patch?
> 
> I haven't dug. Probably anything it applies on as far as we did that
> trick of atomic + gup() for futex.

Oops, I just realized I didn't document the problem at all in the
changelog .. sorry. I meant to say:

On archs that use SW tracking of dirty & young, a page without dirty is
effectively mapped read-only and a page without young is effectively
inaccessible in the PTE.

Additionally, some architectures might lazily flush the TLB when
relaxing write protection (by doing only a local flush), and expect a
fault to invalidate the stale entry if it's still present on another
processor.

The futex code assumes that if the "in_atomic()" access -EFAULT's, it
can "fix it up" by calling get_user_pages() which would then be
equivalent to taking the fault.

However that isn't the case. get_user_pages() will not call
handle_mm_fault() in the case where the PTE seems to have the right
permissions, regardless of the dirty and young state. It will eventually
update those bits ... in the struct page, but not in the PTE. 

Additionally, it will not handle the lazy TLB flushing that can be
required by some architectures in the fault case.

Basically, gup is the wrong interface for the job. The patch provides a
more appropriate one which boils down to just calling handle_mm_fault()
since what we are trying to do is simulate a real page fault.

Cheers,
Ben.

> > > since I'm
> > > starting to have the nasty feeling that you are hitting what is
> > > somewhat a subtly different issue or my previous patch should
> > > have worked (but then I might have done a stupid mistake as well)
> > > but let us know anyway.
> > 
> > I assume that Shan reported the secret problem so I added the
> > reported-by to the changelog.
> 
> He did :-) Shan, care to provide a rough explanation of what you
> observed ?
> 
> Also Russell confirmed that ARM should be affected as well.
> 
> Cheers,
> Ben.



^ permalink raw reply	[flat|nested] 138+ messages in thread

* Re: [RFC/PATCH] mm/futex: Fix futex writes on archs with SW tracking of dirty & young
@ 2011-07-21 22:57                                   ` Benjamin Herrenschmidt
  0 siblings, 0 replies; 138+ messages in thread
From: Benjamin Herrenschmidt @ 2011-07-21 22:57 UTC (permalink / raw)
  To: Andrew Morton
  Cc: tony.luck, Peter Zijlstra, Shan Hai, Peter Zijlstra,
	linux-kernel, cmetcalf, dhowells, paulus, tglx, walken,
	linuxppc-dev

On Fri, 2011-07-22 at 08:52 +1000, Benjamin Herrenschmidt wrote:

> > um, what problem.  There's no description here of the user-visible
> > effects of the bug hence it's hard to work out what kernel version(s)
> > should receive this patch.
> 
> Shan could give you an actual example (it was in the previous thread),
> but basically, livelock as the kernel keeps trying and trying the
> in_atomic op and never resolves it.
>  
> > What kernel version(s) should receive this patch?
> 
> I haven't dug. Probably anything it applies on as far as we did that
> trick of atomic + gup() for futex.

Oops, I just realized I didn't document the problem at all in the
changelog .. sorry. I meant to say:

On archs that use SW tracking of dirty & young, a page without dirty is
effectively mapped read-only and a page without young is effectively
inaccessible in the PTE.

Additionally, some architectures might lazily flush the TLB when
relaxing write protection (by doing only a local flush), and expect a
fault to invalidate the stale entry if it's still present on another
processor.

The futex code assumes that if the "in_atomic()" access -EFAULT's, it
can "fix it up" by calling get_user_pages() which would then be
equivalent to taking the fault.

However that isn't the case. get_user_pages() will not call
handle_mm_fault() in the case where the PTE seems to have the right
permissions, regardless of the dirty and young state. It will eventually
update those bits ... in the struct page, but not in the PTE. 

Additionally, it will not handle the lazy TLB flushing that can be
required by some architectures in the fault case.

Basically, gup is the wrong interface for the job. The patch provides a
more appropriate one which boils down to just calling handle_mm_fault()
since what we are trying to do is simulate a real page fault.

Cheers,
Ben.

> > > since I'm
> > > starting to have the nasty feeling that you are hitting what is
> > > somewhat a subtly different issue or my previous patch should
> > > have worked (but then I might have done a stupid mistake as well)
> > > but let us know anyway.
> > 
> > I assume that Shan reported the secret problem so I added the
> > reported-by to the changelog.
> 
> He did :-) Shan, care to provide a rough explanation of what you
> observed ?
> 
> Also Russell confirmed that ARM should be affected as well.
> 
> Cheers,
> Ben.

^ permalink raw reply	[flat|nested] 138+ messages in thread

* Re: [RFC/PATCH] mm/futex: Fix futex writes on archs with SW tracking of dirty & young
  2011-07-21 22:52                                 ` Benjamin Herrenschmidt
@ 2011-07-21 22:59                                   ` Andrew Morton
  -1 siblings, 0 replies; 138+ messages in thread
From: Andrew Morton @ 2011-07-21 22:59 UTC (permalink / raw)
  To: Benjamin Herrenschmidt
  Cc: Shan Hai, Peter Zijlstra, Peter Zijlstra, paulus, tglx, walken,
	dhowells, cmetcalf, tony.luck, linuxppc-dev, linux-kernel

On Fri, 22 Jul 2011 08:52:06 +1000
Benjamin Herrenschmidt <benh@kernel.crashing.org> wrote:

> On Thu, 2011-07-21 at 15:36 -0700, Andrew Morton wrote:
> > On Tue, 19 Jul 2011 14:29:22 +1000
> > Benjamin Herrenschmidt <benh@kernel.crashing.org> wrote:
> > 
> > > The futex code currently attempts to write to user memory within
> > > a pagefault disabled section, and if that fails, tries to fix it
> > > up using get_user_pages().
> > > 
> > > This doesn't work on archs where the dirty and young bits are
> > > maintained by software, since they will gate access permission
> > > in the TLB, and will not be updated by gup().
> > > 
> > > In addition, there's an expectation on some archs that a
> > > spurious write fault triggers a local TLB flush, and that is
> > > missing from the picture as well.
> > > 
> > > I decided that adding those "features" to gup() would be too much
> > > for this already too complex function, and instead added a new
> > > simpler fixup_user_fault() which is essentially a wrapper around
> > > handle_mm_fault() which the futex code can call.
> > > 
> > > Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
> > > ---
> > > 
> > > Shan, can you test this ? It might not fix the problem
> > 
> > um, what problem.  There's no description here of the user-visible
> > effects of the bug hence it's hard to work out what kernel version(s)
> > should receive this patch.
> 
> Shan could give you an actual example (it was in the previous thread),
> but basically, livelock as the kernel keeps trying and trying the
> in_atomic op and never resolves it.
>  
> > What kernel version(s) should receive this patch?
> 
> I haven't dug. Probably anything it applies on as far as we did that
> trick of atomic + gup() for futex.

You're not understanding me.

I need a good reason to merge this into 3.0.

The -stable maintainers need even better reasons to merge this into
earlier kernels.

Please provide those reasons!

(Documentation/stable_kernel_rules.txt, 4th bullet)

(And it's not just me and -stable maintainers.  Distro maintainers will
also look at this patch and wonder whether they should merge it)

^ permalink raw reply	[flat|nested] 138+ messages in thread

* Re: [RFC/PATCH] mm/futex: Fix futex writes on archs with SW tracking of dirty & young
@ 2011-07-21 22:59                                   ` Andrew Morton
  0 siblings, 0 replies; 138+ messages in thread
From: Andrew Morton @ 2011-07-21 22:59 UTC (permalink / raw)
  To: Benjamin Herrenschmidt
  Cc: tony.luck, Peter Zijlstra, Shan Hai, Peter Zijlstra,
	linux-kernel, cmetcalf, dhowells, paulus, tglx, walken,
	linuxppc-dev

On Fri, 22 Jul 2011 08:52:06 +1000
Benjamin Herrenschmidt <benh@kernel.crashing.org> wrote:

> On Thu, 2011-07-21 at 15:36 -0700, Andrew Morton wrote:
> > On Tue, 19 Jul 2011 14:29:22 +1000
> > Benjamin Herrenschmidt <benh@kernel.crashing.org> wrote:
> > 
> > > The futex code currently attempts to write to user memory within
> > > a pagefault disabled section, and if that fails, tries to fix it
> > > up using get_user_pages().
> > > 
> > > This doesn't work on archs where the dirty and young bits are
> > > maintained by software, since they will gate access permission
> > > in the TLB, and will not be updated by gup().
> > > 
> > > In addition, there's an expectation on some archs that a
> > > spurious write fault triggers a local TLB flush, and that is
> > > missing from the picture as well.
> > > 
> > > I decided that adding those "features" to gup() would be too much
> > > for this already too complex function, and instead added a new
> > > simpler fixup_user_fault() which is essentially a wrapper around
> > > handle_mm_fault() which the futex code can call.
> > > 
> > > Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
> > > ---
> > > 
> > > Shan, can you test this ? It might not fix the problem
> > 
> > um, what problem.  There's no description here of the user-visible
> > effects of the bug hence it's hard to work out what kernel version(s)
> > should receive this patch.
> 
> Shan could give you an actual example (it was in the previous thread),
> but basically, livelock as the kernel keeps trying and trying the
> in_atomic op and never resolves it.
>  
> > What kernel version(s) should receive this patch?
> 
> I haven't dug. Probably anything it applies on as far as we did that
> trick of atomic + gup() for futex.

You're not understanding me.

I need a good reason to merge this into 3.0.

The -stable maintainers need even better reasons to merge this into
earlier kernels.

Please provide those reasons!

(Documentation/stable_kernel_rules.txt, 4th bullet)

(And it's not just me and -stable maintainers.  Distro maintainers will
also look at this patch and wonder whether they should merge it)

^ permalink raw reply	[flat|nested] 138+ messages in thread

* Re: [RFC/PATCH] mm/futex: Fix futex writes on archs with SW tracking of dirty & young
  2011-07-21 22:59                                   ` Andrew Morton
@ 2011-07-22  1:40                                     ` Benjamin Herrenschmidt
  -1 siblings, 0 replies; 138+ messages in thread
From: Benjamin Herrenschmidt @ 2011-07-22  1:40 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Shan Hai, Peter Zijlstra, Peter Zijlstra, paulus, tglx, walken,
	dhowells, cmetcalf, tony.luck, linuxppc-dev, linux-kernel


> You're not understanding me.
> 
> I need a good reason to merge this into 3.0.
> 
> The -stable maintainers need even better reasons to merge this into
> earlier kernels.
> 
> Please provide those reasons!
> 
> (Documentation/stable_kernel_rules.txt, 4th bullet)
> 
> (And it's not just me and -stable maintainers.  Distro maintainers will
> also look at this patch and wonder whether they should merge it)

Well, as an arch maintainer, I can get stable maintainers to merge
anything I CC to stable :-)

Now, the good reason should have been rather obvious... it's a user
exploitable kernel lockup.

Cheers,
Ben.



^ permalink raw reply	[flat|nested] 138+ messages in thread

* Re: [RFC/PATCH] mm/futex: Fix futex writes on archs with SW tracking of dirty & young
@ 2011-07-22  1:40                                     ` Benjamin Herrenschmidt
  0 siblings, 0 replies; 138+ messages in thread
From: Benjamin Herrenschmidt @ 2011-07-22  1:40 UTC (permalink / raw)
  To: Andrew Morton
  Cc: tony.luck, Peter Zijlstra, Shan Hai, Peter Zijlstra,
	linux-kernel, cmetcalf, dhowells, paulus, tglx, walken,
	linuxppc-dev


> You're not understanding me.
> 
> I need a good reason to merge this into 3.0.
> 
> The -stable maintainers need even better reasons to merge this into
> earlier kernels.
> 
> Please provide those reasons!
> 
> (Documentation/stable_kernel_rules.txt, 4th bullet)
> 
> (And it's not just me and -stable maintainers.  Distro maintainers will
> also look at this patch and wonder whether they should merge it)

Well, as an arch maintainer, I can get stable maintainers to merge
anything I CC to stable :-)

Now, the good reason should have been rather obvious... it's a user
exploitable kernel lockup.

Cheers,
Ben.

^ permalink raw reply	[flat|nested] 138+ messages in thread

* Re: [RFC/PATCH] mm/futex: Fix futex writes on archs with SW tracking of dirty & young
  2011-07-21 22:59                                   ` Andrew Morton
@ 2011-07-22  1:54                                     ` Shan Hai
  -1 siblings, 0 replies; 138+ messages in thread
From: Shan Hai @ 2011-07-22  1:54 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Benjamin Herrenschmidt, Peter Zijlstra, Peter Zijlstra, paulus,
	tglx, walken, dhowells, cmetcalf, tony.luck, linuxppc-dev,
	linux-kernel

On 07/22/2011 06:59 AM, Andrew Morton wrote:
> On Fri, 22 Jul 2011 08:52:06 +1000
> Benjamin Herrenschmidt<benh@kernel.crashing.org>  wrote:
>
>> On Thu, 2011-07-21 at 15:36 -0700, Andrew Morton wrote:
>>> On Tue, 19 Jul 2011 14:29:22 +1000
>>> Benjamin Herrenschmidt<benh@kernel.crashing.org>  wrote:
>>>
>>>> The futex code currently attempts to write to user memory within
>>>> a pagefault disabled section, and if that fails, tries to fix it
>>>> up using get_user_pages().
>>>>
>>>> This doesn't work on archs where the dirty and young bits are
>>>> maintained by software, since they will gate access permission
>>>> in the TLB, and will not be updated by gup().
>>>>
>>>> In addition, there's an expectation on some archs that a
>>>> spurious write fault triggers a local TLB flush, and that is
>>>> missing from the picture as well.
>>>>
>>>> I decided that adding those "features" to gup() would be too much
>>>> for this already too complex function, and instead added a new
>>>> simpler fixup_user_fault() which is essentially a wrapper around
>>>> handle_mm_fault() which the futex code can call.
>>>>
>>>> Signed-off-by: Benjamin Herrenschmidt<benh@kernel.crashing.org>
>>>> ---
>>>>
>>>> Shan, can you test this ? It might not fix the problem
>>> um, what problem.  There's no description here of the user-visible
>>> effects of the bug hence it's hard to work out what kernel version(s)
>>> should receive this patch.
>> Shan could give you an actual example (it was in the previous thread),
>> but basically, livelock as the kernel keeps trying and trying the
>> in_atomic op and never resolves it.
>>
>>> What kernel version(s) should receive this patch?
>> I haven't dug. Probably anything it applies on as far as we did that
>> trick of atomic + gup() for futex.
> You're not understanding me.
>
> I need a good reason to merge this into 3.0.
>
> The -stable maintainers need even better reasons to merge this into
> earlier kernels.
>
> Please provide those reasons!
>

Summary:
- Encountered a 100% CPU system usage problem on a pthread_mutex allocated
  in a shared memory region; the problem occurs only when
  PRIORITY_INHERITANCE is set on the pthread_mutex.
- The ftrace result reveals that an infinite loop in futex_lock_pi()
  caused the high CPU usage.
- The powerpc e500 was affected but x86 was not.  I have not tested other
  archs, so I am not sure whether they are affected by the problem.
- Tested on 2.6.34 and 3.0-rc7; both are affected, and earlier versions
  might be affected as well.

Please refer to the threads "[PATCH 0/1] Fixup write permission of TLB on
powerpc e500 core" and "[PATCH 1/1] Fixup write permission of TLB on
powerpc e500 core" for the whole story.
The test case code is provided in [PATCH 0/1].

Thanks
Shan Hai

> (Documentation/stable_kernel_rules.txt, 4th bullet)
>
> (And it's not just me and -stable maintainers.  Distro maintainers will
> also look at this patch and wonder whether they should merge it)


^ permalink raw reply	[flat|nested] 138+ messages in thread

* Re: [RFC/PATCH] mm/futex: Fix futex writes on archs with SW tracking of dirty & young
@ 2011-07-22  1:54                                     ` Shan Hai
  0 siblings, 0 replies; 138+ messages in thread
From: Shan Hai @ 2011-07-22  1:54 UTC (permalink / raw)
  To: Andrew Morton
  Cc: tony.luck, Peter Zijlstra, Peter Zijlstra, linux-kernel,
	cmetcalf, dhowells, paulus, tglx, walken, linuxppc-dev

On 07/22/2011 06:59 AM, Andrew Morton wrote:
> On Fri, 22 Jul 2011 08:52:06 +1000
> Benjamin Herrenschmidt<benh@kernel.crashing.org>  wrote:
>
>> On Thu, 2011-07-21 at 15:36 -0700, Andrew Morton wrote:
>>> On Tue, 19 Jul 2011 14:29:22 +1000
>>> Benjamin Herrenschmidt<benh@kernel.crashing.org>  wrote:
>>>
>>>> The futex code currently attempts to write to user memory within
>>>> a pagefault disabled section, and if that fails, tries to fix it
>>>> up using get_user_pages().
>>>>
>>>> This doesn't work on archs where the dirty and young bits are
>>>> maintained by software, since they will gate access permission
>>>> in the TLB, and will not be updated by gup().
>>>>
>>>> In addition, there's an expectation on some archs that a
>>>> spurious write fault triggers a local TLB flush, and that is
>>>> missing from the picture as well.
>>>>
>>>> I decided that adding those "features" to gup() would be too much
>>>> for this already too complex function, and instead added a new
>>>> simpler fixup_user_fault() which is essentially a wrapper around
>>>> handle_mm_fault() which the futex code can call.
>>>>
>>>> Signed-off-by: Benjamin Herrenschmidt<benh@kernel.crashing.org>
>>>> ---
>>>>
>>>> Shan, can you test this ? It might not fix the problem
>>> um, what problem.  There's no description here of the user-visible
>>> effects of the bug hence it's hard to work out what kernel version(s)
>>> should receive this patch.
>> Shan could give you an actual example (it was in the previous thread),
>> but basically, livelock as the kernel keeps trying and trying the
>> in_atomic op and never resolves it.
>>
>>> What kernel version(s) should receive this patch?
>> I haven't dug. Probably anything it applies on as far as we did that
>> trick of atomic + gup() for futex.
> You're not understanding me.
>
> I need a good reason to merge this into 3.0.
>
> The -stable maintainers need even better reasons to merge this into
> earlier kernels.
>
> Please provide those reasons!
>

Summary:
- Encountered a 100% CPU system usage problem on a pthread_mutex allocated
  in a shared memory region; the problem occurs only when
  PRIORITY_INHERITANCE is set on the pthread_mutex.
- The ftrace result reveals that an infinite loop in futex_lock_pi()
  caused the high CPU usage.
- The powerpc e500 was affected but x86 was not.  I have not tested other
  archs, so I am not sure whether they are affected by the problem.
- Tested on 2.6.34 and 3.0-rc7; both are affected, and earlier versions
  might be affected as well.

Please refer to the threads "[PATCH 0/1] Fixup write permission of TLB on
powerpc e500 core" and "[PATCH 1/1] Fixup write permission of TLB on
powerpc e500 core" for the whole story.
The test case code is provided in [PATCH 0/1].

Thanks
Shan Hai

> (Documentation/stable_kernel_rules.txt, 4th bullet)
>
> (And it's not just me and -stable maintainers.  Distro maintainers will
> also look at this patch and wonder whether they should merge it)

^ permalink raw reply	[flat|nested] 138+ messages in thread

* Re: [RFC/PATCH] mm/futex: Fix futex writes on archs with SW tracking of dirty & young
  2011-07-19  4:29                             ` Benjamin Herrenschmidt
@ 2011-07-27  6:50                               ` Mike Frysinger
  -1 siblings, 0 replies; 138+ messages in thread
From: Mike Frysinger @ 2011-07-27  6:50 UTC (permalink / raw)
  To: Benjamin Herrenschmidt
  Cc: Shan Hai, Peter Zijlstra, Peter Zijlstra, paulus, tglx, walken,
	dhowells, cmetcalf, tony.luck, akpm, linuxppc-dev, linux-kernel,
	uclinux-dist-devel

On Mon, Jul 18, 2011 at 21:29, Benjamin Herrenschmidt wrote:
> The futex code currently attempts to write to user memory within
> a pagefault disabled section, and if that fails, tries to fix it
> up using get_user_pages().
>
> This doesn't work on archs where the dirty and young bits are
> maintained by software, since they will gate access permission
> in the TLB, and will not be updated by gup().
>
> In addition, there's an expectation on some archs that a
> spurious write fault triggers a local TLB flush, and that is
> missing from the picture as well.
>
> I decided that adding those "features" to gup() would be too much
> for this already too complex function, and instead added a new
> simpler fixup_user_fault() which is essentially a wrapper around
> handle_mm_fault() which the futex code can call.

unfortunately, this breaks all nommu ports.  you added
fixup_user_fault() to mm/memory.c only which is not used by nommu
logic.
-mike

^ permalink raw reply	[flat|nested] 138+ messages in thread

* Re: [RFC/PATCH] mm/futex: Fix futex writes on archs with SW tracking of dirty & young
@ 2011-07-27  6:50                               ` Mike Frysinger
  0 siblings, 0 replies; 138+ messages in thread
From: Mike Frysinger @ 2011-07-27  6:50 UTC (permalink / raw)
  To: Benjamin Herrenschmidt
  Cc: tony.luck, Peter Zijlstra, Shan Hai, Peter Zijlstra,
	linux-kernel, cmetcalf, dhowells, paulus, uclinux-dist-devel,
	tglx, walken, linuxppc-dev, akpm

On Mon, Jul 18, 2011 at 21:29, Benjamin Herrenschmidt wrote:
> The futex code currently attempts to write to user memory within
> a pagefault disabled section, and if that fails, tries to fix it
> up using get_user_pages().
>
> This doesn't work on archs where the dirty and young bits are
> maintained by software, since they will gate access permission
> in the TLB, and will not be updated by gup().
>
> In addition, there's an expectation on some archs that a
> spurious write fault triggers a local TLB flush, and that is
> missing from the picture as well.
>
> I decided that adding those "features" to gup() would be too much
> for this already too complex function, and instead added a new
> simpler fixup_user_fault() which is essentially a wrapper around
> handle_mm_fault() which the futex code can call.

unfortunately, this breaks all nommu ports.  you added
fixup_user_fault() to mm/memory.c only which is not used by nommu
logic.
-mike

^ permalink raw reply	[flat|nested] 138+ messages in thread

* Re: [RFC/PATCH] mm/futex: Fix futex writes on archs with SW tracking of dirty & young
  2011-07-27  6:50                               ` Mike Frysinger
@ 2011-07-27  7:58                                 ` Benjamin Herrenschmidt
  -1 siblings, 0 replies; 138+ messages in thread
From: Benjamin Herrenschmidt @ 2011-07-27  7:58 UTC (permalink / raw)
  To: Mike Frysinger
  Cc: Shan Hai, Peter Zijlstra, Peter Zijlstra, paulus, tglx, walken,
	dhowells, cmetcalf, tony.luck, akpm, linuxppc-dev, linux-kernel,
	uclinux-dist-devel

On Tue, 2011-07-26 at 23:50 -0700, Mike Frysinger wrote:
> On Mon, Jul 18, 2011 at 21:29, Benjamin Herrenschmidt wrote:
> > The futex code currently attempts to write to user memory within
> > a pagefault disabled section, and if that fails, tries to fix it
> > up using get_user_pages().
> >
> > This doesn't work on archs where the dirty and young bits are
> > maintained by software, since they will gate access permission
> > in the TLB, and will not be updated by gup().
> >
> > In addition, there's an expectation on some archs that a
> > spurious write fault triggers a local TLB flush, and that is
> > missing from the picture as well.
> >
> > I decided that adding those "features" to gup() would be too much
> > for this already too complex function, and instead added a new
> > simpler fixup_user_fault() which is essentially a wrapper around
> > handle_mm_fault() which the futex code can call.
> 
> unfortunately, this breaks all nommu ports.  you added
> fixup_user_fault() to mm/memory.c only which is not used by nommu

Argh. Andrew, do you want to send a fix ? I won't be able to do that
tonight, I have to go.

What should nommu do anyways ? it's not like there's much it can do
right ? It should never even hit the fault path to start with ...

Cheers,
Ben.



^ permalink raw reply	[flat|nested] 138+ messages in thread

* Re: [RFC/PATCH] mm/futex: Fix futex writes on archs with SW tracking of dirty & young
@ 2011-07-27  7:58                                 ` Benjamin Herrenschmidt
  0 siblings, 0 replies; 138+ messages in thread
From: Benjamin Herrenschmidt @ 2011-07-27  7:58 UTC (permalink / raw)
  To: Mike Frysinger
  Cc: tony.luck, Peter Zijlstra, Shan Hai, Peter Zijlstra,
	linux-kernel, cmetcalf, dhowells, paulus, uclinux-dist-devel,
	tglx, walken, linuxppc-dev, akpm

On Tue, 2011-07-26 at 23:50 -0700, Mike Frysinger wrote:
> On Mon, Jul 18, 2011 at 21:29, Benjamin Herrenschmidt wrote:
> > The futex code currently attempts to write to user memory within
> > a pagefault disabled section, and if that fails, tries to fix it
> > up using get_user_pages().
> >
> > This doesn't work on archs where the dirty and young bits are
> > maintained by software, since they will gate access permission
> > in the TLB, and will not be updated by gup().
> >
> > In addition, there's an expectation on some archs that a
> > spurious write fault triggers a local TLB flush, and that is
> > missing from the picture as well.
> >
> > I decided that adding those "features" to gup() would be too much
> > for this already too complex function, and instead added a new
> > simpler fixup_user_fault() which is essentially a wrapper around
> > handle_mm_fault() which the futex code can call.
> 
> unfortunately, this breaks all nommu ports.  you added
> fixup_user_fault() to mm/memory.c only which is not used by nommu

Argh. Andrew, do you want to send a fix ? I won't be able to do that
tonight, I have to go.

What should nommu do anyways ? it's not like there's much it can do
right ? It should never even hit the fault path to start with ...

Cheers,
Ben.

^ permalink raw reply	[flat|nested] 138+ messages in thread

* Re: [RFC/PATCH] mm/futex: Fix futex writes on archs with SW tracking of dirty & young
  2011-07-27  7:58                                 ` Benjamin Herrenschmidt
@ 2011-07-27  8:59                                   ` Peter Zijlstra
  -1 siblings, 0 replies; 138+ messages in thread
From: Peter Zijlstra @ 2011-07-27  8:59 UTC (permalink / raw)
  To: Benjamin Herrenschmidt
  Cc: Mike Frysinger, Shan Hai, paulus, tglx, walken, dhowells,
	cmetcalf, tony.luck, akpm, linuxppc-dev, linux-kernel,
	uclinux-dist-devel

On Wed, 2011-07-27 at 17:58 +1000, Benjamin Herrenschmidt wrote:

> What should nommu do anyways ? it's not like there's much it can do
> right ? It should never even hit the fault path to start with ...

Something like the below makes a nommu arm config build.. David, is this
indeed the correct thing to do for nommu?

---
Index: linux-2.6/mm/nommu.c
===================================================================
--- linux-2.6.orig/mm/nommu.c
+++ linux-2.6/mm/nommu.c
@@ -190,6 +190,12 @@ int get_user_pages(struct task_struct *t
 }
 EXPORT_SYMBOL(get_user_pages);
 
+int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm,
+		     unsigned long address, unsigned int fault_flags)
+{
+	BUG(); /* nommu should never call this */
+}
+
 /**
  * follow_pfn - look up PFN at a user virtual address
  * @vma: memory mapping


^ permalink raw reply	[flat|nested] 138+ messages in thread

* Re: [RFC/PATCH] mm/futex: Fix futex writes on archs with SW tracking of dirty & young
@ 2011-07-27  8:59                                   ` Peter Zijlstra
  0 siblings, 0 replies; 138+ messages in thread
From: Peter Zijlstra @ 2011-07-27  8:59 UTC (permalink / raw)
  To: Benjamin Herrenschmidt
  Cc: tony.luck, Mike Frysinger, Shan Hai, linux-kernel, cmetcalf,
	dhowells, paulus, uclinux-dist-devel, tglx, walken, linuxppc-dev,
	akpm

On Wed, 2011-07-27 at 17:58 +1000, Benjamin Herrenschmidt wrote:

> What should nommu do anyways ? it's not like there's much it can do
> right ? It should never even hit the fault path to start with ...

Something like the below makes a nommu arm config build.. David, is this
indeed the correct thing to do for nommu?

---
Index: linux-2.6/mm/nommu.c
===================================================================
--- linux-2.6.orig/mm/nommu.c
+++ linux-2.6/mm/nommu.c
@@ -190,6 +190,12 @@ int get_user_pages(struct task_struct *t
 }
 EXPORT_SYMBOL(get_user_pages);
 
+int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm,
+		     unsigned long address, unsigned int fault_flags)
+{
+	BUG(); /* nommu should never call this */
+}
+
 /**
  * follow_pfn - look up PFN at a user virtual address
  * @vma: memory mapping

^ permalink raw reply	[flat|nested] 138+ messages in thread

* Re: [RFC/PATCH] mm/futex: Fix futex writes on archs with SW tracking of dirty & young
  2011-07-27  7:58                                 ` Benjamin Herrenschmidt
@ 2011-07-27 10:09                                   ` David Howells
  -1 siblings, 0 replies; 138+ messages in thread
From: David Howells @ 2011-07-27 10:09 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: dhowells, Benjamin Herrenschmidt, Mike Frysinger, Shan Hai,
	paulus, tglx, walken, cmetcalf, tony.luck, akpm, linuxppc-dev,
	linux-kernel, uclinux-dist-devel

Peter Zijlstra <peterz@infradead.org> wrote:

> > What should nommu do anyways ? it's not like there's much it can do
> > right ? It should never even hit the fault path to start with ...
> 
> Something like the below makes a nommu arm config build.. David, is this
> indeed the correct thing to do for nommu?
> 
> ---
> Index: linux-2.6/mm/nommu.c
> ===================================================================
> --- linux-2.6.orig/mm/nommu.c
> +++ linux-2.6/mm/nommu.c
> @@ -190,6 +190,12 @@ int get_user_pages(struct task_struct *t
>  }
>  EXPORT_SYMBOL(get_user_pages);
>  
> +int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm,
> +		     unsigned long address, unsigned int fault_flags)
> +{
> +	BUG(); /* nommu should never call this */
> +}
> +
>  /**
>   * follow_pfn - look up PFN at a user virtual address
>   * @vma: memory mapping

Or perhaps send SEGV?  Can 'address' be bad at this point?

Can you inline this for the NOMMU case please?

David

^ permalink raw reply	[flat|nested] 138+ messages in thread

* Re: [RFC/PATCH] mm/futex: Fix futex writes on archs with SW tracking of dirty & young
@ 2011-07-27 10:09                                   ` David Howells
  0 siblings, 0 replies; 138+ messages in thread
From: David Howells @ 2011-07-27 10:09 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: tony.luck, Mike Frysinger, Shan Hai, linux-kernel, cmetcalf,
	dhowells, paulus, uclinux-dist-devel, tglx, walken, linuxppc-dev,
	akpm

Peter Zijlstra <peterz@infradead.org> wrote:

> > What should nommu do anyways ? it's not like there's much it can do
> > right ? It should never even hit the fault path to start with ...
> 
> Something like the below makes a nommu arm config build.. David, is this
> indeed the correct thing to do for nommu?
> 
> ---
> Index: linux-2.6/mm/nommu.c
> ===================================================================
> --- linux-2.6.orig/mm/nommu.c
> +++ linux-2.6/mm/nommu.c
> @@ -190,6 +190,12 @@ int get_user_pages(struct task_struct *t
>  }
>  EXPORT_SYMBOL(get_user_pages);
>  
> +int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm,
> +		     unsigned long address, unsigned int fault_flags)
> +{
> +	BUG(); /* nommu should never call this */
> +}
> +
>  /**
>   * follow_pfn - look up PFN at a user virtual address
>   * @vma: memory mapping

Or perhaps send SEGV?  Can 'address' be bad at this point?

Can you inline this for the NOMMU case please?

David

^ permalink raw reply	[flat|nested] 138+ messages in thread

* Re: [RFC/PATCH] mm/futex: Fix futex writes on archs with SW tracking of dirty & young
  2011-07-27 10:09                                   ` David Howells
@ 2011-07-27 10:17                                     ` Peter Zijlstra
  -1 siblings, 0 replies; 138+ messages in thread
From: Peter Zijlstra @ 2011-07-27 10:17 UTC (permalink / raw)
  To: David Howells
  Cc: Benjamin Herrenschmidt, Mike Frysinger, Shan Hai, paulus, tglx,
	walken, cmetcalf, tony.luck, akpm, linuxppc-dev, linux-kernel,
	uclinux-dist-devel

On Wed, 2011-07-27 at 11:09 +0100, David Howells wrote:
> Can you inline this for the NOMMU case please?

---
Subject: mm: Fix fixup_user_fault() for MMU=n 

In commit 2efaca927 ("mm/futex: fix futex writes on archs with SW
tracking of dirty & young") we forgot about MMU=n. This patch fixes
that.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
---
Index: linux-2.6/include/linux/mm.h
===================================================================
--- linux-2.6.orig/include/linux/mm.h
+++ linux-2.6/include/linux/mm.h
@@ -962,6 +962,8 @@ int invalidate_inode_page(struct page *p
 #ifdef CONFIG_MMU
 extern int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 			unsigned long address, unsigned int flags);
+extern int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm,
+			    unsigned long address, unsigned int fault_flags);
 #else
 static inline int handle_mm_fault(struct mm_struct *mm,
 			struct vm_area_struct *vma, unsigned long address,
@@ -971,6 +973,14 @@ static inline int handle_mm_fault(struct
 	BUG();
 	return VM_FAULT_SIGBUS;
 }
+static inline int fixup_user_fault(struct task_struct *tsk, 
+		struct mm_struct *mm, unsigned long address,
+		unsigned int fault_flags)
+{
+	/* should never happen if there's no MMU */
+	BUG();
+	return -EFAULT;
+}
 #endif
 
 extern int make_pages_present(unsigned long addr, unsigned long end);
@@ -988,8 +998,6 @@ int get_user_pages(struct task_struct *t
 int get_user_pages_fast(unsigned long start, int nr_pages, int write,
 			struct page **pages);
 struct page *get_dump_page(unsigned long addr);
-extern int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm,
-			    unsigned long address, unsigned int fault_flags);
 
 extern int try_to_release_page(struct page * page, gfp_t gfp_mask);
 extern void do_invalidatepage(struct page *page, unsigned long offset);


^ permalink raw reply	[flat|nested] 138+ messages in thread

* Re: [RFC/PATCH] mm/futex: Fix futex writes on archs with SW tracking of dirty & young
@ 2011-07-27 10:17                                     ` Peter Zijlstra
  0 siblings, 0 replies; 138+ messages in thread
From: Peter Zijlstra @ 2011-07-27 10:17 UTC (permalink / raw)
  To: David Howells
  Cc: tony.luck, Mike Frysinger, Shan Hai, linux-kernel, cmetcalf,
	paulus, uclinux-dist-devel, tglx, walken, linuxppc-dev, akpm

On Wed, 2011-07-27 at 11:09 +0100, David Howells wrote:
> Can you inline this for the NOMMU case please?

---
Subject: mm: Fix fixup_user_fault() for MMU=n

In commit 2efaca927 ("mm/futex: fix futex writes on archs with SW
tracking of dirty & young") we forgot about MMU=n. This patch fixes
that.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
---
Index: linux-2.6/include/linux/mm.h
===================================================================
--- linux-2.6.orig/include/linux/mm.h
+++ linux-2.6/include/linux/mm.h
@@ -962,6 +962,8 @@ int invalidate_inode_page(struct page *p
 #ifdef CONFIG_MMU
 extern int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 			unsigned long address, unsigned int flags);
+extern int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm,
+			    unsigned long address, unsigned int fault_flags);
 #else
 static inline int handle_mm_fault(struct mm_struct *mm,
 			struct vm_area_struct *vma, unsigned long address,
@@ -971,6 +973,14 @@ static inline int handle_mm_fault(struct
 	BUG();
 	return VM_FAULT_SIGBUS;
 }
+static inline int fixup_user_fault(struct task_struct *tsk, 
+		struct mm_struct *mm, unsigned long address,
+		unsigned int fault_flags)
+{
+	/* should never happen if there's no MMU */
+	BUG();
+	return -EFAULT;
+}
 #endif
 
 extern int make_pages_present(unsigned long addr, unsigned long end);
@@ -988,8 +998,6 @@ int get_user_pages(struct task_struct *t
 int get_user_pages_fast(unsigned long start, int nr_pages, int write,
 			struct page **pages);
 struct page *get_dump_page(unsigned long addr);
-extern int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm,
-			    unsigned long address, unsigned int fault_flags);
 
 extern int try_to_release_page(struct page * page, gfp_t gfp_mask);
 extern void do_invalidatepage(struct page *page, unsigned long offset);

^ permalink raw reply	[flat|nested] 138+ messages in thread

* Re: [RFC/PATCH] mm/futex: Fix futex writes on archs with SW tracking of dirty & young
  2011-07-27 10:17                                     ` Peter Zijlstra
@ 2011-07-27 10:20                                       ` Benjamin Herrenschmidt
  -1 siblings, 0 replies; 138+ messages in thread
From: Benjamin Herrenschmidt @ 2011-07-27 10:20 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: David Howells, Mike Frysinger, Shan Hai, paulus, tglx, walken,
	cmetcalf, tony.luck, akpm, linuxppc-dev, linux-kernel,
	uclinux-dist-devel

On Wed, 2011-07-27 at 12:17 +0200, Peter Zijlstra wrote:
> On Wed, 2011-07-27 at 11:09 +0100, David Howells wrote:
> > Can you inline this for the NOMMU case please?
> 
> ---
> Subject: mm: Fix fixup_user_fault() for MMU=n 
> 
> In commit 2efaca927 ("mm/futex: fix futex writes on archs with SW
> tracking of dirty & young") we forgot about MMU=n. This patch fixes
> that.
> 
> Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>

Hoping the BUG() isn't trippable by userspace but then it's no mmu, it's
not like we care what userspace can do right :-)

Acked-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>

Thanks !

Cheers,
Ben.

> ---
> Index: linux-2.6/include/linux/mm.h
> ===================================================================
> --- linux-2.6.orig/include/linux/mm.h
> +++ linux-2.6/include/linux/mm.h
> @@ -962,6 +962,8 @@ int invalidate_inode_page(struct page *p
>  #ifdef CONFIG_MMU
>  extern int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
>  			unsigned long address, unsigned int flags);
> +extern int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm,
> +			    unsigned long address, unsigned int fault_flags);
>  #else
>  static inline int handle_mm_fault(struct mm_struct *mm,
>  			struct vm_area_struct *vma, unsigned long address,
> @@ -971,6 +973,14 @@ static inline int handle_mm_fault(struct
>  	BUG();
>  	return VM_FAULT_SIGBUS;
>  }
> +static inline int fixup_user_fault(struct task_struct *tsk, 
> +		struct mm_struct *mm, unsigned long address,
> +		unsigned int fault_flags)
> +{
> +	/* should never happen if there's no MMU */
> +	BUG();
> +	return -EFAULT;
> +}
>  #endif
>  
>  extern int make_pages_present(unsigned long addr, unsigned long end);
> @@ -988,8 +998,6 @@ int get_user_pages(struct task_struct *t
>  int get_user_pages_fast(unsigned long start, int nr_pages, int write,
>  			struct page **pages);
>  struct page *get_dump_page(unsigned long addr);
> -extern int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm,
> -			    unsigned long address, unsigned int fault_flags);
>  
>  extern int try_to_release_page(struct page * page, gfp_t gfp_mask);
>  extern void do_invalidatepage(struct page *page, unsigned long offset);
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at  http://www.tux.org/lkml/



^ permalink raw reply	[flat|nested] 138+ messages in thread

* Re: [RFC/PATCH] mm/futex: Fix futex writes on archs with SW tracking of dirty & young
@ 2011-07-27 10:20                                       ` Benjamin Herrenschmidt
  0 siblings, 0 replies; 138+ messages in thread
From: Benjamin Herrenschmidt @ 2011-07-27 10:20 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: tony.luck, Mike Frysinger, Shan Hai, linux-kernel, cmetcalf,
	David Howells, paulus, uclinux-dist-devel, tglx, walken,
	linuxppc-dev, akpm

On Wed, 2011-07-27 at 12:17 +0200, Peter Zijlstra wrote:
> On Wed, 2011-07-27 at 11:09 +0100, David Howells wrote:
> > Can you inline this for the NOMMU case please?
> 
> ---
> Subject: mm: Fix fixup_user_fault() for MMU=n 
> 
> In commit 2efaca927 ("mm/futex: fix futex writes on archs with SW
> tracking of dirty & young") we forgot about MMU=n. This patch fixes
> that.
> 
> Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>

Hoping the BUG() isn't trippable by userspace but then it's no mmu, it's
not like we care what userspace can do right :-)

Acked-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>

Thanks !

Cheers,
Ben.

> ---
> Index: linux-2.6/include/linux/mm.h
> ===================================================================
> --- linux-2.6.orig/include/linux/mm.h
> +++ linux-2.6/include/linux/mm.h
> @@ -962,6 +962,8 @@ int invalidate_inode_page(struct page *p
>  #ifdef CONFIG_MMU
>  extern int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
>  			unsigned long address, unsigned int flags);
> +extern int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm,
> +			    unsigned long address, unsigned int fault_flags);
>  #else
>  static inline int handle_mm_fault(struct mm_struct *mm,
>  			struct vm_area_struct *vma, unsigned long address,
> @@ -971,6 +973,14 @@ static inline int handle_mm_fault(struct
>  	BUG();
>  	return VM_FAULT_SIGBUS;
>  }
> +static inline int fixup_user_fault(struct task_struct *tsk, 
> +		struct mm_struct *mm, unsigned long address,
> +		unsigned int fault_flags)
> +{
> +	/* should never happen if there's no MMU */
> +	BUG();
> +	return -EFAULT;
> +}
>  #endif
>  
>  extern int make_pages_present(unsigned long addr, unsigned long end);
> @@ -988,8 +998,6 @@ int get_user_pages(struct task_struct *t
>  int get_user_pages_fast(unsigned long start, int nr_pages, int write,
>  			struct page **pages);
>  struct page *get_dump_page(unsigned long addr);
> -extern int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm,
> -			    unsigned long address, unsigned int fault_flags);
>  
>  extern int try_to_release_page(struct page * page, gfp_t gfp_mask);
>  extern void do_invalidatepage(struct page *page, unsigned long offset);
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at  http://www.tux.org/lkml/

^ permalink raw reply	[flat|nested] 138+ messages in thread

* Re: [RFC/PATCH] mm/futex: Fix futex writes on archs with SW tracking of dirty & young
  2011-07-27 10:20                                       ` Benjamin Herrenschmidt
@ 2011-07-28  0:12                                         ` Mike Frysinger
  -1 siblings, 0 replies; 138+ messages in thread
From: Mike Frysinger @ 2011-07-28  0:12 UTC (permalink / raw)
  To: Benjamin Herrenschmidt
  Cc: Peter Zijlstra, David Howells, Shan Hai, paulus, tglx, walken,
	cmetcalf, tony.luck, akpm, linuxppc-dev, linux-kernel,
	uclinux-dist-devel

On Wed, Jul 27, 2011 at 03:20, Benjamin Herrenschmidt wrote:
> Hoping the BUG() isn't trippable by userspace but then it's no mmu, it's
> not like we care what userspace can do right :-)

side note ... common misconception that "no mmu" == "no memory
protection".  a few of the nommu processors have memory protection,
just no virtual<->physical translation.

thanks for the patch !
-mike

^ permalink raw reply	[flat|nested] 138+ messages in thread

* Re: [RFC/PATCH] mm/futex: Fix futex writes on archs with SW tracking of dirty & young
@ 2011-07-28  0:12                                         ` Mike Frysinger
  0 siblings, 0 replies; 138+ messages in thread
From: Mike Frysinger @ 2011-07-28  0:12 UTC (permalink / raw)
  To: Benjamin Herrenschmidt
  Cc: tony.luck, Shan Hai, Peter Zijlstra, linux-kernel, cmetcalf,
	David Howells, paulus, uclinux-dist-devel, tglx, walken,
	linuxppc-dev, akpm

On Wed, Jul 27, 2011 at 03:20, Benjamin Herrenschmidt wrote:
> Hoping the BUG() isn't trippable by userspace but then it's no mmu, it's
> not like we care what userspace can do right :-)

side note ... common misconception that "no mmu" == "no memory
protection".  a few of the nommu processors have memory protection,
just no virtual<->physical translation.

thanks for the patch !
-mike

^ permalink raw reply	[flat|nested] 138+ messages in thread

* Re: [RFC/PATCH] mm/futex: Fix futex writes on archs with SW tracking of dirty & young
  2011-07-27 10:09                                   ` David Howells
@ 2011-07-28 10:55                                     ` David Howells
  -1 siblings, 0 replies; 138+ messages in thread
From: David Howells @ 2011-07-28 10:55 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: dhowells, Benjamin Herrenschmidt, Mike Frysinger, Shan Hai,
	paulus, tglx, walken, cmetcalf, tony.luck, akpm, linuxppc-dev,
	linux-kernel, uclinux-dist-devel

Peter Zijlstra <peterz@infradead.org> wrote:

> Subject: mm: Fix fixup_user_fault() for MMU=n 
> 
> In commit 2efaca927 ("mm/futex: fix futex writes on archs with SW
> tracking of dirty & young") we forgot about MMU=n. This patch fixes
> that.
> 
> Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>

Acked-by: David Howells <dhowells@redhat.com>

^ permalink raw reply	[flat|nested] 138+ messages in thread

* Re: [RFC/PATCH] mm/futex: Fix futex writes on archs with SW tracking of dirty & young
@ 2011-07-28 10:55                                     ` David Howells
  0 siblings, 0 replies; 138+ messages in thread
From: David Howells @ 2011-07-28 10:55 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: tony.luck, Mike Frysinger, Shan Hai, linux-kernel, cmetcalf,
	dhowells, paulus, uclinux-dist-devel, tglx, walken, linuxppc-dev,
	akpm

Peter Zijlstra <peterz@infradead.org> wrote:

> Subject: mm: Fix fixup_user_fault() for MMU=n 
> 
> In commit 2efaca927 ("mm/futex: fix futex writes on archs with SW
> tracking of dirty & young") we forgot about MMU=n. This patch fixes
> that.
> 
> Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>

Acked-by: David Howells <dhowells@redhat.com>

^ permalink raw reply	[flat|nested] 138+ messages in thread

* Re: [RFC/PATCH] mm/futex: Fix futex writes on archs with SW tracking of dirty & young
  2011-07-27 10:17                                     ` Peter Zijlstra
@ 2011-08-08  2:31                                       ` Mike Frysinger
  -1 siblings, 0 replies; 138+ messages in thread
From: Mike Frysinger @ 2011-08-08  2:31 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: David Howells, Benjamin Herrenschmidt, Shan Hai, paulus, tglx,
	walken, cmetcalf, tony.luck, akpm, linuxppc-dev, linux-kernel,
	uclinux-dist-devel, stable

On Wed, Jul 27, 2011 at 06:17, Peter Zijlstra wrote:
> On Wed, 2011-07-27 at 11:09 +0100, David Howells wrote:
>> Can you inline this for the NOMMU case please?
>
> ---
> Subject: mm: Fix fixup_user_fault() for MMU=n
>
> In commit 2efaca927 ("mm/futex: fix futex writes on archs with SW
> tracking of dirty & young") we forgot about MMU=n. This patch fixes
> that.

can we get this merged ?  mainline is now broken, linux-next is still
broken, and it seems gregkh is pulling this into the stable 3.0.1
which means that's going to be broken too.
-mike

^ permalink raw reply	[flat|nested] 138+ messages in thread

* Re: [RFC/PATCH] mm/futex: Fix futex writes on archs with SW tracking of dirty & young
@ 2011-08-08  2:31                                       ` Mike Frysinger
  0 siblings, 0 replies; 138+ messages in thread
From: Mike Frysinger @ 2011-08-08  2:31 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: tony.luck, Shan Hai, linux-kernel, cmetcalf, stable,
	David Howells, paulus, uclinux-dist-devel, tglx, walken,
	linuxppc-dev, akpm

On Wed, Jul 27, 2011 at 06:17, Peter Zijlstra wrote:
> On Wed, 2011-07-27 at 11:09 +0100, David Howells wrote:
>> Can you inline this for the NOMMU case please?
>
> ---
> Subject: mm: Fix fixup_user_fault() for MMU=n
>
> In commit 2efaca927 ("mm/futex: fix futex writes on archs with SW
> tracking of dirty & young") we forgot about MMU=n. This patch fixes
> that.

can we get this merged ?  mainline is now broken, linux-next is still
broken, and it seems gregkh is pulling this into the stable 3.0.1
which means that's going to be broken too.
-mike

^ permalink raw reply	[flat|nested] 138+ messages in thread

end of thread, other threads:[~2011-08-08  2:32 UTC | newest]

Thread overview: 138+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2011-07-15  8:07 [PATCH 0/1] Fixup write permission of TLB on powerpc e500 core Shan Hai
2011-07-15  8:07 ` Shan Hai
2011-07-15  8:07 ` [PATCH 1/1] " Shan Hai
2011-07-15  8:07   ` Shan Hai
2011-07-15 10:23   ` Peter Zijlstra
2011-07-15 10:23     ` Peter Zijlstra
2011-07-15 15:18     ` Shan Hai
2011-07-15 15:18       ` Shan Hai
2011-07-15 15:24       ` Peter Zijlstra
2011-07-15 15:24         ` Peter Zijlstra
2011-07-16 15:36         ` Shan Hai
2011-07-16 15:36           ` Shan Hai
2011-07-16 14:50     ` Shan Hai
2011-07-16 14:50       ` Shan Hai
2011-07-16 23:49       ` Benjamin Herrenschmidt
2011-07-16 23:49         ` Benjamin Herrenschmidt
2011-07-17  9:38         ` Peter Zijlstra
2011-07-17  9:38           ` Peter Zijlstra
2011-07-17 14:29           ` Benjamin Herrenschmidt
2011-07-17 14:29             ` Benjamin Herrenschmidt
2011-07-17 23:14             ` Benjamin Herrenschmidt
2011-07-17 23:14               ` Benjamin Herrenschmidt
2011-07-18  3:53               ` Benjamin Herrenschmidt
2011-07-18  3:53                 ` Benjamin Herrenschmidt
2011-07-18  4:02                 ` Benjamin Herrenschmidt
2011-07-18  4:02                   ` Benjamin Herrenschmidt
2011-07-18  4:01               ` Benjamin Herrenschmidt
2011-07-18  4:01                 ` Benjamin Herrenschmidt
2011-07-18  6:48                 ` Shan Hai
2011-07-18  6:48                   ` Shan Hai
2011-07-18  7:01                   ` Benjamin Herrenschmidt
2011-07-18  7:01                     ` Benjamin Herrenschmidt
2011-07-18  7:26                     ` Shan Hai
2011-07-18  7:26                       ` Shan Hai
2011-07-18  7:36                       ` Benjamin Herrenschmidt
2011-07-18  7:36                         ` Benjamin Herrenschmidt
2011-07-18  7:50                         ` Shan Hai
2011-07-18  7:50                           ` Shan Hai
2011-07-19  3:30                         ` Shan Hai
2011-07-19  3:30                           ` Shan Hai
2011-07-19  4:20                           ` Benjamin Herrenschmidt
2011-07-19  4:20                             ` Benjamin Herrenschmidt
2011-07-19  4:29                           ` [RFC/PATCH] mm/futex: Fix futex writes on archs with SW tracking of dirty & young Benjamin Herrenschmidt
2011-07-19  4:29                             ` Benjamin Herrenschmidt
2011-07-19  4:55                             ` Shan Hai
2011-07-19  4:55                               ` Shan Hai
2011-07-19  5:17                             ` Shan Hai
2011-07-19  5:17                               ` Shan Hai
2011-07-19  5:24                               ` Benjamin Herrenschmidt
2011-07-19  5:24                                 ` Benjamin Herrenschmidt
2011-07-19  5:38                                 ` Shan Hai
2011-07-19  5:38                                   ` Shan Hai
2011-07-19  7:46                                   ` Benjamin Herrenschmidt
2011-07-19  7:46                                     ` Benjamin Herrenschmidt
2011-07-19  8:24                                     ` Shan Hai
2011-07-19  8:24                                       ` Shan Hai
2011-07-19  8:26                                       ` [RFC/PATCH] mm/futex: Fix futex writes on archs with SW tracking of " David Laight
2011-07-19  8:26                                         ` David Laight
2011-07-19  8:45                                         ` Benjamin Herrenschmidt
2011-07-19  8:45                                           ` Benjamin Herrenschmidt
2011-07-19  8:45                                         ` Shan Hai
2011-07-19  8:45                                           ` Shan Hai
2011-07-19 11:10                             ` [RFC/PATCH] mm/futex: Fix futex writes on archs with SW tracking of " Peter Zijlstra
2011-07-19 11:10                               ` Peter Zijlstra
2011-07-20 14:39                             ` Darren Hart
2011-07-20 14:39                               ` Darren Hart
2011-07-21 22:36                             ` Andrew Morton
2011-07-21 22:36                               ` Andrew Morton
2011-07-21 22:52                               ` Benjamin Herrenschmidt
2011-07-21 22:52                                 ` Benjamin Herrenschmidt
2011-07-21 22:57                                 ` Benjamin Herrenschmidt
2011-07-21 22:57                                   ` Benjamin Herrenschmidt
2011-07-21 22:59                                 ` Andrew Morton
2011-07-21 22:59                                   ` Andrew Morton
2011-07-22  1:40                                   ` Benjamin Herrenschmidt
2011-07-22  1:40                                     ` Benjamin Herrenschmidt
2011-07-22  1:54                                   ` Shan Hai
2011-07-22  1:54                                     ` Shan Hai
2011-07-27  6:50                             ` Mike Frysinger
2011-07-27  6:50                               ` Mike Frysinger
2011-07-27  7:58                               ` Benjamin Herrenschmidt
2011-07-27  7:58                                 ` Benjamin Herrenschmidt
2011-07-27  8:59                                 ` Peter Zijlstra
2011-07-27  8:59                                   ` Peter Zijlstra
2011-07-27 10:09                                 ` David Howells
2011-07-27 10:09                                   ` David Howells
2011-07-27 10:17                                   ` Peter Zijlstra
2011-07-27 10:17                                     ` Peter Zijlstra
2011-07-27 10:20                                     ` Benjamin Herrenschmidt
2011-07-27 10:20                                       ` Benjamin Herrenschmidt
2011-07-28  0:12                                       ` Mike Frysinger
2011-07-28  0:12                                         ` Mike Frysinger
2011-08-08  2:31                                     ` Mike Frysinger
2011-08-08  2:31                                       ` Mike Frysinger
2011-07-28 10:55                                   ` David Howells
2011-07-28 10:55                                     ` David Howells
2011-07-17 11:02         ` [PATCH 1/1] Fixup write permission of TLB on powerpc e500 core Peter Zijlstra
2011-07-17 11:02           ` Peter Zijlstra
2011-07-17 13:33           ` Shan Hai
2011-07-17 13:33             ` Shan Hai
2011-07-17 14:48             ` Benjamin Herrenschmidt
2011-07-17 14:48               ` Benjamin Herrenschmidt
2011-07-17 15:40               ` Shan Hai
2011-07-17 15:40                 ` Shan Hai
2011-07-17 22:34                 ` Benjamin Herrenschmidt
2011-07-17 22:34                   ` Benjamin Herrenschmidt
2011-07-17 14:34           ` Benjamin Herrenschmidt
2011-07-17 14:34             ` Benjamin Herrenschmidt
2011-07-15  8:20 ` [PATCH 0/1] " Peter Zijlstra
2011-07-15  8:20   ` Peter Zijlstra
2011-07-15  8:38   ` MailingLists
2011-07-15  8:38     ` MailingLists
2011-07-15  8:44     ` Peter Zijlstra
2011-07-15  8:44       ` Peter Zijlstra
2011-07-15  9:08       ` Shan Hai
2011-07-15  9:08         ` Shan Hai
2011-07-15  9:12         ` Benjamin Herrenschmidt
2011-07-15  9:12           ` Benjamin Herrenschmidt
2011-07-15  9:50         ` Peter Zijlstra
2011-07-15  9:50           ` Peter Zijlstra
2011-07-15 10:06           ` Shan Hai
2011-07-15 10:06             ` Shan Hai
2011-07-15 10:32             ` David Laight
2011-07-15 10:32               ` David Laight
2011-07-15 10:39               ` Peter Zijlstra
2011-07-15 10:39                 ` Peter Zijlstra
2011-07-15 15:32               ` Shan Hai
2011-07-15 15:32                 ` Shan Hai
2011-07-16  0:20                 ` Benjamin Herrenschmidt
2011-07-16  0:20                   ` Benjamin Herrenschmidt
2011-07-16 15:03                   ` Shan Hai
2011-07-16 15:03                     ` Shan Hai
2011-07-15 23:47               ` Benjamin Herrenschmidt
2011-07-15 23:47                 ` Benjamin Herrenschmidt
2011-07-15  9:07     ` Benjamin Herrenschmidt
2011-07-15  9:07       ` Benjamin Herrenschmidt
2011-07-15  9:05   ` Benjamin Herrenschmidt
2011-07-15  9:05     ` Benjamin Herrenschmidt

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.