linux-kernel.vger.kernel.org archive mirror
* [patch] 'virtual => physical page mapping cache', vcache-2.5.38-B8
From: Ingo Molnar @ 2002-09-27 16:24 UTC
  To: Linus Torvalds; +Cc: Andrew Morton, Rusty Russell, linux-kernel


the attached patch implements the virtual => physical cache. Right now
only the COW code calls the invalidation function, because futexes do not
need notification on unmap.
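
the API usage is simple - a minimal sketch (the callback body here is a
made-up example, the rest matches the patch below):

	static void my_callback(vcache_t *vcache, struct page *new_page)
	{
		/* the watched (address,mm) pair now maps to new_page */
	}

	vcache_t vc;

	spin_lock(&vcache_lock);
	__attach_vcache(&vc, uaddr, current->mm, my_callback);
	spin_unlock(&vcache_lock);
	...
	/* detach_vcache() takes the vcache_lock itself: */
	detach_vcache(&vc);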

I have fixed a new futex bug as well: pin_page() alone does not guarantee
that the mapping does not change magically, only taking the MM semaphore
in write-mode does.

this also enables us to avoid taking the vcache_lock in the COW fastpath,
so the overhead should be pretty low, unless there are futexes in the
given hash-list, in which case we obviously have to take the vcache_lock.

with this patch applied to BK-curr all our fork testcases pass just fine.

taking the MM write-lock is a bit strong, but i found no good way to avoid
it - we have to hold on to the pte mapping. get_user_pages() without the
mm lock taken is pretty meaningless - any other thread could in parallel
COW the given page and fault in a completely different page. Are other
get_user_page() users [such as direct-IO] protected against this scenario?

	Ingo

--- linux/include/linux/vcache.h.orig	Fri Sep 27 15:05:33 2002
+++ linux/include/linux/vcache.h	Fri Sep 27 16:07:42 2002
@@ -0,0 +1,26 @@
+/*
+ * virtual => physical mapping cache support.
+ */
+#ifndef _LINUX_VCACHE_H
+#define _LINUX_VCACHE_H
+
+typedef struct vcache_s {
+	unsigned long address;
+	struct mm_struct *mm;
+	struct list_head hash_entry;
+	void (*callback)(struct vcache_s *data, struct page *new_page);
+} vcache_t;
+
+extern spinlock_t vcache_lock;
+
+extern void __attach_vcache(vcache_t *vcache,
+		unsigned long address,
+		struct mm_struct *mm,
+		void (*callback)(struct vcache_s *data, struct page *new_page));
+
+extern void detach_vcache(vcache_t *vcache);
+
+extern void invalidate_vcache(unsigned long address, struct mm_struct *mm,
+				struct page *new_page);
+
+#endif
--- linux/include/linux/futex.h.orig	Fri Sep 27 16:14:10 2002
+++ linux/include/linux/futex.h	Fri Sep 27 16:14:16 2002
@@ -6,6 +6,6 @@
 #define FUTEX_WAKE (1)
 #define FUTEX_FD (2)
 
-extern asmlinkage int sys_futex(void *uaddr, int op, int val, struct timespec *utime);
+extern asmlinkage int sys_futex(unsigned long uaddr, int op, int val, struct timespec *utime);
 
 #endif
--- linux/kernel/futex.c.orig	Fri Sep 20 17:20:21 2002
+++ linux/kernel/futex.c	Fri Sep 27 18:11:14 2002
@@ -31,6 +31,7 @@
 #include <linux/init.h>
 #include <linux/fs.h>
 #include <linux/futex.h>
+#include <linux/vcache.h>
 #include <linux/highmem.h>
 #include <linux/time.h>
 #include <linux/pagemap.h>
@@ -38,7 +39,6 @@
 #include <linux/poll.h>
 #include <linux/file.h>
 #include <linux/dcache.h>
-#include <asm/uaccess.h>
 
 /* Simple "sleep if unchanged" interface. */
 
@@ -55,9 +55,14 @@
 struct futex_q {
 	struct list_head list;
 	wait_queue_head_t waiters;
+
 	/* Page struct and offset within it. */
 	struct page *page;
 	unsigned int offset;
+
+	/* the virtual => physical cache */
+	vcache_t vcache;
+
 	/* For fd, sigio sent using these. */
 	int fd;
 	struct file *filp;
@@ -87,9 +92,6 @@
 
 static inline void unpin_page(struct page *page)
 {
-	/* Avoid releasing the page which is on the LRU list.  I don't
-           know if this is correct, but it stops the BUG() in
-           __free_pages_ok(). */
 	page_cache_release(page);
 }
 
@@ -113,31 +115,61 @@
 		}
 	}
 	spin_unlock(&futex_lock);
+
+	up_write(&current->mm->mmap_sem);
+
 	return num_woken;
 }
 
+static void futex_vcache_callback(vcache_t *vcache, struct page *new_page)
+{
+	struct futex_q *q = container_of(vcache, struct futex_q, vcache);
+	struct list_head *head;
+
+	spin_lock(&futex_lock);
+
+	q->page = new_page;
+	head = hash_futex(new_page, q->offset);
+	BUG_ON(list_empty(&q->list));
+	list_del_init(&q->list);
+	list_add_tail(&q->list, head);
+
+	spin_unlock(&futex_lock);
+}
+
 /* Add at end to avoid starvation */
 static inline void queue_me(struct list_head *head,
 			    struct futex_q *q,
 			    struct page *page,
 			    unsigned int offset,
 			    int fd,
-			    struct file *filp)
+			    struct file *filp,
+			    unsigned long uaddr)
 {
 	q->page = page;
 	q->offset = offset;
 	q->fd = fd;
 	q->filp = filp;
 
+	spin_lock(&vcache_lock);
+	/*
+	 * We register a futex callback to this virtual address,
+	 * to make sure a COW properly rehashes the futex-queue.
+	 */
+	__attach_vcache(&q->vcache, uaddr, current->mm, futex_vcache_callback);
+
 	spin_lock(&futex_lock);
 	list_add_tail(&q->list, head);
 	spin_unlock(&futex_lock);
+	spin_unlock(&vcache_lock);
 }
 
 /* Return 1 if we were still queued (ie. 0 means we were woken) */
 static inline int unqueue_me(struct futex_q *q)
 {
 	int ret = 0;
+
+	detach_vcache(&q->vcache);
 	spin_lock(&futex_lock);
 	if (!list_empty(&q->list)) {
 		list_del(&q->list);
@@ -151,17 +183,15 @@
 static struct page *pin_page(unsigned long page_start)
 {
 	struct mm_struct *mm = current->mm;
-	struct page *page;
+	struct page *page = NULL;
 	int err;
 
-	down_read(&mm->mmap_sem);
 	err = get_user_pages(current, mm, page_start,
 			     1 /* one page */,
-			     0 /* writable not important */,
+			     1 /* writable */,
 			     0 /* don't force */,
 			     &page,
 			     NULL /* don't return vmas */);
-	up_read(&mm->mmap_sem);
 
 	if (err < 0)
 		return ERR_PTR(err);
@@ -172,8 +202,8 @@
 		      struct page *page,
 		      int offset,
 		      int val,
-		      int *uaddr,
-		      unsigned long time)
+		      unsigned long time,
+		      unsigned long uaddr)
 {
 	int curval;
 	struct futex_q q;
@@ -183,10 +213,12 @@
 	set_current_state(TASK_INTERRUPTIBLE);
 	init_waitqueue_head(&q.waiters);
 	add_wait_queue(&q.waiters, &wait);
-	queue_me(head, &q, page, offset, -1, NULL);
+	queue_me(head, &q, page, offset, -1, NULL, uaddr);
+
+	up_write(&current->mm->mmap_sem);
 
 	/* Page is pinned, but may no longer be in this address space. */
-	if (get_user(curval, uaddr) != 0) {
+	if (get_user(curval, (int *)uaddr) != 0) {
 		ret = -EFAULT;
 		goto out;
 	}
@@ -254,22 +286,25 @@
 static int futex_fd(struct list_head *head,
 		    struct page *page,
 		    int offset,
-		    int signal)
+		    int signal,
+		    unsigned long uaddr)
 {
-	int fd;
 	struct futex_q *q;
 	struct file *filp;
+	int fd;
 
+	fd = -EINVAL;
 	if (signal < 0 || signal > _NSIG)
-		return -EINVAL;
+		goto out;
 
 	fd = get_unused_fd();
 	if (fd < 0)
-		return fd;
+		goto out;
 	filp = get_empty_filp();
 	if (!filp) {
 		put_unused_fd(fd);
-		return -ENFILE;
+		fd = -ENFILE;
+		goto out;
 	}
 	filp->f_op = &futex_fops;
 	filp->f_vfsmnt = mntget(futex_mnt);
@@ -282,7 +317,8 @@
 		if (ret) {
 			put_unused_fd(fd);
 			put_filp(filp);
-			return ret;
+			fd = ret;
+			goto out;
 		}
 		filp->f_owner.signum = signal;
 	}
@@ -291,26 +327,29 @@
 	if (!q) {
 		put_unused_fd(fd);
 		put_filp(filp);
-		return -ENOMEM;
+		fd = -ENOMEM;
+		goto out;
 	}
 
 	/* Initialize queue structure, and add to hash table. */
 	filp->private_data = q;
 	init_waitqueue_head(&q->waiters);
-	queue_me(head, q, page, offset, fd, filp);
+	queue_me(head, q, page, offset, fd, filp, uaddr);
 
 	/* Now we map fd to filp, so userspace can access it */
 	fd_install(fd, filp);
+out:
+	up_write(&current->mm->mmap_sem);
 	return fd;
 }
 
-asmlinkage int sys_futex(void *uaddr, int op, int val, struct timespec *utime)
+asmlinkage int sys_futex(unsigned long uaddr, int op, int val, struct timespec *utime)
 {
-	int ret;
+	unsigned long time = MAX_SCHEDULE_TIMEOUT;
 	unsigned long pos_in_page;
 	struct list_head *head;
 	struct page *page;
-	unsigned long time = MAX_SCHEDULE_TIMEOUT;
+	int ret;
 
 	if (utime) {
 		struct timespec t;
@@ -319,38 +358,43 @@
 		time = timespec_to_jiffies(&t) + 1;
 	}
 
-	pos_in_page = ((unsigned long)uaddr) % PAGE_SIZE;
+	pos_in_page = uaddr % PAGE_SIZE;
 
 	/* Must be "naturally" aligned, and not on page boundary. */
 	if ((pos_in_page % __alignof__(int)) != 0
 	    || pos_in_page + sizeof(int) > PAGE_SIZE)
 		return -EINVAL;
 
-	/* Simpler if it doesn't vanish underneath us. */
-	page = pin_page((unsigned long)uaddr - pos_in_page);
-	if (IS_ERR(page))
-		return PTR_ERR(page);
+	/* Simpler if the mappings do not change underneath us. */
+	down_write(&current->mm->mmap_sem);
+
+	page = pin_page(uaddr - pos_in_page);
+	ret = IS_ERR(page);
+	if (ret)
+		goto out;
 
 	head = hash_futex(page, pos_in_page);
 	switch (op) {
 	case FUTEX_WAIT:
-		ret = futex_wait(head, page, pos_in_page, val, uaddr, time);
+		ret = futex_wait(head, page, pos_in_page, val, time, uaddr);
 		break;
 	case FUTEX_WAKE:
 		ret = futex_wake(head, page, pos_in_page, val);
 		break;
 	case FUTEX_FD:
 		/* non-zero val means F_SETOWN(getpid()) & F_SETSIG(val) */
-		ret = futex_fd(head, page, pos_in_page, val);
+		ret = futex_fd(head, page, pos_in_page, val, uaddr);
 		if (ret >= 0)
 			/* Leave page pinned (attached to fd). */
-			return ret;
+			goto out;
 		break;
 	default:
+		up_write(&current->mm->mmap_sem);
 		ret = -EINVAL;
 	}
 	unpin_page(page);
 
+out:
 	return ret;
 }
 
--- linux/kernel/fork.c.orig	Fri Sep 27 16:20:01 2002
+++ linux/kernel/fork.c	Fri Sep 27 16:20:22 2002
@@ -381,7 +381,7 @@
 		 * not set up a proper pointer then tough luck.
 		 */
 		put_user(0, tsk->user_tid);
-		sys_futex(tsk->user_tid, FUTEX_WAKE, 1, NULL);
+		sys_futex((unsigned long)tsk->user_tid, FUTEX_WAKE, 1, NULL);
 	}
 }
 
--- linux/mm/memory.c.orig	Fri Sep 20 17:20:25 2002
+++ linux/mm/memory.c	Fri Sep 27 16:48:12 2002
@@ -43,6 +43,7 @@
 #include <linux/iobuf.h>
 #include <linux/highmem.h>
 #include <linux/pagemap.h>
+#include <linux/vcache.h>
 
 #include <asm/pgalloc.h>
 #include <asm/rmap.h>
@@ -973,6 +974,7 @@
 static inline void break_cow(struct vm_area_struct * vma, struct page * new_page, unsigned long address, 
 		pte_t *page_table)
 {
+	invalidate_vcache(address, vma->vm_mm, new_page);
 	flush_page_to_ram(new_page);
 	flush_cache_page(vma, address);
 	establish_pte(vma, address, page_table, pte_mkwrite(pte_mkdirty(mk_pte(new_page, vma->vm_page_prot))));
--- linux/mm/vcache.c.orig	Fri Sep 27 14:58:10 2002
+++ linux/mm/vcache.c	Fri Sep 27 18:11:07 2002
@@ -0,0 +1,93 @@
+/*
+ *  linux/mm/vcache.c
+ *
+ *  virtual => physical page mapping cache. Users of this mechanism
+ *  register callbacks for a given (virt,mm,phys) page mapping, and
+ *  the kernel guarantees to call back when this mapping is invalidated.
+ *  (ie. upon COW or unmap.)
+ *
+ *  Started by Ingo Molnar, Copyright (C) 2002
+ */
+
+#include <linux/mm.h>
+#include <linux/init.h>
+#include <linux/hash.h>
+#include <linux/vcache.h>
+
+#define VCACHE_HASHBITS 8
+#define VCACHE_HASHSIZE (1 << VCACHE_HASHBITS)
+
+spinlock_t vcache_lock = SPIN_LOCK_UNLOCKED;
+
+static struct list_head hash[VCACHE_HASHSIZE];
+
+static struct list_head *hash_vcache(unsigned long address,
+					struct mm_struct *mm)
+{
+        return &hash[hash_long(address + (unsigned long)mm, VCACHE_HASHBITS)];
+}
+
+void __attach_vcache(vcache_t *vcache,
+		unsigned long address,
+		struct mm_struct *mm,
+		void (*callback)(struct vcache_s *data, struct page *new))
+{
+	struct list_head *hash_head;
+
+	address &= PAGE_MASK;
+	vcache->address = address;
+	vcache->mm = mm;
+	vcache->callback = callback;
+
+	hash_head = hash_vcache(address, mm);
+
+	list_add(&vcache->hash_entry, hash_head);
+}
+
+void detach_vcache(vcache_t *vcache)
+{
+	spin_lock(&vcache_lock);
+	list_del(&vcache->hash_entry);
+	spin_unlock(&vcache_lock);
+}
+
+void invalidate_vcache(unsigned long address, struct mm_struct *mm,
+				struct page *new_page)
+{
+	struct list_head *l, *hash_head;
+	vcache_t *vcache;
+
+	address &= PAGE_MASK;
+
+	hash_head = hash_vcache(address, mm);
+	/*
+	 * This is safe, because this path is called with the mm
+	 * semaphore read-held, and the add/remove path calls with the
+	 * mm semaphore write-held. Other mm's might add new entries
+	 * in parallel, but *this* mm is locked out, so if the list is
+	 * empty now then we do not have to take the vcache lock to
+	 * see it's really empty.
+	 */
+	if (likely(list_empty(hash_head)))
+		return;
+
+	spin_lock(&vcache_lock);
+	list_for_each(l, hash_head) {
+		vcache = list_entry(l, vcache_t, hash_entry);
+		if (vcache->address != address || vcache->mm != mm)
+			continue;
+		vcache->callback(vcache, new_page);
+	}
+	spin_unlock(&vcache_lock);
+}
+
+static int __init vcache_init(void)
+{
+        unsigned int i;
+
+	for (i = 0; i < VCACHE_HASHSIZE; i++)
+		INIT_LIST_HEAD(hash + i);
+	return 0;
+}
+__initcall(vcache_init);
+
--- linux/mm/Makefile.orig	Fri Sep 27 15:03:25 2002
+++ linux/mm/Makefile	Fri Sep 27 15:03:31 2002
@@ -9,6 +9,6 @@
 	    vmalloc.o slab.o bootmem.o swap.o vmscan.o page_io.o \
 	    page_alloc.o swap_state.o swapfile.o numa.o oom_kill.o \
 	    shmem.o highmem.o mempool.o msync.o mincore.o readahead.o \
-	    pdflush.o page-writeback.o rmap.o madvise.o
+	    pdflush.o page-writeback.o rmap.o madvise.o vcache.o
 
 include $(TOPDIR)/Rules.make



* Re: [patch] 'virtual => physical page mapping cache', vcache-2.5.38-B8
From: Linus Torvalds @ 2002-09-27 16:26 UTC
  To: Ingo Molnar; +Cc: Andrew Morton, Rusty Russell, linux-kernel


On Fri, 27 Sep 2002, Ingo Molnar wrote:
> 
> the attached patch implements the virtual => physical cache. Right now
> only the COW code calls the invalidation function, because futexes do not
> need notification on unmap.

Ok, looks good. Except you make get_user_page() do a write fault on the 
page, and one of the points of this approach was that that shouldn't even 
be needed. Or did I miss some case that does need it?

> I have fixed a new futex bug as well: pin_page() alone does not guarantee
> that the mapping does not change magically, only taking the MM semaphore
> in write-mode does.

And this makes no sense to me.

A read lock on the semaphore should give you all the same protection as a
write lock does.

To protect against the swapper etc, you really need to get the mm
spinlock, not the semaphore. And once you have the spinlock, you should be
totally safe.  Please explain what you saw?

		Linus



* Re: [patch] 'virtual => physical page mapping cache', vcache-2.5.38-B8
From: Ingo Molnar @ 2002-09-27 16:42 UTC
  To: Linus Torvalds; +Cc: Andrew Morton, Rusty Russell, linux-kernel


On Fri, 27 Sep 2002, Linus Torvalds wrote:

> > I have fixed a new futex bug as well: pin_page() alone does not guarantee
> > that the mapping does not change magically, only taking the MM semaphore
> > in write-mode does.
> 
> And this makes no sense to me.
> 
> A read lock on the semaphore should give you all the same protection as
> a write lock does.
> 
> To protect against the swapper etc, you really need to get the mm
> spinlock, not the semaphore. And once you have the spinlock, you should
> be totally safe.  Please explain what you saw?

the futex code needs to 'get to the physical page', 'hash the futex queue'
and 'hash the vcache entry' atomically.

otherwise a physical page might magically change under us - eg. another
thread doing a fork(), which marks the futex's mapping COW, then the same
thread also hits the COW - all this after the get_user_page() call, but
before the 'hash futex and vcache' thing. I took the mm semaphore in
write-mode to exclude COWs [or parallel munmap()s].

The swapper is something i did not think about, but that makes things even
worse. So it appears that all 3 operations have to be done at once, with
the page_table_lock held? Ie. first a slow (and possibly blocking)  
get_user_pages() call, then we would take the page_table_lock, call a
(new) atomic version of get_user_pages() which just re-checks the virt =>
phys mapping without blocking or taking any spinlock, then if the page has
not changed meanwhile we hash the futex and the vcache. If the page has 
changed meanwhile then we re-try the slow get_user_pages() call.
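
in code the idea would look roughly like this (follow_page() here is
just a placeholder name for that atomic re-check helper - a sketch
only):

repeat:
	page = pin_page(uaddr - offset);	/* slow, may block */
	if (IS_ERR(page))
		return PTR_ERR(page);

	spin_lock(&current->mm->page_table_lock);
	tmp = follow_page(current->mm, uaddr, 0); /* no blocking here */
	if (tmp == page) {
		/* hash the futex and the vcache entry */
	}
	spin_unlock(&current->mm->page_table_lock);

	if (tmp != page) {
		unpin_page(page);	/* mapping changed - redo lookup */
		goto repeat;
	}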

am i missing something?

	Ingo



* Re: [patch] 'virtual => physical page mapping cache', vcache-2.5.38-B8
From: Linus Torvalds @ 2002-09-27 16:47 UTC
  To: Ingo Molnar; +Cc: Andrew Morton, Rusty Russell, linux-kernel


On Fri, 27 Sep 2002, Ingo Molnar wrote:
> 
> the futex code needs to 'get to the physical page', 'hash the futex queue'
> and 'hash the vcache entry' atomically.

I really don't think it needs to.

It really needs to do
 - hash the vcache entry _first_
 - not care about the rest

Once it hashes the vcache entry, we already know that all future COW's 
will tell us about themselves, and since our callback will serialize them 
with the futex lock, we're now done.

That's kind of the whole point of the vcache - it's the "race avoidance"  
thing.

And the vcache doesn't need to know what the physical page was: it only 
needs the address and the VM, which we have a priori. So we _can_ do all 
of this first.

You may want to do something clever to avoid taking the vcache lock in the 
COW path unless there is real reason to believe that you have to, but as 
far as I can tell, you can do that by simply adding a memory barrier to 
the end of the "insert vcache entry" thing.

Because once you do that, and you make sure that the COW thing does the 
vcache callback _after_ changing the page tables, the COW code can 

 - do the hash (which is invariant and has no races, since it depends 
   solely on the virtual address and the VM)
 - check if the hash queue is empty
 - only if the hash queue is non-empty does the COW code need to get the 
   vcache lock (and obviously it needs to re-load the hash entry from the 
   queue after it got the lock, but the hash is still valid)
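
Something like this, purely as a sketch (the ordering is the point,
not the exact primitives):

	/* attach side: */
	list_add(&vcache->hash_entry, hash_head);
	smp_mb();	/* make the entry visible before any COW check */

	/* COW side, _after_ the page tables have been changed: */
	hash_head = hash_vcache(address, mm);
	if (list_empty(hash_head))
		return;		/* common case: vcache lock not taken */
	spin_lock(&vcache_lock);
	/* re-walk the hash queue and run the callbacks */
	spin_unlock(&vcache_lock);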

And the COW case is the "hard" one. The swapper one is simple, just 
handled by gatting the page table spinlock in get_user_pages() (which we 
must already be doing, or we'd already be screwed).

Maybe I'm missing something, but the locking really doesn't look all that 
problematic if we just do things in the obvious order..

		Linus



* Re: [patch] 'virtual => physical page mapping cache', vcache-2.5.38-B8
From: Ingo Molnar @ 2002-09-27 17:01 UTC
  To: Linus Torvalds; +Cc: Andrew Morton, Rusty Russell, linux-kernel


On Fri, 27 Sep 2002, Linus Torvalds wrote:

> You may want to do something clever to avoid taking the vcache lock in
> the COW path unless there is real reason to believe that you have to,
> but as far as I can tell, you can do that by simply adding a memory
> barrier to the end of the "insert vcache entry" thing.

the COW path lookup code already avoids taking the vcache lock - unless
the hash-head is non-empty.

> Because once you do that, and you make sure that the COW thing does the 
> vcache callback _after_ changing the page tables, the COW code can 
>
>  - do the hash (which is invariant and has no races, since it depends 
>    solely on the virtual address and the VM)
>  - check if the hash queue is empty
>  - only if the hash queue is non-empty does the COW code need to get the 
>    vcache lock (and obviously it needs to re-load the hash entry from the 
>    queue after it got the lock, but the hash is still valid)

yeah, this is the optimization i have described in the previous mail, and
which is in the patch.

> Maybe I'm missing something, but the locking really doesn't look all
> that problematic if we just do things in the obvious order..

i agree, first hashing the vcache should work. There are some details:  
if we first hash the vcache then we have to set up the queue in a way for
the callback function to notice that this queue is not futex-hashed (ie.  
not live) yet. Otherwise the callback function might attempt to rehash it.  
This means one more branch in the callback function, not a problem.

	Ingo



* Re: [patch] 'virtual => physical page mapping cache', vcache-2.5.38-B8
From: Linus Torvalds @ 2002-09-27 17:05 UTC
  To: Ingo Molnar; +Cc: Andrew Morton, Rusty Russell, linux-kernel


On Fri, 27 Sep 2002, Ingo Molnar wrote:
> 
> i agree, first hashing the vcache should work. There are some details:  
> if we first hash the vcache then we have to set up the queue in a way for
> the callback function to notice that this queue is not futex-hashed (ie.  
> not live) yet. Otherwise the callback function might attempt to rehash it.  
> This means one more branch in the callback function, not a problem.

Not necessarily. You can solve _this_ problem by just holding the futex 
lock over the whole operation. If the COW happens "early" on another CPU, 
it will then be serialized by the futex lock in the callback function.

The futex lock itself is hopefully not going to be contended very much,
making the "get it early" approach acceptable. Although if it later _does_
get contended, we'll need to split it up some way: it cannot be hashed
sanely with the above approach, since there are no unique sub-indexes to
hash off as the physical page and offset have zero correlation with the VM
/ virtual address.

		Linus



* Re: [patch] 'virtual => physical page mapping cache', vcache-2.5.38-B8
From: Linus Torvalds @ 2002-09-27 17:32 UTC
  To: Ingo Molnar; +Cc: Andrew Morton, Rusty Russell, linux-kernel


On Fri, 27 Sep 2002, Ingo Molnar wrote:
> 
> the problem is that we want to catch all COW events of the virtual
> address, *and* we want to have a correct (physpage,offset) futex hash.

So?

	spin_lock(&futex_lock);

>         q.page = NULL;
>         attach_vcache(&q.vcache, uaddr, current->mm, futex_vcache_callback);
> 
>         page = pin_page(uaddr - offset);
>         ret = IS_ERR(page);
>         if (ret)
>                 goto out;
>         head = hash_futex(page, offset);
>         set_current_state(TASK_INTERRUPTIBLE);
>         init_waitqueue_head(&q.waiters);
>         add_wait_queue(&q.waiters, &wait);
>         queue_me(head, &q, page, offset, -1, NULL, uaddr);

	spin_unlock(&futex_lock);

And get the futex_lock in the callback.

		Linus



* Re: [patch] 'virtual => physical page mapping cache', vcache-2.5.38-B8
From: Ingo Molnar @ 2002-09-27 17:33 UTC
  To: Linus Torvalds; +Cc: Andrew Morton, Rusty Russell, linux-kernel


thinking about it some more, first attaching the vcache does not help.

the problem is that we want to catch all COW events of the virtual
address, *and* we want to have a correct (physpage,offset) futex hash.

if we do the following:

        q.page = NULL;
        attach_vcache(&q.vcache, uaddr, current->mm, futex_vcache_callback);

        page = pin_page(uaddr - offset);
        ret = IS_ERR(page);
        if (ret)
                goto out;
        head = hash_futex(page, offset);
(*)
        set_current_state(TASK_INTERRUPTIBLE);
        init_waitqueue_head(&q.waiters);
        add_wait_queue(&q.waiters, &wait);
        queue_me(head, &q, page, offset, -1, NULL, uaddr);

then we have the following race at (*): the futex is not yet hashed, but
it's already in the vcache hash. Nevertheless a parallel COW only
generates a useless callback - the futex is not hashed yet. The effect is
that the above 'page' address is in fact the old COW page [after the race
only mapped into some other MM], and the queue_me() futex hashing uses the
old page.
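
as a timeline (T0 is the thread entering futex_wait(), T1 is another
thread sharing the mm):

	T0: attach_vcache(&q.vcache, uaddr, ...)
	T0: page = pin_page(uaddr - offset)	/* pins the old page */
	T1: fork() marks the mapping COW
	T1: write fault - COW installs a new physical page
	T1: futex_vcache_callback() runs, but the futex is not hashed yet
	T0: queue_me(..., page, ...)		/* hashes the _old_ page */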

the only way i can see to do this race-less is what i suggested two mails
ago: when hashing the futex we have to take the pagetable lock and have to
check the actual current page that is mapped.

in fact there's another race as well: what if a parallel thread COWs the
page and faults it in (thus creating a new physical page), which kswapd
then swaps out? Ie. purely checking the pte (and using any potential new
page for the hash) is not enough, we really have to re-try the pin_page()
call if the pte has changed meanwhile.

	Ingo



* Re: [patch] 'virtual => physical page mapping cache', vcache-2.5.38-B8
From: Ingo Molnar @ 2002-09-27 17:42 UTC
  To: Linus Torvalds; +Cc: Andrew Morton, Rusty Russell, linux-kernel


i'd love to do this, but:

> 	spin_lock(&futex_lock);
> 
> >         q.page = NULL;
> >         attach_vcache(&q.vcache, uaddr, current->mm, futex_vcache_callback);
> > 
> >         page = pin_page(uaddr - offset);

            [ LOCKUP ]

pin_page() calls get_user_pages(), which might block in handle_mm_fault().

	Ingo



* Re: [patch] 'virtual => physical page mapping cache', vcache-2.5.38-B8
From: Chris Wedgwood @ 2002-09-27 17:42 UTC
  To: Ingo Molnar; +Cc: Linus Torvalds, Andrew Morton, Rusty Russell, linux-kernel

On Fri, Sep 27, 2002 at 07:44:08PM +0200, Ingo Molnar wrote:

> and i'm not quite sure how other users of get_user_pages()
> (direct-IO) handle these kinds of COW races.

O_DIRECT is allowed to break if someone does something silly :)  I think
XFS tried hard to make sure things are consistent, but e2fs (at least
at one point) allowed you to read data from a file O_DIRECT which was
very different to what someone else read via the page-cache.


  --cw


* Re: [patch] 'virtual => physical page mapping cache', vcache-2.5.38-B8
From: Ingo Molnar @ 2002-09-27 17:44 UTC
  To: Linus Torvalds; +Cc: Andrew Morton, Rusty Russell, linux-kernel


and i'm not quite sure how other users of get_user_pages() (direct-IO)
handle these kinds of COW races. A COW can invalidate a physical page
anytime, so the DMA might go to the fork()ed child process, creating very
unexpected results. We are protected against kswapd via the elevated page
count, but are not protected against COW.

	Ingo



* Re: [patch] 'virtual => physical page mapping cache', vcache-2.5.38-B8
From: Linus Torvalds @ 2002-09-27 17:48 UTC
  To: Ingo Molnar; +Cc: Andrew Morton, Rusty Russell, linux-kernel


On Fri, 27 Sep 2002, Ingo Molnar wrote:
> 
> pin_page() calls get_user_pages(), which might block in handle_mm_fault().

Ok, I admit that sucks. Maybe that damn writable down() is the right thing 
to do after all. And let's hope most futexes don't block.

		Linus



* Re: [patch] 'virtual => physical page mapping cache', vcache-2.5.38-B8
From: Ingo Molnar @ 2002-09-27 17:53 UTC
  To: Chris Wedgwood; +Cc: Linus Torvalds, Andrew Morton, Rusty Russell, linux-kernel


On Fri, 27 Sep 2002, Chris Wedgwood wrote:

> O_DIRECT is allowed to break if someone does something silly :)  [...]

to DMA into a page that does not belong to the process anymore? I doubt
that.

	Ingo



* Re: [patch] 'virtual => physical page mapping cache', vcache-2.5.38-B8
From: Chris Wedgwood @ 2002-09-27 17:59 UTC
  To: Ingo Molnar; +Cc: Linus Torvalds, Andrew Morton, Rusty Russell, linux-kernel

On Fri, Sep 27, 2002 at 07:53:23PM +0200, Ingo Molnar wrote:

> to DMA into a page that does not belong to the process anymore? I
> doubt that.

ah, ok ... sure, that isn't (shouldn't be) allowed

i was thinking of reading/writing to/from data during COW


  --cw



* Re: [patch] 'virtual => physical page mapping cache', vcache-2.5.38-B8
From: Alan Cox @ 2002-09-27 18:01 UTC
  To: Ingo Molnar
  Cc: Chris Wedgwood, Linus Torvalds, Andrew Morton, Rusty Russell,
	linux-kernel

On Fri, 2002-09-27 at 18:53, Ingo Molnar wrote:
> 
> On Fri, 27 Sep 2002, Chris Wedgwood wrote:
> 
> > O_DIRECT is allowed to break if someone does something silly :)  [...]
> 
> to DMA into a page that does not belong to the process anymore? I doubt
> that.

Try doing a truncate on a file as an O_DIRECT write is occurring.
Scribbling into pages you don't own is the least of your problems.




* Re: [patch] 'virtual => physical page mapping cache',vcache-2.5.38-B8
From: Andrew Morton @ 2002-09-27 18:35 UTC
  To: Alan Cox
  Cc: Ingo Molnar, Chris Wedgwood, Linus Torvalds, Rusty Russell, linux-kernel

Alan Cox wrote:
> 
> On Fri, 2002-09-27 at 18:53, Ingo Molnar wrote:
> >
> > On Fri, 27 Sep 2002, Chris Wedgwood wrote:
> >
> > > O_DIRECT is allowed to break if someone does something silly :)  [...]
> >
> > to DMA into a page that does not belong to the process anymore? I doubt
> > that.
> 
> Try doing a truncate on a file as an O_DIRECT write is occurring.
> Scribbling into pages you don't own is the least of your problems.

O_DIRECT writes operate under i_sem, which provides exclusion
from truncate.  Do you know something which I don't??


* [patch] 'virtual => physical page mapping cache' take #2, vcache-2.5.38-C4
From: Ingo Molnar @ 2002-09-27 18:54 UTC
  To: Linus Torvalds; +Cc: Andrew Morton, Rusty Russell, linux-kernel


On Fri, 27 Sep 2002, Linus Torvalds wrote:

> > pin_page() calls get_user_pages(), which might block in handle_mm_fault().
> 
> Ok, I admit that sucks. Maybe that damn writable down() is the right
> thing to do after all. And let's hope most futexes don't block.

i think my plan works too - see the attached patch. The method is to do a
cheap physpage lookup (ie. only the read-lock is taken, the page is forced
writable), but we insert it into the vcache and futex hash atomically:

        spin_lock(&vcache_lock);
        spin_lock(&futex_lock);
        spin_lock(&current->mm->page_table_lock);

        /*
         * Has the mapping changed meanwhile?
         */
        tmp = follow_page(current->mm, uaddr, 0);

        if (tmp == page) {
                q->page = page;
                list_add_tail(&q->list, head);
                /*
                 * We register a futex callback to this virtual address,
                 * to make sure a COW properly rehashes the futex-queue.
                 */
                __attach_vcache(&q->vcache, uaddr, current->mm, futex_vcache_callback);
        } else
                ret = 1;

        spin_unlock(&current->mm->page_table_lock);
        spin_unlock(&futex_lock);
        spin_unlock(&vcache_lock);

follow_page() is completely lock-less and it's lightweight.

if we race then we release the page and re-do the pin_page call. (should
be very rare.) This is conceptually similar to other race avoidance code
in the VM layer.

this method should also work fine if a futex wait is initiated *after* the
COW - the mapping is not forced writable and we'll be properly notified
once the COW happens. Ie. a nonintrusive futex-wait will not cause extra
COW-copying if a fork()+exec() [or fork()+exit()] happens, where the
FUTEX_WAIT is between the fork() and the exit(), and the FUTEX_WAKE is
after the exit().

in fact this means that futex wakeups can be done across fork(), as long
as the page is not COW-ed in. (glibc could perhaps make use of this
somewhere.)

so every scenario i can think of works fine in the common case and it
should scale well. I had to reorganize the futex code significantly, but
all the testcases pass now so it should be fine.

the futex 'slow path' is in fact one of our major scheduling points for
threaded apps that have userspace synchronization objects, so i think it's
worth the effort trying to improve it.

	Ingo

--- linux/include/linux/vcache.h.orig	Fri Sep 27 15:05:33 2002
+++ linux/include/linux/vcache.h	Fri Sep 27 20:33:30 2002
@@ -0,0 +1,26 @@
+/*
+ * virtual => physical mapping cache support.
+ */
+#ifndef _LINUX_VCACHE_H
+#define _LINUX_VCACHE_H
+
+typedef struct vcache_s {
+	unsigned long address;
+	struct mm_struct *mm;
+	struct list_head hash_entry;
+	void (*callback)(struct vcache_s *data, struct page *new_page);
+} vcache_t;
+
+extern spinlock_t vcache_lock;
+
+extern void __attach_vcache(vcache_t *vcache,
+		unsigned long address,
+		struct mm_struct *mm,
+		void (*callback)(struct vcache_s *data, struct page *new_page));
+
+extern void detach_vcache(vcache_t *vcache);
+
+extern void invalidate_vcache(unsigned long address, struct mm_struct *mm,
+				struct page *new_page);
+
+#endif
--- linux/include/linux/mm.h.orig	Fri Sep 27 20:40:56 2002
+++ linux/include/linux/mm.h	Fri Sep 27 20:41:06 2002
@@ -374,6 +374,7 @@
 extern int make_pages_present(unsigned long addr, unsigned long end);
 extern int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, int write);
 
+extern struct page * follow_page(struct mm_struct *mm, unsigned long address, int write);
 int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, unsigned long start,
 		int len, int write, int force, struct page **pages, struct vm_area_struct **vmas);
 
--- linux/include/linux/futex.h.orig	Fri Sep 27 16:14:10 2002
+++ linux/include/linux/futex.h	Fri Sep 27 16:14:16 2002
@@ -6,6 +6,6 @@
 #define FUTEX_WAKE (1)
 #define FUTEX_FD (2)
 
-extern asmlinkage int sys_futex(void *uaddr, int op, int val, struct timespec *utime);
+extern asmlinkage int sys_futex(unsigned long uaddr, int op, int val, struct timespec *utime);
 
 #endif
--- linux/kernel/futex.c.orig	Fri Sep 20 17:20:21 2002
+++ linux/kernel/futex.c	Fri Sep 27 20:43:08 2002
@@ -31,6 +31,7 @@
 #include <linux/init.h>
 #include <linux/fs.h>
 #include <linux/futex.h>
+#include <linux/vcache.h>
 #include <linux/highmem.h>
 #include <linux/time.h>
 #include <linux/pagemap.h>
@@ -38,7 +39,6 @@
 #include <linux/poll.h>
 #include <linux/file.h>
 #include <linux/dcache.h>
-#include <asm/uaccess.h>
 
 /* Simple "sleep if unchanged" interface. */
 
@@ -55,9 +55,14 @@
 struct futex_q {
 	struct list_head list;
 	wait_queue_head_t waiters;
+
 	/* Page struct and offset within it. */
 	struct page *page;
 	unsigned int offset;
+
+	/* the virtual => physical cache */
+	vcache_t vcache;
+
 	/* For fd, sigio sent using these. */
 	int fd;
 	struct file *filp;
@@ -85,21 +90,43 @@
 		send_sigio(&q->filp->f_owner, q->fd, POLL_IN);
 }
 
+/* Get kernel address of the user page and pin it. */
+static struct page *pin_page(unsigned long page_start)
+{
+	struct mm_struct *mm = current->mm;
+	struct page *page = NULL;
+	int err;
+
+	down_read(&mm->mmap_sem);
+
+	err = get_user_pages(current, mm, page_start,
+			     1 /* one page */,
+			     0 /* not writable */,
+			     0 /* don't force */,
+			     &page,
+			     NULL /* don't return vmas */);
+	up_read(&mm->mmap_sem);
+	if (err < 0)
+		return ERR_PTR(err);
+	return page;
+}
+
 static inline void unpin_page(struct page *page)
 {
-	/* Avoid releasing the page which is on the LRU list.  I don't
-           know if this is correct, but it stops the BUG() in
-           __free_pages_ok(). */
 	page_cache_release(page);
 }
 
-static int futex_wake(struct list_head *head,
-		      struct page *page,
-		      unsigned int offset,
-		      int num)
+static int futex_wake(unsigned long uaddr, unsigned int offset, int num)
 {
-	struct list_head *i, *next;
-	int num_woken = 0;
+	struct list_head *i, *next, *head;
+	struct page *page;
+	int ret;
+
+	page = pin_page(uaddr - offset);
+	ret = IS_ERR(page);
+	if (ret)
+		goto out;
+	head = hash_futex(page, offset);
 
 	spin_lock(&futex_lock);
 	list_for_each_safe(i, next, head) {
@@ -108,36 +135,81 @@
 		if (this->page == page && this->offset == offset) {
 			list_del_init(i);
 			tell_waiter(this);
-			num_woken++;
-			if (num_woken >= num) break;
+			ret++;
+			if (ret >= num)
+				break;
 		}
 	}
 	spin_unlock(&futex_lock);
-	return num_woken;
+	unpin_page(page);
+out:
+	return ret;
+}
+
+static void futex_vcache_callback(vcache_t *vcache, struct page *new_page)
+{
+	struct futex_q *q = container_of(vcache, struct futex_q, vcache);
+	struct list_head *head = hash_futex(new_page, q->offset);
+
+	BUG_ON(list_empty(&q->list));
+
+	spin_lock(&futex_lock);
+
+	q->page = new_page;
+	list_del_init(&q->list);
+	list_add_tail(&q->list, head);
+
+	spin_unlock(&futex_lock);
 }
 
 /* Add at end to avoid starvation */
-static inline void queue_me(struct list_head *head,
+static inline int queue_me(struct list_head *head,
 			    struct futex_q *q,
 			    struct page *page,
 			    unsigned int offset,
 			    int fd,
-			    struct file *filp)
+			    struct file *filp,
+			    unsigned long uaddr)
 {
-	q->page = page;
+	struct page *tmp;
+	int ret = 0;
+
 	q->offset = offset;
 	q->fd = fd;
 	q->filp = filp;
 
+	spin_lock(&vcache_lock);
 	spin_lock(&futex_lock);
-	list_add_tail(&q->list, head);
+	spin_lock(&current->mm->page_table_lock);
+
+	/*
+	 * Has the mapping changed meanwhile?
+	 */
+	tmp = follow_page(current->mm, uaddr, 0);
+
+	if (tmp == page) {
+		q->page = page;
+		list_add_tail(&q->list, head);
+		/*
+		 * We register a futex callback to this virtual address,
+		 * to make sure a COW properly rehashes the futex-queue.
+		 */
+		__attach_vcache(&q->vcache, uaddr, current->mm, futex_vcache_callback);
+	} else
+		ret = 1;
+
+	spin_unlock(&current->mm->page_table_lock);
 	spin_unlock(&futex_lock);
+	spin_unlock(&vcache_lock);
+
+	return ret;
 }
 
 /* Return 1 if we were still queued (ie. 0 means we were woken) */
 static inline int unqueue_me(struct futex_q *q)
 {
 	int ret = 0;
+
 	spin_lock(&futex_lock);
 	if (!list_empty(&q->list)) {
 		list_del(&q->list);
@@ -147,46 +219,34 @@
 	return ret;
 }
 
-/* Get kernel address of the user page and pin it. */
-static struct page *pin_page(unsigned long page_start)
-{
-	struct mm_struct *mm = current->mm;
-	struct page *page;
-	int err;
-
-	down_read(&mm->mmap_sem);
-	err = get_user_pages(current, mm, page_start,
-			     1 /* one page */,
-			     0 /* writable not important */,
-			     0 /* don't force */,
-			     &page,
-			     NULL /* don't return vmas */);
-	up_read(&mm->mmap_sem);
-
-	if (err < 0)
-		return ERR_PTR(err);
-	return page;
-}
-
-static int futex_wait(struct list_head *head,
-		      struct page *page,
+static int futex_wait(unsigned long uaddr,
 		      int offset,
 		      int val,
-		      int *uaddr,
 		      unsigned long time)
 {
-	int curval;
-	struct futex_q q;
 	DECLARE_WAITQUEUE(wait, current);
-	int ret = 0;
+	struct list_head *head;
+	int ret = 0, curval;
+	struct page *page;
+	struct futex_q q;
+
+repeat_lookup:
+	page = pin_page(uaddr - offset);
+	ret = IS_ERR(page);
+	if (ret)
+		goto out;
+	head = hash_futex(page, offset);
 
 	set_current_state(TASK_INTERRUPTIBLE);
 	init_waitqueue_head(&q.waiters);
 	add_wait_queue(&q.waiters, &wait);
-	queue_me(head, &q, page, offset, -1, NULL);
+	if (queue_me(head, &q, page, offset, -1, NULL, uaddr)) {
+		unpin_page(page);
+		goto repeat_lookup;
+	}
 
 	/* Page is pinned, but may no longer be in this address space. */
-	if (get_user(curval, uaddr) != 0) {
+	if (get_user(curval, (int *)uaddr) != 0) {
 		ret = -EFAULT;
 		goto out;
 	}
@@ -204,11 +264,15 @@
 		ret = -EINTR;
 		goto out;
 	}
- out:
+out:
+	detach_vcache(&q.vcache);
 	set_current_state(TASK_RUNNING);
 	/* Were we woken up anyway? */
 	if (!unqueue_me(&q))
-		return 0;
+		ret = 0;
+	if (page)
+		unpin_page(page);
+
 	return ret;
 }
 
@@ -251,25 +315,26 @@
 
 /* Signal allows caller to avoid the race which would occur if they
    set the sigio stuff up afterwards. */
-static int futex_fd(struct list_head *head,
-		    struct page *page,
-		    int offset,
-		    int signal)
+static int futex_fd(unsigned long uaddr, int offset, int signal)
 {
-	int fd;
+	struct page *page = NULL;
+	struct list_head *head;
 	struct futex_q *q;
 	struct file *filp;
+	int ret;
 
+	ret = -EINVAL;
 	if (signal < 0 || signal > _NSIG)
-		return -EINVAL;
+		goto out;
 
-	fd = get_unused_fd();
-	if (fd < 0)
-		return fd;
+	ret = get_unused_fd();
+	if (ret < 0)
+		goto out;
 	filp = get_empty_filp();
 	if (!filp) {
-		put_unused_fd(fd);
-		return -ENFILE;
+		put_unused_fd(ret);
+		ret = -ENFILE;
+		goto out;
 	}
 	filp->f_op = &futex_fops;
 	filp->f_vfsmnt = mntget(futex_mnt);
@@ -280,37 +345,55 @@
 		
 		ret = f_setown(filp, current->tgid, 1);
 		if (ret) {
-			put_unused_fd(fd);
+			put_unused_fd(ret);
 			put_filp(filp);
-			return ret;
+			goto out;
 		}
 		filp->f_owner.signum = signal;
 	}
 
 	q = kmalloc(sizeof(*q), GFP_KERNEL);
 	if (!q) {
-		put_unused_fd(fd);
+		put_unused_fd(ret);
+		put_filp(filp);
+		ret = -ENOMEM;
+		goto out;
+	}
+
+repeat_lookup:
+	page = pin_page(uaddr - offset);
+	ret = IS_ERR(page);
+	if (ret) {
+		put_unused_fd(ret);
 		put_filp(filp);
-		return -ENOMEM;
+		kfree(q);
+		page = NULL;
+		goto out;
 	}
+	head = hash_futex(page, offset);
 
 	/* Initialize queue structure, and add to hash table. */
 	filp->private_data = q;
 	init_waitqueue_head(&q->waiters);
-	queue_me(head, q, page, offset, fd, filp);
+	if (queue_me(head, q, page, offset, ret, filp, uaddr)) {
+		unpin_page(page);
+		goto repeat_lookup;
+	}
 
 	/* Now we map fd to filp, so userspace can access it */
-	fd_install(fd, filp);
-	return fd;
+	fd_install(ret, filp);
+	page = NULL;
+out:
+	if (page)
+		unpin_page(page);
+	return ret;
 }
 
-asmlinkage int sys_futex(void *uaddr, int op, int val, struct timespec *utime)
+asmlinkage int sys_futex(unsigned long uaddr, int op, int val, struct timespec *utime)
 {
-	int ret;
-	unsigned long pos_in_page;
-	struct list_head *head;
-	struct page *page;
 	unsigned long time = MAX_SCHEDULE_TIMEOUT;
+	unsigned long pos_in_page;
+	int ret;
 
 	if (utime) {
 		struct timespec t;
@@ -319,38 +402,27 @@
 		time = timespec_to_jiffies(&t) + 1;
 	}
 
-	pos_in_page = ((unsigned long)uaddr) % PAGE_SIZE;
+	pos_in_page = uaddr % PAGE_SIZE;
 
 	/* Must be "naturally" aligned, and not on page boundary. */
 	if ((pos_in_page % __alignof__(int)) != 0
 	    || pos_in_page + sizeof(int) > PAGE_SIZE)
 		return -EINVAL;
 
-	/* Simpler if it doesn't vanish underneath us. */
-	page = pin_page((unsigned long)uaddr - pos_in_page);
-	if (IS_ERR(page))
-		return PTR_ERR(page);
-
-	head = hash_futex(page, pos_in_page);
 	switch (op) {
 	case FUTEX_WAIT:
-		ret = futex_wait(head, page, pos_in_page, val, uaddr, time);
+		ret = futex_wait(uaddr, pos_in_page, val, time);
 		break;
 	case FUTEX_WAKE:
-		ret = futex_wake(head, page, pos_in_page, val);
+		ret = futex_wake(uaddr, pos_in_page, val);
 		break;
 	case FUTEX_FD:
 		/* non-zero val means F_SETOWN(getpid()) & F_SETSIG(val) */
-		ret = futex_fd(head, page, pos_in_page, val);
-		if (ret >= 0)
-			/* Leave page pinned (attached to fd). */
-			return ret;
+		ret = futex_fd(uaddr, pos_in_page, val);
 		break;
 	default:
 		ret = -EINVAL;
 	}
-	unpin_page(page);
-
 	return ret;
 }
 
--- linux/kernel/fork.c.orig	Fri Sep 27 16:20:01 2002
+++ linux/kernel/fork.c	Fri Sep 27 16:20:22 2002
@@ -381,7 +381,7 @@
 		 * not set up a proper pointer then tough luck.
 		 */
 		put_user(0, tsk->user_tid);
-		sys_futex(tsk->user_tid, FUTEX_WAKE, 1, NULL);
+		sys_futex((unsigned long)tsk->user_tid, FUTEX_WAKE, 1, NULL);
 	}
 }
 
--- linux/mm/memory.c.orig	Fri Sep 20 17:20:25 2002
+++ linux/mm/memory.c	Fri Sep 27 20:40:52 2002
@@ -43,6 +43,7 @@
 #include <linux/iobuf.h>
 #include <linux/highmem.h>
 #include <linux/pagemap.h>
+#include <linux/vcache.h>
 
 #include <asm/pgalloc.h>
 #include <asm/rmap.h>
@@ -463,7 +464,7 @@
  * Do a quick page-table lookup for a single page.
  * mm->page_table_lock must be held.
  */
-static inline struct page *
+struct page *
 follow_page(struct mm_struct *mm, unsigned long address, int write) 
 {
 	pgd_t *pgd;
@@ -494,7 +495,7 @@
 	}
 
 out:
-	return 0;
+	return NULL;
 }
 
 /* 
@@ -973,6 +974,7 @@
 static inline void break_cow(struct vm_area_struct * vma, struct page * new_page, unsigned long address, 
 		pte_t *page_table)
 {
+	invalidate_vcache(address, vma->vm_mm, new_page);
 	flush_page_to_ram(new_page);
 	flush_cache_page(vma, address);
 	establish_pte(vma, address, page_table, pte_mkwrite(pte_mkdirty(mk_pte(new_page, vma->vm_page_prot))));
--- linux/mm/vcache.c.orig	Fri Sep 27 14:58:10 2002
+++ linux/mm/vcache.c	Fri Sep 27 20:33:22 2002
@@ -0,0 +1,93 @@
+/*
+ *  linux/mm/vcache.c
+ *
+ *  virtual => physical page mapping cache. Users of this mechanism
+ *  register callbacks for a given (virt,mm,phys) page mapping, and
+ *  the kernel guarantees to call back when this mapping is invalidated.
+ *  (ie. upon COW or unmap.)
+ *
+ *  Started by Ingo Molnar, Copyright (C) 2002
+ */
+
+#include <linux/mm.h>
+#include <linux/init.h>
+#include <linux/hash.h>
+#include <linux/vcache.h>
+
+#define VCACHE_HASHBITS 8
+#define VCACHE_HASHSIZE (1 << VCACHE_HASHBITS)
+
+spinlock_t vcache_lock = SPIN_LOCK_UNLOCKED;
+
+static struct list_head hash[VCACHE_HASHSIZE];
+
+static struct list_head *hash_vcache(unsigned long address,
+					struct mm_struct *mm)
+{
+        return &hash[hash_long(address + (unsigned long)mm, VCACHE_HASHBITS)];
+}
+
+void __attach_vcache(vcache_t *vcache,
+		unsigned long address,
+		struct mm_struct *mm,
+		void (*callback)(struct vcache_s *data, struct page *new))
+{
+	struct list_head *hash_head;
+
+	address &= PAGE_MASK;
+	vcache->address = address;
+	vcache->mm = mm;
+	vcache->callback = callback;
+
+	hash_head = hash_vcache(address, mm);
+
+	list_add(&vcache->hash_entry, hash_head);
+}
+
+void detach_vcache(vcache_t *vcache)
+{
+	spin_lock(&vcache_lock);
+	list_del(&vcache->hash_entry);
+	spin_unlock(&vcache_lock);
+}
+
+void invalidate_vcache(unsigned long address, struct mm_struct *mm,
+				struct page *new_page)
+{
+	struct list_head *l, *hash_head;
+	vcache_t *vcache;
+
+	address &= PAGE_MASK;
+
+	hash_head = hash_vcache(address, mm);
+	/*
+	 * This is safe, because this path is called with the mm
+	 * semaphore read-held, and the add/remove path calls with the
+	 * mm semaphore write-held. Other mm's might add new entries
+	 * in parallel, but *this* mm is locked out, so if the list is
+	 * empty now then we do not have to take the vcache lock to
+	 * see it's really empty.
+	 */
+	if (likely(list_empty(hash_head)))
+		return;
+
+	spin_lock(&vcache_lock);
+	list_for_each(l, hash_head) {
+		vcache = list_entry(l, vcache_t, hash_entry);
+		if (vcache->address != address || vcache->mm != mm)
+			continue;
+		vcache->callback(vcache, new_page);
+	}
+	spin_unlock(&vcache_lock);
+}
+
+static int __init vcache_init(void)
+{
+        unsigned int i;
+
+	for (i = 0; i < VCACHE_HASHSIZE; i++)
+		INIT_LIST_HEAD(hash + i);
+	return 0;
+}
+__initcall(vcache_init);
+
--- linux/mm/Makefile.orig	Fri Sep 27 15:03:25 2002
+++ linux/mm/Makefile	Fri Sep 27 15:03:31 2002
@@ -9,6 +9,6 @@
 	    vmalloc.o slab.o bootmem.o swap.o vmscan.o page_io.o \
 	    page_alloc.o swap_state.o swapfile.o numa.o oom_kill.o \
 	    shmem.o highmem.o mempool.o msync.o mincore.o readahead.o \
-	    pdflush.o page-writeback.o rmap.o madvise.o
+	    pdflush.o page-writeback.o rmap.o madvise.o vcache.o
 
 include $(TOPDIR)/Rules.make



* Re: [patch] 'virtual => physical page mapping cache' take #2, vcache-2.5.38-C4
From: Ingo Molnar @ 2002-09-27 19:12 UTC
  To: Linus Torvalds; +Cc: Andrew Morton, Rusty Russell, linux-kernel


> [...] The method is to do a cheap physpage lookup (ie. only the
> read-lock is taken, the page is forced writable), [...]
                                 ^---not



* Re: [patch] 'virtual => physical page mapping cache',vcache-2.5.38-B8
From: Alan Cox @ 2002-09-27 19:16 UTC
  To: Andrew Morton
  Cc: Ingo Molnar, Chris Wedgwood, Linus Torvalds, Rusty Russell, linux-kernel

On Fri, 2002-09-27 at 19:35, Andrew Morton wrote:
> O_DIRECT writes operate under i_sem, which provides exclusion
> from truncate.  Do you know something which I don't??

So it does, hidden away in generic_file_write rather than the lower
layers. Interesting. So now I have a new question to resolve, which is
why doing O_DIRECT and truncate together corrupted my disk when I tried
it trying to break stuff




* Re: [patch] 'virtual => physical page mappingcache',vcache-2.5.38-B8
From: Andrew Morton @ 2002-09-27 19:25 UTC
  To: Alan Cox; +Cc: linux-kernel

Alan Cox wrote:
> 
> On Fri, 2002-09-27 at 19:35, Andrew Morton wrote:
> > O_DIRECT writes operate under i_sem, which provides exclusion
> > from truncate.  Do you know something which I don't??
> 
> So it does, hidden away in generic_file_write rather than the lower
> layers.

Well that's sort of the same level at which truncate takes it.
It's a pretty big lock.

> Interesting. So now I have a new question to resolve, which is
> why doing O_DIRECT and truncate together corrupted my disk when I tried
> it trying to break stuff

Interesting indeed.  Possibly invalidate_inode_pages2() accidentally
left some dirty buffers detached from the mapping.  Hard to see how
it could do that.

What kernel were you testing?


* Re: [patch] 'virtual => physical page mappingcache',vcache-2.5.38-B8
From: Alan Cox @ 2002-09-27 20:02 UTC
  To: Andrew Morton; +Cc: linux-kernel

> > Interesting. So now I have a new question to resolve, which is
> > why doing O_DIRECT and truncate together corrupted my disk when I tried
> > it trying to break stuff
> 
> Interesting indeed.  Possibly invalidate_inode_pages2() accidentally
> left some dirty buffers detached from the mapping.  Hard to see how
> it could do that.
> 
> What kernel were you testing?

2.4.20pre5-ac with new IDE. So there are several other candidates 8)



* Re: [patch] 'virtual => physical page mapping cache', vcache-2.5.38-B8
From: Arjan van de Ven @ 2002-09-28 10:22 UTC
  To: Linus Torvalds; +Cc: Ingo Molnar, Andrew Morton, Rusty Russell, linux-kernel


On Fri, 2002-09-27 at 18:26, Linus Torvalds wrote:
> 
> On Fri, 27 Sep 2002, Ingo Molnar wrote:
> > 
> > the attached patch implements the virtual => physical cache. Right now
> > only the COW code calls the invalidation function, because futexes do not
> > need notification on unmap.
> 
> Ok, looks good. Except you make get_user_page() do a write fault on the 
> page, and one of the points of this approach was that that shouldn't even 
> be needed. Or did I miss some case that does need it?

get_user_page() cannot/should not ever do a pagefault via the pagefault
code, otherwise the coredump code will take the mmap semaphore
recursively and deadlock.



* Re: [patch] 'virtual => physical page mapping cache' take #2, vcache-2.5.38-C4
From: Rusty Russell @ 2002-10-15  5:15 UTC
  To: Ingo Molnar; +Cc: Linus Torvalds, Andrew Morton, linux-kernel

In message <Pine.LNX.4.44.0209272043260.17678-100000@localhost.localdomain> you
 write:
> +	page = pin_page(uaddr - offset);
> +	ret = IS_ERR(page);
> +	if (ret)
> +		goto out;
> +	head = hash_futex(page, offset);

Um, you mean:
	if (IS_ERR(page)) {
		ret = PTR_ERR(page);
		goto out;
	}

Cheers,
Rusty.
--
  Anyone who quotes me in their sig is an idiot. -- Rusty Russell.


* Re: [patch] 'virtual => physical page mapping cache' take #2, vcache-2.5.38-C4
From: David Mosberger-Tang @ 2002-09-27 21:18 UTC
  To: linux-kernel

>>>>> On Fri, 27 Sep 2002 20:50:08 +0200, Ingo Molnar <mingo@elte.hu> said:

  Ingo> tmp = follow_page(current->mm, uaddr, 0);

How about making this a platform-specific routine?  I think a fair
number of arches provide a special instruction for this (at least when
mm == current->mm).

	--david
--
Interested in learning more about IA-64 Linux?  Try http://www.lia64.org/book/

