* [patch] 'virtual => physical page mapping cache', vcache-2.5.38-B8 @ 2002-09-27 16:24 Ingo Molnar 2002-09-27 16:26 ` Linus Torvalds 0 siblings, 1 reply; 23+ messages in thread From: Ingo Molnar @ 2002-09-27 16:24 UTC (permalink / raw) To: Linus Torvalds; +Cc: Andrew Morton, Rusty Russell, linux-kernel the attached patch implements the virtual => physical cache. Right now only the COW code calls the invalidation function, because futexes do not need notification on unmap. I have fixed a new futex bug as well: pin_page() alone does not guarantee that the mapping does not change magically, only taking the MM semaphore in write-mode does. this also enables us to avoid taking the vcache_lock in the COW fastpath, so the overhead should be pretty low, unless there are futexes in the given hash-list, in which case we obviously have to take the vcache_lock. with this patch applied to BK-curr all our fork testcases pass just fine. taking the MM write-lock is a bit strong, but i found no good way to avoid it - we have to hold on to the pte mapping. get_user_pages() without the mm lock taken is pretty meaningless - any other thread could in parallel COW the given page and fault in a completely different page. Are other get_user_page() users [such as direct-IO] protected against this scenario? Ingo --- linux/include/linux/vcache.h.orig Fri Sep 27 15:05:33 2002 +++ linux/include/linux/vcache.h Fri Sep 27 16:07:42 2002 @@ -0,0 +1,26 @@ +/* + * virtual => physical mapping cache support. 
+ */ +#ifndef _LINUX_VCACHE_H +#define _LINUX_VCACHE_H + +typedef struct vcache_s { + unsigned long address; + struct mm_struct *mm; + struct list_head hash_entry; + void (*callback)(struct vcache_s *data, struct page *new_page); +} vcache_t; + +extern spinlock_t vcache_lock; + +extern void __attach_vcache(vcache_t *vcache, + unsigned long address, + struct mm_struct *mm, + void (*callback)(struct vcache_s *data, struct page *new_page)); + +extern void detach_vcache(vcache_t *vcache); + +extern void invalidate_vcache(unsigned long address, struct mm_struct *mm, + struct page *new_page); + +#endif --- linux/include/linux/futex.h.orig Fri Sep 27 16:14:10 2002 +++ linux/include/linux/futex.h Fri Sep 27 16:14:16 2002 @@ -6,6 +6,6 @@ #define FUTEX_WAKE (1) #define FUTEX_FD (2) -extern asmlinkage int sys_futex(void *uaddr, int op, int val, struct timespec *utime); +extern asmlinkage int sys_futex(unsigned long uaddr, int op, int val, struct timespec *utime); #endif --- linux/kernel/futex.c.orig Fri Sep 20 17:20:21 2002 +++ linux/kernel/futex.c Fri Sep 27 18:11:14 2002 @@ -31,6 +31,7 @@ #include <linux/init.h> #include <linux/fs.h> #include <linux/futex.h> +#include <linux/vcache.h> #include <linux/highmem.h> #include <linux/time.h> #include <linux/pagemap.h> @@ -38,7 +39,6 @@ #include <linux/poll.h> #include <linux/file.h> #include <linux/dcache.h> -#include <asm/uaccess.h> /* Simple "sleep if unchanged" interface. */ @@ -55,9 +55,14 @@ struct futex_q { struct list_head list; wait_queue_head_t waiters; + /* Page struct and offset within it. */ struct page *page; unsigned int offset; + + /* the virtual => physical cache */ + vcache_t vcache; + /* For fd, sigio sent using these. */ int fd; struct file *filp; @@ -87,9 +92,6 @@ static inline void unpin_page(struct page *page) { - /* Avoid releasing the page which is on the LRU list. I don't - know if this is correct, but it stops the BUG() in - __free_pages_ok(). 
*/ page_cache_release(page); } @@ -113,31 +115,61 @@ } } spin_unlock(&futex_lock); + + up_write(¤t->mm->mmap_sem); + return num_woken; } +static void futex_vcache_callback(vcache_t *vcache, struct page *new_page) +{ + struct futex_q *q = container_of(vcache, struct futex_q, vcache); + struct list_head *head; + + spin_lock(&futex_lock); + + q->page = new_page; + head = hash_futex(new_page, q->offset); + BUG_ON(list_empty(&q->list)); + list_del_init(&q->list); + list_add_tail(&q->list, head); + + spin_unlock(&futex_lock); +} + /* Add at end to avoid starvation */ static inline void queue_me(struct list_head *head, struct futex_q *q, struct page *page, unsigned int offset, int fd, - struct file *filp) + struct file *filp, + unsigned long uaddr) { q->page = page; q->offset = offset; q->fd = fd; q->filp = filp; + spin_lock(&vcache_lock); + /* + * We register a futex callback to this virtual address, + * to make sure a COW properly rehashes the futex-queue. + */ + __attach_vcache(&q->vcache, uaddr, current->mm, futex_vcache_callback); + spin_lock(&futex_lock); list_add_tail(&q->list, head); spin_unlock(&futex_lock); + spin_unlock(&vcache_lock); } /* Return 1 if we were still queued (ie. 
0 means we were woken) */ static inline int unqueue_me(struct futex_q *q) { int ret = 0; + + detach_vcache(&q->vcache); spin_lock(&futex_lock); if (!list_empty(&q->list)) { list_del(&q->list); @@ -151,17 +183,15 @@ static struct page *pin_page(unsigned long page_start) { struct mm_struct *mm = current->mm; - struct page *page; + struct page *page = NULL; int err; - down_read(&mm->mmap_sem); err = get_user_pages(current, mm, page_start, 1 /* one page */, - 0 /* writable not important */, + 1 /* writable */, 0 /* don't force */, &page, NULL /* don't return vmas */); - up_read(&mm->mmap_sem); if (err < 0) return ERR_PTR(err); @@ -172,8 +202,8 @@ struct page *page, int offset, int val, - int *uaddr, - unsigned long time) + unsigned long time, + unsigned long uaddr) { int curval; struct futex_q q; @@ -183,10 +213,12 @@ set_current_state(TASK_INTERRUPTIBLE); init_waitqueue_head(&q.waiters); add_wait_queue(&q.waiters, &wait); - queue_me(head, &q, page, offset, -1, NULL); + queue_me(head, &q, page, offset, -1, NULL, uaddr); + + up_write(¤t->mm->mmap_sem); /* Page is pinned, but may no longer be in this address space. 
*/ - if (get_user(curval, uaddr) != 0) { + if (get_user(curval, (int *)uaddr) != 0) { ret = -EFAULT; goto out; } @@ -254,22 +286,25 @@ static int futex_fd(struct list_head *head, struct page *page, int offset, - int signal) + int signal, + unsigned long uaddr) { - int fd; struct futex_q *q; struct file *filp; + int fd; + fd = -EINVAL; if (signal < 0 || signal > _NSIG) - return -EINVAL; + goto out; fd = get_unused_fd(); if (fd < 0) - return fd; + goto out; filp = get_empty_filp(); if (!filp) { put_unused_fd(fd); - return -ENFILE; + fd = -ENFILE; + goto out; } filp->f_op = &futex_fops; filp->f_vfsmnt = mntget(futex_mnt); @@ -282,7 +317,8 @@ if (ret) { put_unused_fd(fd); put_filp(filp); - return ret; + fd = ret; + goto out; } filp->f_owner.signum = signal; } @@ -291,26 +327,29 @@ if (!q) { put_unused_fd(fd); put_filp(filp); - return -ENOMEM; + fd = -ENOMEM; + goto out; } /* Initialize queue structure, and add to hash table. */ filp->private_data = q; init_waitqueue_head(&q->waiters); - queue_me(head, q, page, offset, fd, filp); + queue_me(head, q, page, offset, fd, filp, uaddr); /* Now we map fd to filp, so userspace can access it */ fd_install(fd, filp); +out: + up_write(¤t->mm->mmap_sem); return fd; } -asmlinkage int sys_futex(void *uaddr, int op, int val, struct timespec *utime) +asmlinkage int sys_futex(unsigned long uaddr, int op, int val, struct timespec *utime) { - int ret; + unsigned long time = MAX_SCHEDULE_TIMEOUT; unsigned long pos_in_page; struct list_head *head; struct page *page; - unsigned long time = MAX_SCHEDULE_TIMEOUT; + int ret; if (utime) { struct timespec t; @@ -319,38 +358,43 @@ time = timespec_to_jiffies(&t) + 1; } - pos_in_page = ((unsigned long)uaddr) % PAGE_SIZE; + pos_in_page = uaddr % PAGE_SIZE; /* Must be "naturally" aligned, and not on page boundary. */ if ((pos_in_page % __alignof__(int)) != 0 || pos_in_page + sizeof(int) > PAGE_SIZE) return -EINVAL; - /* Simpler if it doesn't vanish underneath us. 
*/ - page = pin_page((unsigned long)uaddr - pos_in_page); - if (IS_ERR(page)) - return PTR_ERR(page); + /* Simpler if it mappings do not change underneath us. */ + down_write(¤t->mm->mmap_sem); + + page = pin_page(uaddr - pos_in_page); + ret = IS_ERR(page); + if (ret) + goto out; head = hash_futex(page, pos_in_page); switch (op) { case FUTEX_WAIT: - ret = futex_wait(head, page, pos_in_page, val, uaddr, time); + ret = futex_wait(head, page, pos_in_page, val, time, uaddr); break; case FUTEX_WAKE: ret = futex_wake(head, page, pos_in_page, val); break; case FUTEX_FD: /* non-zero val means F_SETOWN(getpid()) & F_SETSIG(val) */ - ret = futex_fd(head, page, pos_in_page, val); + ret = futex_fd(head, page, pos_in_page, val, uaddr); if (ret >= 0) /* Leave page pinned (attached to fd). */ - return ret; + goto out; break; default: + up_write(¤t->mm->mmap_sem); ret = -EINVAL; } unpin_page(page); +out: return ret; } --- linux/kernel/fork.c.orig Fri Sep 27 16:20:01 2002 +++ linux/kernel/fork.c Fri Sep 27 16:20:22 2002 @@ -381,7 +381,7 @@ * not set up a proper pointer then tough luck. 
*/ put_user(0, tsk->user_tid); - sys_futex(tsk->user_tid, FUTEX_WAKE, 1, NULL); + sys_futex((unsigned long)tsk->user_tid, FUTEX_WAKE, 1, NULL); } } --- linux/mm/memory.c.orig Fri Sep 20 17:20:25 2002 +++ linux/mm/memory.c Fri Sep 27 16:48:12 2002 @@ -43,6 +43,7 @@ #include <linux/iobuf.h> #include <linux/highmem.h> #include <linux/pagemap.h> +#include <linux/vcache.h> #include <asm/pgalloc.h> #include <asm/rmap.h> @@ -973,6 +974,7 @@ static inline void break_cow(struct vm_area_struct * vma, struct page * new_page, unsigned long address, pte_t *page_table) { + invalidate_vcache(address, vma->vm_mm, new_page); flush_page_to_ram(new_page); flush_cache_page(vma, address); establish_pte(vma, address, page_table, pte_mkwrite(pte_mkdirty(mk_pte(new_page, vma->vm_page_prot)))); --- linux/mm/vcache.c.orig Fri Sep 27 14:58:10 2002 +++ linux/mm/vcache.c Fri Sep 27 18:11:07 2002 @@ -0,0 +1,93 @@ +/* + * linux/mm/vcache.c + * + * virtual => physical page mapping cache. Users of this mechanism + * register callbacks for a given (virt,mm,phys) page mapping, and + * the kernel guarantees to call back when this mapping is invalidated. + * (ie. upon COW or unmap.) 
+ * + * Started by Ingo Molnar, Copyright (C) 2002 + */ + +#include <linux/mm.h> +#include <linux/init.h> +#include <linux/hash.h> +#include <linux/vcache.h> + +#define VCACHE_HASHBITS 8 +#define VCACHE_HASHSIZE (1 << VCACHE_HASHBITS) + +spinlock_t vcache_lock = SPIN_LOCK_UNLOCKED; + +static struct list_head hash[VCACHE_HASHSIZE]; + +static struct list_head *hash_vcache(unsigned long address, + struct mm_struct *mm) +{ + return &hash[hash_long(address + (unsigned long)mm, VCACHE_HASHBITS)]; +} + +void __attach_vcache(vcache_t *vcache, + unsigned long address, + struct mm_struct *mm, + void (*callback)(struct vcache_s *data, struct page *new)) +{ + struct list_head *hash_head; + + address &= PAGE_MASK; + vcache->address = address; + vcache->mm = mm; + vcache->callback = callback; + + hash_head = hash_vcache(address, mm); + + list_add(&vcache->hash_entry, hash_head); +} + +void detach_vcache(vcache_t *vcache) +{ + spin_lock(&vcache_lock); + list_del(&vcache->hash_entry); + spin_unlock(&vcache_lock); +} + +void invalidate_vcache(unsigned long address, struct mm_struct *mm, + struct page *new_page) +{ + struct list_head *l, *hash_head; + vcache_t *vcache; + + address &= PAGE_MASK; + + hash_head = hash_vcache(address, mm); + /* + * This is safe, because this path is called with the mm + * semaphore read-held, and the add/remove path calls with the + * mm semaphore write-held. So while other mm's might add new + * entries in parallel, and *this* mm is locked out, so if the + * list is empty now then we do not have to take the vcache + * lock to see it's really empty. 
+ */ + if (likely(list_empty(hash_head))) + return; + + spin_lock(&vcache_lock); + list_for_each(l, hash_head) { + vcache = list_entry(l, vcache_t, hash_entry); + if (vcache->address != address || vcache->mm != mm) + continue; + vcache->callback(vcache, new_page); + } + spin_unlock(&vcache_lock); +} + +static int __init vcache_init(void) +{ + unsigned int i; + + for (i = 0; i < VCACHE_HASHSIZE; i++) + INIT_LIST_HEAD(hash + i); + return 0; +} +__initcall(vcache_init); + --- linux/mm/Makefile.orig Fri Sep 27 15:03:25 2002 +++ linux/mm/Makefile Fri Sep 27 15:03:31 2002 @@ -9,6 +9,6 @@ vmalloc.o slab.o bootmem.o swap.o vmscan.o page_io.o \ page_alloc.o swap_state.o swapfile.o numa.o oom_kill.o \ shmem.o highmem.o mempool.o msync.o mincore.o readahead.o \ - pdflush.o page-writeback.o rmap.o madvise.o + pdflush.o page-writeback.o rmap.o madvise.o vcache.o include $(TOPDIR)/Rules.make ^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [patch] 'virtual => physical page mapping cache', vcache-2.5.38-B8 2002-09-27 16:24 [patch] 'virtual => physical page mapping cache', vcache-2.5.38-B8 Ingo Molnar @ 2002-09-27 16:26 ` Linus Torvalds 2002-09-27 16:42 ` Ingo Molnar 2002-09-28 10:22 ` [patch] 'virtual => physical page mapping cache', vcache-2.5.38-B8 Arjan van de Ven 0 siblings, 2 replies; 23+ messages in thread From: Linus Torvalds @ 2002-09-27 16:26 UTC (permalink / raw) To: Ingo Molnar; +Cc: Andrew Morton, Rusty Russell, linux-kernel On Fri, 27 Sep 2002, Ingo Molnar wrote: > > the attached patch implements the virtual => physical cache. Right now > only the COW code calls the invalidation function, because futexes do not > need notification on unmap. Ok, looks good. Except you make get_user_page() do a write fault on the page, and one of the points of this approach was that that shouldn't even be needed. Or did I miss some case that does need it? > I have fixed a new futex bug as well: pin_page() alone does not guarantee > that the mapping does not change magically, only taking the MM semaphore > in write-mode does. And this makes no sense to me. A read lock on the semaphore should give you all the same protection as a write lock does. To protect against the swapper etc, you really need to get the mm spinlock, not the semaphore. And once you have the spinlock, you should be totally safe. Please explain what you saw? Linus ^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [patch] 'virtual => physical page mapping cache', vcache-2.5.38-B8 2002-09-27 16:26 ` Linus Torvalds @ 2002-09-27 16:42 ` Ingo Molnar 2002-09-27 16:47 ` Linus Torvalds 2002-09-28 10:22 ` [patch] 'virtual => physical page mapping cache', vcache-2.5.38-B8 Arjan van de Ven 1 sibling, 1 reply; 23+ messages in thread From: Ingo Molnar @ 2002-09-27 16:42 UTC (permalink / raw) To: Linus Torvalds; +Cc: Andrew Morton, Rusty Russell, linux-kernel On Fri, 27 Sep 2002, Linus Torvalds wrote: > > I have fixed a new futex bug as well: pin_page() alone does not guarantee > > that the mapping does not change magically, only taking the MM semaphore > > in write-mode does. > > And this makes no sense to me. > > A read lock on the semaphore should give you all the same protection as > a write lock does. > > To protect against the swapper etc, you really need to get the mm > spinlock, not the semaphore. And once you have the spinlock, you should > be totally safe. Please explain what you saw? the futex code needs to 'get to the physical page', 'hash the futex queue' and 'hash the vcache entry' atomically. otherwise a physical page might magically change under us - eg. another thread doing a fork(), which marks the futex's mapping COW, then the same thread also hits the COW - all this after the get_user_page() call, but before the 'hash futex and vcache' thing. I took the mm semaphore in write-mode to exclude COWs [or parallel munmap()s]. The swapper is something i did not think about, but that makes things even worse. So it appears that all 3 operations have to be done at once, with the page_table_lock held? Ie. first a slow (and possibly blocking) get_user_pages() call, then we would take the page_table_lock, call a (new) atomic version of get_user_pages() which just re-checks the virt => phys mapping without blocking or taking any spinlock, then if the page has not changed meanwhile we hash the futex and the vcache. 
If the page has changed meanwhile then we re-try the slow get_user_pages() call. am i missing something? Ingo ^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [patch] 'virtual => physical page mapping cache', vcache-2.5.38-B8 2002-09-27 16:42 ` Ingo Molnar @ 2002-09-27 16:47 ` Linus Torvalds 2002-09-27 17:01 ` Ingo Molnar 0 siblings, 1 reply; 23+ messages in thread From: Linus Torvalds @ 2002-09-27 16:47 UTC (permalink / raw) To: Ingo Molnar; +Cc: Andrew Morton, Rusty Russell, linux-kernel On Fri, 27 Sep 2002, Ingo Molnar wrote: > > the futex code needs to 'get to the physical page', 'hash the futex queue' > and 'hash the vcache entry' atomically. I really don't think it needs to. It really needs to do - hash the vcache entry _first_ - not care about the rest Once it hashes the vcache entry, we already know that all future COW's will tell us about themselves, and since out callback will serialize them with the futex lock, we're now done. That's kind of the whole point of the vcache - it's the "race avoidance" thing. And the vcache doesn't need to know what the physical page was: it only needs the address and the VM, which we have a priori. So we _can_ do all of this first. You may want to do something clever to avoid taking the vcache lock in the COW path unless there is real reason to believe that you have to, but as far as I can tell, you can do that by simply adding a memory barrier to the end of the "insert vcache entry" thing. Because once you do that, and you make sure that the COW thing does the vcache callback _after_ changing the page tables, the COW code can - do the hash (which is invariant and has no races, since it depends solely on the virtual address and the VM) - check if the hash queue is empty - only if the hash queue is non-empty does the COW code need to get the vcache lock (and obviously it needs to re-load the hash entry from the queue after it got the lock, but the hash is still valid) And the COW case is the "hard" one. The swapper one is simple, just handled by gatting the page table spinlock in get_user_pages() (which we must already be doing, or we'd already be screwed). 
Maybe I'm missing something, but the locking really doesn't look all that problematic if we just do things in the obvious order.. Linus ^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [patch] 'virtual => physical page mapping cache', vcache-2.5.38-B8 2002-09-27 16:47 ` Linus Torvalds @ 2002-09-27 17:01 ` Ingo Molnar 2002-09-27 17:05 ` Linus Torvalds 0 siblings, 1 reply; 23+ messages in thread From: Ingo Molnar @ 2002-09-27 17:01 UTC (permalink / raw) To: Linus Torvalds; +Cc: Andrew Morton, Rusty Russell, linux-kernel On Fri, 27 Sep 2002, Linus Torvalds wrote: > You may want to do something clever to avoid taking the vcache lock in > the COW path unless there is real reason to believe that you have to, > but as far as I can tell, you can do that by simply adding a memory > barrier to the end of the "insert vcache entry" thing. the COW path lookup code already avoids taking the vcache lock - unless the hash-head is non-empty. > Because once you do that, and you make sure that the COW thing does the > vcache callback _after_ changing the page tables, the COW code can > > - do the hash (which is invariant and has no races, since it depends > solely on the virtual address and the VM) > - check if the hash queue is empty > - only if the hash queue is non-empty does the COW code need to get the > vcache lock (and obviously it needs to re-load the hash entry from the > queue after it got the lock, but the hash is still valid) yeah, this is the optimization i have described in the previous mail, and which is in the patch. > Maybe I'm missing something, but the locking really doesn't look all > that problematic if we just do things in the obvious order.. i agree, first hashing the vcache should work. There are some details: if we first hash the vcache then we have to set up the queue in a way for the callback function to notice that this queue is not futex-hashed (ie. not live) yet. Otherwise the callback function might attempt to rehash it. This means one more branch in the callback function, not a problem. Ingo ^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [patch] 'virtual => physical page mapping cache', vcache-2.5.38-B8 2002-09-27 17:01 ` Ingo Molnar @ 2002-09-27 17:05 ` Linus Torvalds 2002-09-27 17:33 ` Ingo Molnar 0 siblings, 1 reply; 23+ messages in thread From: Linus Torvalds @ 2002-09-27 17:05 UTC (permalink / raw) To: Ingo Molnar; +Cc: Andrew Morton, Rusty Russell, linux-kernel On Fri, 27 Sep 2002, Ingo Molnar wrote: > > i agree, first hashing the vcache should work. There are some details: > if we first hash the vcache then we have to set up the queue in a way for > the callback function to notice that this queue is not futex-hashed (ie. > not live) yet. Otherwise the callback function might attempt to rehash it. > This means one more branch in the callback function, not a problem. Not necessarily. You can solve _this_ problem by just holding the futex lock over the whole operation. If the COW happens "early" on another CPU, it will then be serialized by the futex lock in the callback function. The futex lock itself is hopefully not going to be contended very much, making the "get it early" approach acceptable. Although if it later _does_ get contended, we'll need to split it up some way: it cannot be hashed sanely with the above approach, since there are no unique sub-indexes to hash off as the physical page and offset have zero correlation with the VM / virtual address. Linus ^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [patch] 'virtual => physical page mapping cache', vcache-2.5.38-B8 2002-09-27 17:05 ` Linus Torvalds @ 2002-09-27 17:33 ` Ingo Molnar 2002-09-27 17:32 ` Linus Torvalds 2002-09-27 17:44 ` [patch] 'virtual => physical page mapping cache', vcache-2.5.38-B8 Ingo Molnar 0 siblings, 2 replies; 23+ messages in thread From: Ingo Molnar @ 2002-09-27 17:33 UTC (permalink / raw) To: Linus Torvalds; +Cc: Andrew Morton, Rusty Russell, linux-kernel thinking about it some more, first attaching the vcache does not help. the problem is that we want to catch all COW events of the virtual address, *and* we want to have a correct (physpage,offset) futex hash. if we do the following: q.page = NULL; attach_vcache(&q.vcache, uaddr, current->mm, futex_vcache_callback); page = pin_page(uaddr - offset); ret = IS_ERR(page); if (ret) goto out; head = hash_futex(page, offset); (*) set_current_state(TASK_INTERRUPTIBLE); init_waitqueue_head(&q.waiters); add_wait_queue(&q.waiters, &wait); queue_me(head, &q, page, offset, -1, NULL, uaddr); then we have the following race at (*): the futex is not yet hashed, but it's already in the vcache hash. Nevertheless a parallel COW only generates a useless callback - the futex is not hashed yet. The effect is that the above 'page' address is in fact the old COW page [after the race only mapped into some other MM], and the queue_me() futex hashing uses the old page. the only way i can see to do this race-less is what i suggested two mails ago: when hashing the futex we have to take the pagetable lock and have to check the actual current page that is mapped. in fact there's another race as well: what if a parallel thread COWs the page, faults it in (thus creates a new physical page), which then kswapd swaps out. Ie. purely checking the pte (and using any potential new page for the hash) is not enough, we really have to re-try the pin_pages() call if the pte has changed meanwhile. Ingo ^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [patch] 'virtual => physical page mapping cache', vcache-2.5.38-B8 2002-09-27 17:33 ` Ingo Molnar @ 2002-09-27 17:32 ` Linus Torvalds 2002-09-27 17:42 ` Ingo Molnar 2002-09-27 17:44 ` [patch] 'virtual => physical page mapping cache', vcache-2.5.38-B8 Ingo Molnar 1 sibling, 1 reply; 23+ messages in thread From: Linus Torvalds @ 2002-09-27 17:32 UTC (permalink / raw) To: Ingo Molnar; +Cc: Andrew Morton, Rusty Russell, linux-kernel On Fri, 27 Sep 2002, Ingo Molnar wrote: > > the problem is that we want to catch all COW events of the virtual > address, *and* we want to have a correct (physpage,offset) futex hash. So? spin_lock(&futex_lock); > q.page = NULL; > attach_vcache(&q.vcache, uaddr, current->mm, futex_vcache_callback); > > page = pin_page(uaddr - offset); > ret = IS_ERR(page); > if (ret) > goto out; > head = hash_futex(page, offset); > set_current_state(TASK_INTERRUPTIBLE); > init_waitqueue_head(&q.waiters); > add_wait_queue(&q.waiters, &wait); > queue_me(head, &q, page, offset, -1, NULL, uaddr); spin_unlock(&futex_lock); And get the futex_lock in the callback. Linus ^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [patch] 'virtual => physical page mapping cache', vcache-2.5.38-B8 2002-09-27 17:32 ` Linus Torvalds @ 2002-09-27 17:42 ` Ingo Molnar 2002-09-27 17:48 ` Linus Torvalds 0 siblings, 1 reply; 23+ messages in thread From: Ingo Molnar @ 2002-09-27 17:42 UTC (permalink / raw) To: Linus Torvalds; +Cc: Andrew Morton, Rusty Russell, linux-kernel i'd love to do this, but: > spin_lock(&futex_lock); > > > q.page = NULL; > > attach_vcache(&q.vcache, uaddr, current->mm, futex_vcache_callback); > > > > page = pin_page(uaddr - offset); [ LOCKUP ] pin_page() calls get_user_pages(), which might block in handle_mm_fault(). Ingo ^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [patch] 'virtual => physical page mapping cache', vcache-2.5.38-B8 2002-09-27 17:42 ` Ingo Molnar @ 2002-09-27 17:48 ` Linus Torvalds 2002-09-27 18:54 ` [patch] 'virtual => physical page mapping cache' take #2, vcache-2.5.38-C4 Ingo Molnar 0 siblings, 1 reply; 23+ messages in thread From: Linus Torvalds @ 2002-09-27 17:48 UTC (permalink / raw) To: Ingo Molnar; +Cc: Andrew Morton, Rusty Russell, linux-kernel On Fri, 27 Sep 2002, Ingo Molnar wrote: > > pin_page() calls get_user_pages(), which might block in handle_mm_fault(). Ok, I admit that sucks. Maybe that damn writable down() is the right thing to do after all. And let's hope most futexes don't block. Linus ^ permalink raw reply [flat|nested] 23+ messages in thread
* [patch] 'virtual => physical page mapping cache' take #2, vcache-2.5.38-C4 2002-09-27 17:48 ` Linus Torvalds @ 2002-09-27 18:54 ` Ingo Molnar 2002-09-27 19:12 ` Ingo Molnar 2002-10-15 5:15 ` Rusty Russell 0 siblings, 2 replies; 23+ messages in thread From: Ingo Molnar @ 2002-09-27 18:54 UTC (permalink / raw) To: Linus Torvalds; +Cc: Andrew Morton, Rusty Russell, linux-kernel On Fri, 27 Sep 2002, Linus Torvalds wrote: > > pin_page() calls get_user_pages(), which might block in handle_mm_fault(). > > Ok, I admit that sucks. Maybe that damn writable down() is the right > thing to do after all. And let's hope most futexes don't block. i think my plan works too - see the attached patch. The method is to do a cheap physpage lookup (ie. only the read-lock is taken, the page is forced writable), but we insert it into the vcache and futex hash atomically: spin_lock(&vcache_lock); spin_lock(&futex_lock); spin_lock(¤t->mm->page_table_lock); /* * Has the mapping changed meanwhile? */ tmp = follow_page(current->mm, uaddr, 0); if (tmp == page) { q->page = page; list_add_tail(&q->list, head); /* * We register a futex callback to this virtual address, * to make sure a COW properly rehashes the futex-queue. */ __attach_vcache(&q->vcache, uaddr, current->mm, futex_vcache_callback); } else ret = 1; spin_unlock(¤t->mm->page_table_lock); spin_unlock(&futex_lock); spin_unlock(&vcache_lock); follow_page() is completely lock-less and it's lightweight. if we race then we release the page and re-do the pin_page call. (should be very rare.) This is conceptually similar to other race avoidance code in the VM layer. this method should also work fine if a futex wait is initiated *after* the COW - the mapping is not forced writable and we'll be properly notified once the COW happens. Ie. a nonintrusive futex-wait will not cause extra COW-copying if a fork()+exec() [or fork()+exit()] happens, where the FUTEX_WAIT is between the fork() and the exit(), and the FUTEX_WAKE is after the exit(). 
in fact this means that futex wakeups can be done across fork(), as long as the page is not COW-ed in. (glibc could perhaps make use of this somewhere.) so every scenario i can think of works fine in the common case and it should scale well. I had to reorganize the futex code significantly, but all the testcases pass now so it should be fine. the futex 'slow path' is in fact one of our major scheduling points for threaded apps that have userspace synchronization objects, so i think it's worth the effort trying to improve it. Ingo --- linux/include/linux/vcache.h.orig Fri Sep 27 15:05:33 2002 +++ linux/include/linux/vcache.h Fri Sep 27 20:33:30 2002 @@ -0,0 +1,26 @@ +/* + * virtual => physical mapping cache support. + */ +#ifndef _LINUX_VCACHE_H +#define _LINUX_VCACHE_H + +typedef struct vcache_s { + unsigned long address; + struct mm_struct *mm; + struct list_head hash_entry; + void (*callback)(struct vcache_s *data, struct page *new_page); +} vcache_t; + +extern spinlock_t vcache_lock; + +extern void __attach_vcache(vcache_t *vcache, + unsigned long address, + struct mm_struct *mm, + void (*callback)(struct vcache_s *data, struct page *new_page)); + +extern void detach_vcache(vcache_t *vcache); + +extern void invalidate_vcache(unsigned long address, struct mm_struct *mm, + struct page *new_page); + +#endif --- linux/include/linux/mm.h.orig Fri Sep 27 20:40:56 2002 +++ linux/include/linux/mm.h Fri Sep 27 20:41:06 2002 @@ -374,6 +374,7 @@ extern int make_pages_present(unsigned long addr, unsigned long end); extern int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, int write); +extern struct page * follow_page(struct mm_struct *mm, unsigned long address, int write); int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, unsigned long start, int len, int write, int force, struct page **pages, struct vm_area_struct **vmas); --- linux/include/linux/futex.h.orig Fri Sep 27 16:14:10 2002 +++ linux/include/linux/futex.h Fri 
Sep 27 16:14:16 2002 @@ -6,6 +6,6 @@ #define FUTEX_WAKE (1) #define FUTEX_FD (2) -extern asmlinkage int sys_futex(void *uaddr, int op, int val, struct timespec *utime); +extern asmlinkage int sys_futex(unsigned long uaddr, int op, int val, struct timespec *utime); #endif --- linux/kernel/futex.c.orig Fri Sep 20 17:20:21 2002 +++ linux/kernel/futex.c Fri Sep 27 20:43:08 2002 @@ -31,6 +31,7 @@ #include <linux/init.h> #include <linux/fs.h> #include <linux/futex.h> +#include <linux/vcache.h> #include <linux/highmem.h> #include <linux/time.h> #include <linux/pagemap.h> @@ -38,7 +39,6 @@ #include <linux/poll.h> #include <linux/file.h> #include <linux/dcache.h> -#include <asm/uaccess.h> /* Simple "sleep if unchanged" interface. */ @@ -55,9 +55,14 @@ struct futex_q { struct list_head list; wait_queue_head_t waiters; + /* Page struct and offset within it. */ struct page *page; unsigned int offset; + + /* the virtual => physical cache */ + vcache_t vcache; + /* For fd, sigio sent using these. */ int fd; struct file *filp; @@ -85,21 +90,43 @@ send_sigio(&q->filp->f_owner, q->fd, POLL_IN); } +/* Get kernel address of the user page and pin it. */ +static struct page *pin_page(unsigned long page_start) +{ + struct mm_struct *mm = current->mm; + struct page *page = NULL; + int err; + + down_read(&mm->mmap_sem); + + err = get_user_pages(current, mm, page_start, + 1 /* one page */, + 0 /* not writable */, + 0 /* don't force */, + &page, + NULL /* don't return vmas */); + up_read(&mm->mmap_sem); + if (err < 0) + return ERR_PTR(err); + return page; +} + static inline void unpin_page(struct page *page) { - /* Avoid releasing the page which is on the LRU list. I don't - know if this is correct, but it stops the BUG() in - __free_pages_ok(). 
*/ page_cache_release(page); } -static int futex_wake(struct list_head *head, - struct page *page, - unsigned int offset, - int num) +static int futex_wake(unsigned long uaddr, unsigned int offset, int num) { - struct list_head *i, *next; - int num_woken = 0; + struct list_head *i, *next, *head; + struct page *page; + int ret; + + page = pin_page(uaddr - offset); + ret = IS_ERR(page); + if (ret) + goto out; + head = hash_futex(page, offset); spin_lock(&futex_lock); list_for_each_safe(i, next, head) { @@ -108,36 +135,81 @@ if (this->page == page && this->offset == offset) { list_del_init(i); tell_waiter(this); - num_woken++; - if (num_woken >= num) break; + ret++; + if (ret >= num) + break; } } spin_unlock(&futex_lock); - return num_woken; + unpin_page(page); +out: + return ret; +} + +static void futex_vcache_callback(vcache_t *vcache, struct page *new_page) +{ + struct futex_q *q = container_of(vcache, struct futex_q, vcache); + struct list_head *head = hash_futex(new_page, q->offset); + + BUG_ON(list_empty(&q->list)); + + spin_lock(&futex_lock); + + q->page = new_page; + list_del_init(&q->list); + list_add_tail(&q->list, head); + + spin_unlock(&futex_lock); } /* Add at end to avoid starvation */ -static inline void queue_me(struct list_head *head, +static inline int queue_me(struct list_head *head, struct futex_q *q, struct page *page, unsigned int offset, int fd, - struct file *filp) + struct file *filp, + unsigned long uaddr) { - q->page = page; + struct page *tmp; + int ret = 0; + q->offset = offset; q->fd = fd; q->filp = filp; + spin_lock(&vcache_lock); spin_lock(&futex_lock); - list_add_tail(&q->list, head); + spin_lock(¤t->mm->page_table_lock); + + /* + * Has the mapping changed meanwhile? + */ + tmp = follow_page(current->mm, uaddr, 0); + + if (tmp == page) { + q->page = page; + list_add_tail(&q->list, head); + /* + * We register a futex callback to this virtual address, + * to make sure a COW properly rehashes the futex-queue. 
+ */ + __attach_vcache(&q->vcache, uaddr, current->mm, futex_vcache_callback); + } else + ret = 1; + + spin_unlock(¤t->mm->page_table_lock); spin_unlock(&futex_lock); + spin_unlock(&vcache_lock); + + return ret; } /* Return 1 if we were still queued (ie. 0 means we were woken) */ static inline int unqueue_me(struct futex_q *q) { int ret = 0; + spin_lock(&futex_lock); if (!list_empty(&q->list)) { list_del(&q->list); @@ -147,46 +219,34 @@ return ret; } -/* Get kernel address of the user page and pin it. */ -static struct page *pin_page(unsigned long page_start) -{ - struct mm_struct *mm = current->mm; - struct page *page; - int err; - - down_read(&mm->mmap_sem); - err = get_user_pages(current, mm, page_start, - 1 /* one page */, - 0 /* writable not important */, - 0 /* don't force */, - &page, - NULL /* don't return vmas */); - up_read(&mm->mmap_sem); - - if (err < 0) - return ERR_PTR(err); - return page; -} - -static int futex_wait(struct list_head *head, - struct page *page, +static int futex_wait(unsigned long uaddr, int offset, int val, - int *uaddr, unsigned long time) { - int curval; - struct futex_q q; DECLARE_WAITQUEUE(wait, current); - int ret = 0; + struct list_head *head; + int ret = 0, curval; + struct page *page; + struct futex_q q; + +repeat_lookup: + page = pin_page(uaddr - offset); + ret = IS_ERR(page); + if (ret) + goto out; + head = hash_futex(page, offset); set_current_state(TASK_INTERRUPTIBLE); init_waitqueue_head(&q.waiters); add_wait_queue(&q.waiters, &wait); - queue_me(head, &q, page, offset, -1, NULL); + if (queue_me(head, &q, page, offset, -1, NULL, uaddr)) { + unpin_page(page); + goto repeat_lookup; + } /* Page is pinned, but may no longer be in this address space. */ - if (get_user(curval, uaddr) != 0) { + if (get_user(curval, (int *)uaddr) != 0) { ret = -EFAULT; goto out; } @@ -204,11 +264,15 @@ ret = -EINTR; goto out; } - out: +out: + detach_vcache(&q.vcache); set_current_state(TASK_RUNNING); /* Were we woken up anyway? 
*/ if (!unqueue_me(&q)) - return 0; + ret = 0; + if (page) + unpin_page(page); + return ret; } @@ -251,25 +315,26 @@ /* Signal allows caller to avoid the race which would occur if they set the sigio stuff up afterwards. */ -static int futex_fd(struct list_head *head, - struct page *page, - int offset, - int signal) +static int futex_fd(unsigned long uaddr, int offset, int signal) { - int fd; + struct page *page = NULL; + struct list_head *head; struct futex_q *q; struct file *filp; + int ret; + ret = -EINVAL; if (signal < 0 || signal > _NSIG) - return -EINVAL; + goto out; - fd = get_unused_fd(); - if (fd < 0) - return fd; + ret = get_unused_fd(); + if (ret < 0) + goto out; filp = get_empty_filp(); if (!filp) { - put_unused_fd(fd); - return -ENFILE; + put_unused_fd(ret); + ret = -ENFILE; + goto out; } filp->f_op = &futex_fops; filp->f_vfsmnt = mntget(futex_mnt); @@ -280,37 +345,55 @@ ret = f_setown(filp, current->tgid, 1); if (ret) { - put_unused_fd(fd); + put_unused_fd(ret); put_filp(filp); - return ret; + goto out; } filp->f_owner.signum = signal; } q = kmalloc(sizeof(*q), GFP_KERNEL); if (!q) { - put_unused_fd(fd); + put_unused_fd(ret); + put_filp(filp); + ret = -ENOMEM; + goto out; + } + +repeat_lookup: + page = pin_page(uaddr - offset); + ret = IS_ERR(page); + if (ret) { + put_unused_fd(ret); put_filp(filp); - return -ENOMEM; + kfree(q); + page = NULL; + goto out; } + head = hash_futex(page, offset); /* Initialize queue structure, and add to hash table. 
*/ filp->private_data = q; init_waitqueue_head(&q->waiters); - queue_me(head, q, page, offset, fd, filp); + if (queue_me(head, q, page, offset, ret, filp, uaddr)) { + unpin_page(page); + goto repeat_lookup; + } /* Now we map fd to filp, so userspace can access it */ - fd_install(fd, filp); - return fd; + fd_install(ret, filp); + page = NULL; +out: + if (page) + unpin_page(page); + return ret; } -asmlinkage int sys_futex(void *uaddr, int op, int val, struct timespec *utime) +asmlinkage int sys_futex(unsigned long uaddr, int op, int val, struct timespec *utime) { - int ret; - unsigned long pos_in_page; - struct list_head *head; - struct page *page; unsigned long time = MAX_SCHEDULE_TIMEOUT; + unsigned long pos_in_page; + int ret; if (utime) { struct timespec t; @@ -319,38 +402,27 @@ time = timespec_to_jiffies(&t) + 1; } - pos_in_page = ((unsigned long)uaddr) % PAGE_SIZE; + pos_in_page = uaddr % PAGE_SIZE; /* Must be "naturally" aligned, and not on page boundary. */ if ((pos_in_page % __alignof__(int)) != 0 || pos_in_page + sizeof(int) > PAGE_SIZE) return -EINVAL; - /* Simpler if it doesn't vanish underneath us. */ - page = pin_page((unsigned long)uaddr - pos_in_page); - if (IS_ERR(page)) - return PTR_ERR(page); - - head = hash_futex(page, pos_in_page); switch (op) { case FUTEX_WAIT: - ret = futex_wait(head, page, pos_in_page, val, uaddr, time); + ret = futex_wait(uaddr, pos_in_page, val, time); break; case FUTEX_WAKE: - ret = futex_wake(head, page, pos_in_page, val); + ret = futex_wake(uaddr, pos_in_page, val); break; case FUTEX_FD: /* non-zero val means F_SETOWN(getpid()) & F_SETSIG(val) */ - ret = futex_fd(head, page, pos_in_page, val); - if (ret >= 0) - /* Leave page pinned (attached to fd). 
*/ - return ret; + ret = futex_fd(uaddr, pos_in_page, val); break; default: ret = -EINVAL; } - unpin_page(page); - return ret; } --- linux/kernel/fork.c.orig Fri Sep 27 16:20:01 2002 +++ linux/kernel/fork.c Fri Sep 27 16:20:22 2002 @@ -381,7 +381,7 @@ * not set up a proper pointer then tough luck. */ put_user(0, tsk->user_tid); - sys_futex(tsk->user_tid, FUTEX_WAKE, 1, NULL); + sys_futex((unsigned long)tsk->user_tid, FUTEX_WAKE, 1, NULL); } } --- linux/mm/memory.c.orig Fri Sep 20 17:20:25 2002 +++ linux/mm/memory.c Fri Sep 27 20:40:52 2002 @@ -43,6 +43,7 @@ #include <linux/iobuf.h> #include <linux/highmem.h> #include <linux/pagemap.h> +#include <linux/vcache.h> #include <asm/pgalloc.h> #include <asm/rmap.h> @@ -463,7 +464,7 @@ * Do a quick page-table lookup for a single page. * mm->page_table_lock must be held. */ -static inline struct page * +struct page * follow_page(struct mm_struct *mm, unsigned long address, int write) { pgd_t *pgd; @@ -494,7 +495,7 @@ } out: - return 0; + return NULL; } /* @@ -973,6 +974,7 @@ static inline void break_cow(struct vm_area_struct * vma, struct page * new_page, unsigned long address, pte_t *page_table) { + invalidate_vcache(address, vma->vm_mm, new_page); flush_page_to_ram(new_page); flush_cache_page(vma, address); establish_pte(vma, address, page_table, pte_mkwrite(pte_mkdirty(mk_pte(new_page, vma->vm_page_prot)))); --- linux/mm/vcache.c.orig Fri Sep 27 14:58:10 2002 +++ linux/mm/vcache.c Fri Sep 27 20:33:22 2002 @@ -0,0 +1,93 @@ +/* + * linux/mm/vcache.c + * + * virtual => physical page mapping cache. Users of this mechanism + * register callbacks for a given (virt,mm,phys) page mapping, and + * the kernel guarantees to call back when this mapping is invalidated. + * (ie. upon COW or unmap.) 
+ * + * Started by Ingo Molnar, Copyright (C) 2002 + */ + +#include <linux/mm.h> +#include <linux/init.h> +#include <linux/hash.h> +#include <linux/vcache.h> + +#define VCACHE_HASHBITS 8 +#define VCACHE_HASHSIZE (1 << VCACHE_HASHBITS) + +spinlock_t vcache_lock = SPIN_LOCK_UNLOCKED; + +static struct list_head hash[VCACHE_HASHSIZE]; + +static struct list_head *hash_vcache(unsigned long address, + struct mm_struct *mm) +{ + return &hash[hash_long(address + (unsigned long)mm, VCACHE_HASHBITS)]; +} + +void __attach_vcache(vcache_t *vcache, + unsigned long address, + struct mm_struct *mm, + void (*callback)(struct vcache_s *data, struct page *new)) +{ + struct list_head *hash_head; + + address &= PAGE_MASK; + vcache->address = address; + vcache->mm = mm; + vcache->callback = callback; + + hash_head = hash_vcache(address, mm); + + list_add(&vcache->hash_entry, hash_head); +} + +void detach_vcache(vcache_t *vcache) +{ + spin_lock(&vcache_lock); + list_del(&vcache->hash_entry); + spin_unlock(&vcache_lock); +} + +void invalidate_vcache(unsigned long address, struct mm_struct *mm, + struct page *new_page) +{ + struct list_head *l, *hash_head; + vcache_t *vcache; + + address &= PAGE_MASK; + + hash_head = hash_vcache(address, mm); + /* + * This is safe, because this path is called with the mm + * semaphore read-held, and the add/remove path calls with the + * mm semaphore write-held. So while other mm's might add new + * entries in parallel, and *this* mm is locked out, so if the + * list is empty now then we do not have to take the vcache + * lock to see it's really empty. 
+ */ + if (likely(list_empty(hash_head))) + return; + + spin_lock(&vcache_lock); + list_for_each(l, hash_head) { + vcache = list_entry(l, vcache_t, hash_entry); + if (vcache->address != address || vcache->mm != mm) + continue; + vcache->callback(vcache, new_page); + } + spin_unlock(&vcache_lock); +} + +static int __init vcache_init(void) +{ + unsigned int i; + + for (i = 0; i < VCACHE_HASHSIZE; i++) + INIT_LIST_HEAD(hash + i); + return 0; +} +__initcall(vcache_init); + --- linux/mm/Makefile.orig Fri Sep 27 15:03:25 2002 +++ linux/mm/Makefile Fri Sep 27 15:03:31 2002 @@ -9,6 +9,6 @@ vmalloc.o slab.o bootmem.o swap.o vmscan.o page_io.o \ page_alloc.o swap_state.o swapfile.o numa.o oom_kill.o \ shmem.o highmem.o mempool.o msync.o mincore.o readahead.o \ - pdflush.o page-writeback.o rmap.o madvise.o + pdflush.o page-writeback.o rmap.o madvise.o vcache.o include $(TOPDIR)/Rules.make ^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [patch] 'virtual => physical page mapping cache' take #2, vcache-2.5.38-C4 2002-09-27 18:54 ` [patch] 'virtual => physical page mapping cache' take #2, vcache-2.5.38-C4 Ingo Molnar @ 2002-09-27 19:12 ` Ingo Molnar 2002-10-15 5:15 ` Rusty Russell 1 sibling, 0 replies; 23+ messages in thread From: Ingo Molnar @ 2002-09-27 19:12 UTC (permalink / raw) To: Linus Torvalds; +Cc: Andrew Morton, Rusty Russell, linux-kernel > [...] The method is to do a cheap physpage lookup (ie. only the > read-lock is taken, the page is forced writable), [...] ^---not ^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [patch] 'virtual => physical page mapping cache' take #2, vcache-2.5.38-C4 2002-09-27 18:54 ` [patch] 'virtual => physical page mapping cache' take #2, vcache-2.5.38-C4 Ingo Molnar 2002-09-27 19:12 ` Ingo Molnar @ 2002-10-15 5:15 ` Rusty Russell 1 sibling, 0 replies; 23+ messages in thread From: Rusty Russell @ 2002-10-15 5:15 UTC (permalink / raw) To: Ingo Molnar; +Cc: Linus Torvalds, Andrew Morton, linux-kernel In message <Pine.LNX.4.44.0209272043260.17678-100000@localhost.localdomain> you write: > + page = pin_page(uaddr - offset); > + ret = IS_ERR(page); > + if (ret) > + goto out; > + head = hash_futex(page, offset); Um, you mean: if (IS_ERR(page)) { ret = PTR_ERR(page); goto out; } Cheers, Rusty. -- Anyone who quotes me in their sig is an idiot. -- Rusty Russell. ^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [patch] 'virtual => physical page mapping cache', vcache-2.5.38-B8 2002-09-27 17:33 ` Ingo Molnar 2002-09-27 17:32 ` Linus Torvalds @ 2002-09-27 17:44 ` Ingo Molnar 2002-09-27 17:42 ` Chris Wedgwood 1 sibling, 1 reply; 23+ messages in thread From: Ingo Molnar @ 2002-09-27 17:44 UTC (permalink / raw) To: Linus Torvalds; +Cc: Andrew Morton, Rusty Russell, linux-kernel and i'm not quite sure how other users of get_user_pages() (direct-IO) handle these kinds of COW races. A COW can invalidate a physical page anytime, so the DMA might go to the fork()ed child process, creating very unexpected results. We are protected against kswapd via the elevated page count, but are not protected against COW. Ingo ^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [patch] 'virtual => physical page mapping cache', vcache-2.5.38-B8 2002-09-27 17:44 ` [patch] 'virtual => physical page mapping cache', vcache-2.5.38-B8 Ingo Molnar @ 2002-09-27 17:42 ` Chris Wedgwood 2002-09-27 17:53 ` Ingo Molnar 0 siblings, 1 reply; 23+ messages in thread From: Chris Wedgwood @ 2002-09-27 17:42 UTC (permalink / raw) To: Ingo Molnar; +Cc: Linus Torvalds, Andrew Morton, Rusty Russell, linux-kernel On Fri, Sep 27, 2002 at 07:44:08PM +0200, Ingo Molnar wrote: > and i'm not quite sure how other users of get_user_pages() > (direct-IO) handle these kinds of COW races. O_DIRECT is allowed to break if someone does something silly :) I think XFS tried hard to make sure things are consistent, but e2fs (at least at one point) allowed you to read data from a file O_DIRECT which was very different to what someone else read via the page-cache. --cw ^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [patch] 'virtual => physical page mapping cache', vcache-2.5.38-B8 2002-09-27 17:42 ` Chris Wedgwood @ 2002-09-27 17:53 ` Ingo Molnar 2002-09-27 17:59 ` Chris Wedgwood 2002-09-27 18:01 ` Alan Cox 0 siblings, 2 replies; 23+ messages in thread From: Ingo Molnar @ 2002-09-27 17:53 UTC (permalink / raw) To: Chris Wedgwood; +Cc: Linus Torvalds, Andrew Morton, Rusty Russell, linux-kernel On Fri, 27 Sep 2002, Chris Wedgwood wrote: > O_DIRECT is allow to break if someone does something silly :) [...] to DMA into a page that does not belong to the process anymore? I doubt that. Ingo ^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [patch] 'virtual => physical page mapping cache', vcache-2.5.38-B8 2002-09-27 17:53 ` Ingo Molnar @ 2002-09-27 17:59 ` Chris Wedgwood 2002-09-27 18:01 ` Alan Cox 1 sibling, 0 replies; 23+ messages in thread From: Chris Wedgwood @ 2002-09-27 17:59 UTC (permalink / raw) To: Ingo Molnar; +Cc: Linus Torvalds, Andrew Morton, Rusty Russell, linux-kernel On Fri, Sep 27, 2002 at 07:53:23PM +0200, Ingo Molnar wrote: > to DMA into a page that does not belong to the process anymore? I > doubt that. ah, ok ... sure, that isn't (shouldn't be) allowed i was thinking of reading/writing to/from data during COW --cw ^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [patch] 'virtual => physical page mapping cache', vcache-2.5.38-B8 2002-09-27 17:53 ` Ingo Molnar 2002-09-27 17:59 ` Chris Wedgwood @ 2002-09-27 18:01 ` Alan Cox 2002-09-27 18:35 ` [patch] 'virtual => physical page mapping cache',vcache-2.5.38-B8 Andrew Morton 1 sibling, 1 reply; 23+ messages in thread From: Alan Cox @ 2002-09-27 18:01 UTC (permalink / raw) To: Ingo Molnar Cc: Chris Wedgwood, Linus Torvalds, Andrew Morton, Rusty Russell, linux-kernel On Fri, 2002-09-27 at 18:53, Ingo Molnar wrote: > > On Fri, 27 Sep 2002, Chris Wedgwood wrote: > > > O_DIRECT is allow to break if someone does something silly :) [...] > > to DMA into a page that does not belong to the process anymore? I doubt > that. Try doing a truncate on a file as an O_DIRECT write is occurring. Scribbling into pages you don't own is the least of your problems. ^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [patch] 'virtual => physical page mapping cache',vcache-2.5.38-B8 2002-09-27 18:01 ` Alan Cox @ 2002-09-27 18:35 ` Andrew Morton 2002-09-27 19:16 ` Alan Cox 0 siblings, 1 reply; 23+ messages in thread From: Andrew Morton @ 2002-09-27 18:35 UTC (permalink / raw) To: Alan Cox Cc: Ingo Molnar, Chris Wedgwood, Linus Torvalds, Rusty Russell, linux-kernel Alan Cox wrote: > > On Fri, 2002-09-27 at 18:53, Ingo Molnar wrote: > > > > On Fri, 27 Sep 2002, Chris Wedgwood wrote: > > > > > O_DIRECT is allow to break if someone does something silly :) [...] > > > > to DMA into a page that does not belong to the process anymore? I doubt > > that. > > Try doing a truncate on a file as an O_DIRECT write is occuring. > Scribbling into pages you dont own is the least of your problems. O_DIRECT writes operate under i_sem, which provides exclusion from truncate. Do you know something which I don't?? ^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [patch] 'virtual => physical page mapping cache',vcache-2.5.38-B8 2002-09-27 18:35 ` [patch] 'virtual => physical page mapping cache',vcache-2.5.38-B8 Andrew Morton @ 2002-09-27 19:16 ` Alan Cox 2002-09-27 19:25 ` [patch] 'virtual => physical page mappingcache',vcache-2.5.38-B8 Andrew Morton 0 siblings, 1 reply; 23+ messages in thread From: Alan Cox @ 2002-09-27 19:16 UTC (permalink / raw) To: Andrew Morton Cc: Ingo Molnar, Chris Wedgwood, Linus Torvalds, Rusty Russell, linux-kernel On Fri, 2002-09-27 at 19:35, Andrew Morton wrote: > O_DIRECT writes operate under i_sem, which provides exclusion > from trucate. Do you know something which I don't?? So it does, hidden away in generic_file_write rather than the lower layers. Interesting. So now I have a new question to resolve, which is why doing O_DIRECT and truncate together corrupted my disk when I tried it trying to break stuff ^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [patch] 'virtual => physical page mappingcache',vcache-2.5.38-B8 2002-09-27 19:16 ` Alan Cox @ 2002-09-27 19:25 ` Andrew Morton 2002-09-27 20:02 ` Alan Cox 0 siblings, 1 reply; 23+ messages in thread From: Andrew Morton @ 2002-09-27 19:25 UTC (permalink / raw) To: Alan Cox; +Cc: linux-kernel Alan Cox wrote: > > On Fri, 2002-09-27 at 19:35, Andrew Morton wrote: > > O_DIRECT writes operate under i_sem, which provides exclusion > > from trucate. Do you know something which I don't?? > > So it does, hidden away in generic_file_write rather than the lower > layers. Well that's sort of the same level at which truncate takes it. It's a pretty big lock. > Interesting. So now I have a new question to resolve, which is > why doing O_DIRECT and truncate together corrupted my disk when I tried > it trying to break stuff Interesting indeed. Possibly invalidate_inode_pages2() accidentally left some dirty buffers detached from the mapping. Hard to see how it could do that. What kernel were you testing? ^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [patch] 'virtual => physical page mappingcache',vcache-2.5.38-B8 2002-09-27 19:25 ` [patch] 'virtual => physical page mappingcache',vcache-2.5.38-B8 Andrew Morton @ 2002-09-27 20:02 ` Alan Cox 0 siblings, 0 replies; 23+ messages in thread From: Alan Cox @ 2002-09-27 20:02 UTC (permalink / raw) To: Andrew Morton; +Cc: linux-kernel > > Interesting. So now I have a new question to resolve, which is > > why doing O_DIRECT and truncate together corrupted my disk when I tried > > it trying to break stuff > > Interesting indeed. Possibly invalidate_inode_pages2() accidentally > left some dirty buffers detached from the mapping. Hard to see how > it could do that. > > What kernel were you testing? 2.4.20pre5-ac with new IDE. So there are several other candidates 8) ^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [patch] 'virtual => physical page mapping cache', vcache-2.5.38-B8 2002-09-27 16:26 ` Linus Torvalds 2002-09-27 16:42 ` Ingo Molnar @ 2002-09-28 10:22 ` Arjan van de Ven 1 sibling, 0 replies; 23+ messages in thread From: Arjan van de Ven @ 2002-09-28 10:22 UTC (permalink / raw) To: Linus Torvalds; +Cc: Ingo Molnar, Andrew Morton, Rusty Russell, linux-kernel [-- Attachment #1: Type: text/plain, Size: 673 bytes --] On Fri, 2002-09-27 at 18:26, Linus Torvalds wrote: > > On Fri, 27 Sep 2002, Ingo Molnar wrote: > > > > the attached patch implements the virtual => physical cache. Right now > > only the COW code calls the invalidation function, because futexes do not > > need notification on unmap. > > Ok, looks good. Except you make get_user_page() do a write fault on the > page, and one of the points of this approach was that that shouldn't even > be needed. Or did I miss some case that does need it? get_user_page() cannot/should not ever do a pagefault via the pagefault code otherwise the coredump code will take the mmap semaphore recursively and deadlock. [-- Attachment #2: This is a digitally signed message part --] [-- Type: application/pgp-signature, Size: 189 bytes --] ^ permalink raw reply [flat|nested] 23+ messages in thread
end of thread, other threads:[~2002-10-15 5:21 UTC | newest] Thread overview: 23+ messages (download: mbox.gz / follow: Atom feed) -- links below jump to the message on this page -- 2002-09-27 16:24 [patch] 'virtual => physical page mapping cache', vcache-2.5.38-B8 Ingo Molnar 2002-09-27 16:26 ` Linus Torvalds 2002-09-27 16:42 ` Ingo Molnar 2002-09-27 16:47 ` Linus Torvalds 2002-09-27 17:01 ` Ingo Molnar 2002-09-27 17:05 ` Linus Torvalds 2002-09-27 17:33 ` Ingo Molnar 2002-09-27 17:32 ` Linus Torvalds 2002-09-27 17:42 ` Ingo Molnar 2002-09-27 17:48 ` Linus Torvalds 2002-09-27 18:54 ` [patch] 'virtual => physical page mapping cache' take #2, vcache-2.5.38-C4 Ingo Molnar 2002-09-27 19:12 ` Ingo Molnar 2002-10-15 5:15 ` Rusty Russell 2002-09-27 17:44 ` [patch] 'virtual => physical page mapping cache', vcache-2.5.38-B8 Ingo Molnar 2002-09-27 17:42 ` Chris Wedgwood 2002-09-27 17:53 ` Ingo Molnar 2002-09-27 17:59 ` Chris Wedgwood 2002-09-27 18:01 ` Alan Cox 2002-09-27 18:35 ` [patch] 'virtual => physical page mapping cache',vcache-2.5.38-B8 Andrew Morton 2002-09-27 19:16 ` Alan Cox 2002-09-27 19:25 ` [patch] 'virtual => physical page mappingcache',vcache-2.5.38-B8 Andrew Morton 2002-09-27 20:02 ` Alan Cox 2002-09-28 10:22 ` [patch] 'virtual => physical page mapping cache', vcache-2.5.38-B8 Arjan van de Ven
This is a public inbox, see mirroring instructions for how to clone and mirror all data and code used for this inbox; as well as URLs for NNTP newsgroup(s).