linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH] Futex waiters take an mm or inode reference
@ 2003-09-30  7:42 Jamie Lokier
  2003-09-30  8:22 ` [PATCH] Bumbling follow-on to previous "perfect" futex patch Jamie Lokier
  0 siblings, 1 reply; 2+ messages in thread
From: Jamie Lokier @ 2003-09-30  7:42 UTC (permalink / raw)
  To: torvalds
  Cc: akpm, Hu, Boris, linux-kernel, Hugh Dickins, Ulrich Drepper,
	Jamie Lokier

Patch: futex_refs-2.6.0-test6

This patch fixes an important hole in 2.6.0-test6 futexes.  At the
moment, a program can use FUTEX_FD to create futexes on mm's or inodes
which are recycled, and thereby steal wakeups from unrelated tasks.

Rusty Russell wrote:
> But why not solve the problem by just holding an mm reference, too?

Exactly.  With this patch each futex hold a reference to the mm or inode
that it is keyed on.

Concerning FUTEX_REQUEUE:
> Why not make the code a *whole* lot more readable (and only marginally
> slower, if at all) by doing it in two passes: pull them off onto a
> (on-stack) list in one pass, then requeue them all in another.

Rusty will be glad that I have reimplemented futex_requeue() exactly as
suggested: in two passes.  This also removes that unpleasant double-lock
for the two hash buckets: now only one at a time is locked.

The set_current_state() logic in FUTEX_WAIT has been simplified too.

Linus, please apply.

Thanks all,
-- Jamie


diff -urN --exclude-from=dontdiff orig-2.6.0-test6/kernel/futex.c dual-2.6.0-test6/kernel/futex.c
--- orig-2.6.0-test6/kernel/futex.c	2003-09-30 05:41:14.000000000 +0100
+++ dual-2.6.0-test6/kernel/futex.c	2003-09-30 06:43:17.000000000 +0100
@@ -45,6 +45,9 @@
  * Futexes are matched on equal values of this key.
  * The key type depends on whether it's a shared or private mapping.
  * Don't rearrange members without looking at hash_futex().
+ *
+ * offset is aligned to a multiple of sizeof(u32) (== 4) by definition.
+ * We set bit 0 to indicate if it's an inode-based key.
  */
 union futex_key {
 	struct {
@@ -172,9 +175,10 @@
 	}
 
 	/*
-	 * Linear mappings are also simple.
+	 * Linear file mappings are also simple.
 	 */
 	key->shared.inode = vma->vm_file->f_dentry->d_inode;
+	key->both.offset++; /* Bit 0 of offset indicates inode-based key. */
 	if (likely(!(vma->vm_flags & VM_NONLINEAR))) {
 		key->shared.pgoff = (((uaddr - vma->vm_start) >> PAGE_SHIFT)
 				     + vma->vm_pgoff);
@@ -214,16 +218,55 @@
 	return err;
 }
 
+/*
+ * Take a reference to the resource addressed by a key.
+ *
+ * NOTE: mmap_sem MUST be held between get_futex_key() and calling this
+ * function, if it is called at all.  mmap_sem keeps key->shared.inode valid.
+ */
+static inline void get_key_refs(union futex_key *key)
+{
+	if (key->both.ptr != 0) {
+		if (key->both.offset & 1)
+			atomic_inc(&key->shared.inode->i_count);
+		else
+			atomic_inc(&key->private.mm->mm_count);
+	}
+}
+
+/*
+ * Drop a reference to the resource addressed by a key.
+ * The hash bucket lock must not be held.
+ */
+static inline void drop_key_refs(union futex_key *key)
+{
+	if (key->both.ptr != 0) {
+		if (key->both.offset & 1)
+			iput(key->shared.inode);
+		else
+			mmdrop(key->private.mm);
+	}
+}
+
+/* The hash bucket lock must be held when this is called. */
+static inline void wake_futex(struct futex_q *q)
+{
+	list_del_init(&q->list);
+	wake_up_all(&q->waiters);
+	if (q->filp)
+		send_sigio(&q->filp->f_owner, q->fd, POLL_IN);
+}
 
 /*
  * Wake up all waiters hashed on the physical page that is mapped
  * to this virtual address:
  */
-static int futex_wake(unsigned long uaddr, int num)
+static int futex_wake(unsigned long uaddr, int nr_wake)
 {
-	struct list_head *i, *next, *head;
-	struct futex_hash_bucket *bh;
 	union futex_key key;
+	struct futex_hash_bucket *bh;
+	struct list_head *head;
+	struct futex_q *this, *next;
 	int ret;
 
 	down_read(&current->mm->mmap_sem);
@@ -236,21 +279,15 @@
 	spin_lock(&bh->lock);
 	head = &bh->chain;
 
-	list_for_each_safe(i, next, head) {
-		struct futex_q *this = list_entry(i, struct futex_q, list);
-
+	list_for_each_entry_safe(this, next, head, list) {
 		if (match_futex (&this->key, &key)) {
-			list_del_init(i);
-			wake_up_all(&this->waiters);
-			if (this->filp)
-				send_sigio(&this->filp->f_owner, this->fd, POLL_IN);
-			ret++;
-			if (ret >= num)
+			wake_futex(this);
+			if (++ret >= nr_wake)
 				break;
 		}
 	}
-	spin_unlock(&bh->lock);
 
+	spin_unlock(&bh->lock);
 out:
 	up_read(&current->mm->mmap_sem);
 	return ret;
@@ -263,9 +300,10 @@
 static int futex_requeue(unsigned long uaddr1, unsigned long uaddr2,
 				int nr_wake, int nr_requeue)
 {
-	struct list_head *i, *next, *head1, *head2;
-	struct futex_hash_bucket *bh1, *bh2;
 	union futex_key key1, key2;
+	struct futex_hash_bucket *bh;
+	struct list_head *head, moved;
+	struct futex_q *this, *next;
 	int ret;
 
 	down_read(&current->mm->mmap_sem);
@@ -277,65 +315,65 @@
 	if (unlikely(ret != 0))
 		goto out;
 
-	bh1 = hash_futex(&key1);
-	bh2 = hash_futex(&key2);
-	if (bh1 < bh2) {
-		spin_lock(&bh1->lock);
-		spin_lock(&bh2->lock);
-	} else {
-		spin_lock(&bh2->lock);
-		if (bh1 > bh2)
-			spin_lock(&bh1->lock);
-	}
-	head1 = &bh1->chain;
-	head2 = &bh2->chain;
-
-	list_for_each_safe(i, next, head1) {
-		struct futex_q *this = list_entry(i, struct futex_q, list);
+	bh = hash_futex(&key1);
+	spin_lock(&bh->lock);
+	head = &bh->chain;
 
+	INIT_LIST_HEAD(&moved);
+	list_for_each_entry_safe(this, next, head, list) {
 		if (match_futex (&this->key, &key1)) {
-			list_del_init(i);
 			if (++ret <= nr_wake) {
-				wake_up_all(&this->waiters);
-				if (this->filp)
-					send_sigio(&this->filp->f_owner,
-							this->fd, POLL_IN);
+				wake_futex(this);
 			} else {
-				list_add_tail(i, head2);
-				this->key = key2;
+				list_move_tail(&this->list, &moved);
 				if (ret - nr_wake >= nr_requeue)
 					break;
-				/* Make sure to stop if key1 == key2 */
-				if (head1 == head2 && head1 != next)
-					head1 = i;
 			}
 		}
 	}
-	if (bh1 < bh2) {
-		spin_unlock(&bh2->lock);
-		spin_unlock(&bh1->lock);
-	} else {
-		if (bh1 > bh2)
-			spin_unlock(&bh1->lock);
-		spin_unlock(&bh2->lock);
+
+	spin_unlock(&bh->lock);
+
+	if (!list_empty(&moved)) {
+		bh = hash_futex(&key2);
+		head = &bh->chain;
+		list_for_each_entry_safe(this, next, &moved, list) {
+			/* Don't hold the spinlock during drop_key_refs(). */
+			drop_key_refs(&this->key);
+			this->key = key2;
+			get_key_refs(&this->key);
+
+			spin_lock(&bh->lock);
+			list_add_tail(&this->list, head);
+			spin_unlock(&bh->lock);
+		}
 	}
+
 out:
 	up_read(&current->mm->mmap_sem);
 	return ret;
 }
 
-static inline void queue_me(struct futex_q *q, union futex_key *key,
-			    int fd, struct file *filp)
+/*
+ * queue_me and unqueue_me must be called as a pair, each
+ * exactly once.  They are called with the hashed spinlock held.
+ */
+
+/* The key must be already stored in q->key. */
+static inline void queue_me(struct futex_q *q, int fd, struct file *filp)
 {
-	struct futex_hash_bucket *bh = hash_futex(key);
-	struct list_head *head = &bh->chain;
+	struct futex_hash_bucket *bh;
 
-	q->key = *key;
 	q->fd = fd;
 	q->filp = filp;
 
+	init_waitqueue_head(&q->waiters);
+
+	get_key_refs(&q->key);
+	bh = hash_futex(&q->key);
+
 	spin_lock(&bh->lock);
-	list_add_tail(&q->list, head);
+	list_add_tail(&q->list, &bh->chain);
 	spin_unlock(&bh->lock);
 }
 
@@ -351,6 +389,8 @@
 		ret = 1;
 	}
 	spin_unlock(&bh->lock);
+
+	drop_key_refs(&q->key);
 	return ret;
 }
 
@@ -358,19 +398,16 @@
 {
 	DECLARE_WAITQUEUE(wait, current);
 	int ret, curval;
-	union futex_key key;
 	struct futex_q q;
-	struct futex_hash_bucket *bh = NULL;
-
-	init_waitqueue_head(&q.waiters);
+	struct futex_hash_bucket *bh;
 
 	down_read(&current->mm->mmap_sem);
 
-	ret = get_futex_key(uaddr, &key);
+	ret = get_futex_key(uaddr, &q.key);
 	if (unlikely(ret != 0))
 		goto out_release_sem;
 
-	queue_me(&q, &key, -1, NULL);
+	queue_me(&q, -1, NULL);
 
 	/*
 	 * Access the page after the futex is queued.
@@ -401,22 +438,18 @@
 	 * wakes us up.
 	 */
 	add_wait_queue(&q.waiters, &wait);
-	bh = hash_futex(&key);
+	bh = hash_futex(&q.key);
 	spin_lock(&bh->lock);
-	set_current_state(TASK_INTERRUPTIBLE);
 
 	if (unlikely(list_empty(&q.list))) {
-		/*
-		 * We were woken already.
-		 */
+		/* We were woken already. */
 		spin_unlock(&bh->lock);
-		set_current_state(TASK_RUNNING);
 		return 0;
 	}
 
+	__set_current_state(TASK_INTERRUPTIBLE);
 	spin_unlock(&bh->lock);
 	time = schedule_timeout(time);
-	set_current_state(TASK_RUNNING);
 
 	/*
 	 * NOTE: we don't remove ourselves from the waitqueue because
@@ -446,7 +479,7 @@
 	struct futex_q *q = filp->private_data;
 
 	unqueue_me(q);
-	kfree(filp->private_data);
+	kfree(q);
 	return 0;
 }
 
@@ -477,7 +510,6 @@
 static int futex_fd(unsigned long uaddr, int signal)
 {
 	struct futex_q *q;
-	union futex_key key;
 	struct file *filp;
 	int ret, err;
 
@@ -519,20 +551,22 @@
 	}
 
 	down_read(&current->mm->mmap_sem);
-	err = get_futex_key(uaddr, &key);
-	up_read(&current->mm->mmap_sem);
+	err = get_futex_key(uaddr, &q->key);
 
 	if (unlikely(err != 0)) {
+		up_read(&current->mm->mmap_sem);
 		put_unused_fd(ret);
 		put_filp(filp);
 		kfree(q);
 		return err;
 	}
 
-	init_waitqueue_head(&q->waiters);
+	/* queue_me() must be called before releasing mmap_sem, because
+	   key->shared.inode needs to be referenced while holding it. */
 	filp->private_data = q;
 
-	queue_me(q, &key, ret, filp);
+	queue_me(q, ret, filp);
+	up_read(&current->mm->mmap_sem);
 
 	/* Now we map fd to filp, so userspace can access it */
 	fd_install(ret, filp);

^ permalink raw reply	[flat|nested] 2+ messages in thread

* [PATCH] Bumbling follow-on to previous "perfect" futex patch
  2003-09-30  7:42 [PATCH] Futex waiters take an mm or inode reference Jamie Lokier
@ 2003-09-30  8:22 ` Jamie Lokier
  0 siblings, 0 replies; 2+ messages in thread
From: Jamie Lokier @ 2003-09-30  8:22 UTC (permalink / raw)
  To: torvalds; +Cc: akpm, Hu, Boris, linux-kernel, Hugh Dickins, Ulrich Drepper

Patch: futex_refs2-2.6.0-test6

Please apply this on top of the previous patch, titled "Futex waiters
take an mm or inode reference".

Thanks,
-- Jamie


--- dual-2.6.0-test6/kernel/futex.c.before	2003-09-30 06:43:17.000000000 +0100
+++ dual-2.6.0-test6/kernel/futex.c	2003-09-30 09:12:13.457796502 +0100
@@ -444,6 +444,8 @@
 	if (unlikely(list_empty(&q.list))) {
 		/* We were woken already. */
 		spin_unlock(&bh->lock);
+		/* Equivalent to calling unqueue_me() here, but faster. */
+		drop_key_refs(&q.key);
 		return 0;
 	}
 

^ permalink raw reply	[flat|nested] 2+ messages in thread

end of thread, other threads:[~2003-09-30  8:23 UTC | newest]

Thread overview: 2+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2003-09-30  7:42 [PATCH] Futex waiters take an mm or inode reference Jamie Lokier
2003-09-30  8:22 ` [PATCH] Bumbling follow-on to previous "perfect" futex patch Jamie Lokier

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).