All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH] [patch 4a/4] ipc: sem optimise simple operations
@ 2009-08-14 19:16 Manfred Spraul
  2009-08-15  4:52 ` Nick Piggin
  0 siblings, 1 reply; 13+ messages in thread
From: Manfred Spraul @ 2009-08-14 19:16 UTC (permalink / raw)
  To: npiggin; +Cc: Andrew Morton, Nadia Derbey, Pierre Peiffer, linux-kernel

An alternative patch to Nick's proposal:
http://lkml.org/lkml/2009/8/11/11
The patch is based on Nick's patch.

The patch is on top of the first three patches from Nick.

Identical with Nick's patch:
- per semaphore list to optimize single sop semop calls:
This avoids that the semaphore operations have to scan through all waiting
tasks, even for the tasks that are waiting on a different semaphore from
the same array.

Differences:
- same code for per-semaphore and global list.
- one list for both wait-for-zero and decrement instead of two lists.
- unlink_queue helper function

Open points:
- further testing
- understand all changes. e.g. why switch from "if (alter)" to
	"if (alter && !error)" in update_queue

From the diffstat, it's simpler: 50 new lines instead of 110 new lines.

Nick: What do you think? Could you try your test app?

Signed-off-by: Manfred Spraul <manfred@colorfullife.com>
---
 include/linux/sem.h |    5 ++-
 ipc/sem.c           |   83 ++++++++++++++++++++++++++++++++++++++++-----------
 2 files changed, 69 insertions(+), 19 deletions(-)

diff --git a/include/linux/sem.h b/include/linux/sem.h
index 1b191c1..ea03058 100644
--- a/include/linux/sem.h
+++ b/include/linux/sem.h
@@ -86,6 +86,7 @@ struct task_struct;
 struct sem {
 	int	semval;		/* current value */
 	int	sempid;		/* pid of last operation */
+	struct list_head	sem_pending;	/* pending operations to be processed */
 };
 
 /* One sem_array data structure for each set of semaphores in the system. */
@@ -96,11 +97,13 @@ struct sem_array {
 	struct sem		*sem_base;	/* ptr to first semaphore in array */
 	struct list_head	sem_pending;	/* pending operations to be processed */
 	struct list_head	list_id;	/* undo requests on this array */
-	unsigned long		sem_nsems;	/* no. of semaphores in array */
+	int			sem_nsems;	/* no. of semaphores in array */
+	int			complex_count;	/* pending complex operations */
 };
 
 /* One queue for each sleeping process in the system. */
 struct sem_queue {
+	struct list_head	simple_list; /* queue of pending operations */
 	struct list_head	list;	 /* queue of pending operations */
 	struct task_struct	*sleeper; /* this process */
 	struct sem_undo		*undo;	 /* undo structure */
diff --git a/ipc/sem.c b/ipc/sem.c
index 3629ef8..495449e 100644
--- a/ipc/sem.c
+++ b/ipc/sem.c
@@ -240,6 +240,7 @@ static int newary(struct ipc_namespace *ns, struct ipc_params *params)
 	key_t key = params->key;
 	int nsems = params->u.nsems;
 	int semflg = params->flg;
+	int i;
 
 	if (!nsems)
 		return -EINVAL;
@@ -272,6 +273,14 @@ static int newary(struct ipc_namespace *ns, struct ipc_params *params)
 	ns->used_sems += nsems;
 
 	sma->sem_base = (struct sem *) &sma[1];
+
+	for (i = 0; i < nsems; i++) {
+		struct sem *s = &sma->sem_base[i];
+
+		INIT_LIST_HEAD(&s->sem_pending);
+	}
+
+	sma->complex_count = 0;
 	INIT_LIST_HEAD(&sma->sem_pending);
 	INIT_LIST_HEAD(&sma->list_id);
 	sma->sem_nsems = nsems;
@@ -335,7 +344,7 @@ SYSCALL_DEFINE3(semget, key_t, key, int, nsems, int, semflg)
  * all at once. Return 0 if yes, 1 if need to sleep, else return error code.
  */
 
-static int try_atomic_semop (struct sem_array * sma, struct sembuf * sops,
+static int try_atomic_semop(struct sem_array * sma, struct sembuf * sops,
 			     int nsops, struct sem_undo *un, int pid)
 {
 	int result, sem_op;
@@ -418,15 +427,35 @@ static void wake_up_sem_queue(struct sem_queue *q, int error)
 	preempt_enable();
 }
 
+static void unlink_queue(struct sem_array *sma, struct sem_queue *q)
+{
+	list_del(&q->list);
+	if (q->nsops == 1) {
+		BUG_ON(list_empty(&q->simple_list));
+		list_del(&q->simple_list);
+	} else {
+		BUG_ON(!list_empty(&q->simple_list));
+		sma->complex_count--;
+	}
+}
+
+
 /* Go through the pending queue for the indicated semaphore
  * looking for tasks that can be completed.
  */
-static void update_queue (struct sem_array * sma)
+static void update_queue(struct sem_array *sma, int semnum)
 {
 	struct sem_queue *q, *tq;
+	struct list_head *pending_list;
 
 again:
-	list_for_each_entry_safe(q, tq, &sma->sem_pending, list) {
+	if (semnum == -1 || sma->complex_count) {
+		pending_list = &sma->sem_pending;
+	} else {
+		pending_list = &sma->sem_base[semnum].sem_pending;
+	}
+
+	list_for_each_entry_safe(q, tq, pending_list, list) {
 		int error;
 		int alter;
 
@@ -436,21 +465,21 @@ again:
 		/* Does q->sleeper still need to sleep? */
 		if (error > 0)
 			continue;
+		unlink_queue(sma, q);
 
-		list_del(&q->list);
+		alter = q->alter;
+		wake_up_sem_queue(q, error);
 
 		/*
 		 * The next operation that must be checked depends on the type
 		 * of the completed operation:
 		 * - if the operation modified the array, then restart from the
 		 *   head of the queue and check for threads that might be
-		 *   waiting for semaphore values to become 0.
+		 *   waiting for this particular semaphore value.
 		 * - if the operation didn't modify the array, then just
 		 *   continue.
 		 */
-		alter = q->alter;
-		wake_up_sem_queue(q, error);
-		if (alter)
+		if (alter && !error)
 			goto again;
 	}
 }
@@ -531,8 +560,7 @@ static void freeary(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp)
 
 	/* Wake up all pending processes and let them fail with EIDRM. */
 	list_for_each_entry_safe(q, tq, &sma->sem_pending, list) {
-		list_del(&q->list);
-
+		unlink_queue(sma, q);
 		wake_up_sem_queue(q, -EIDRM);
 	}
 
@@ -754,7 +782,7 @@ static int semctl_main(struct ipc_namespace *ns, int semid, int semnum,
 		}
 		sma->sem_ctime = get_seconds();
 		/* maybe some queued-up processes were waiting for this */
-		update_queue(sma);
+		update_queue(sma, -1);
 		err = 0;
 		goto out_unlock;
 	}
@@ -796,7 +824,7 @@ static int semctl_main(struct ipc_namespace *ns, int semid, int semnum,
 		curr->sempid = task_tgid_vnr(current);
 		sma->sem_ctime = get_seconds();
 		/* maybe some queued-up processes were waiting for this */
-		update_queue(sma);
+		update_queue(sma, semnum);
 		err = 0;
 		goto out_unlock;
 	}
@@ -1169,10 +1197,14 @@ SYSCALL_DEFINE4(semtimedop, int, semid, struct sembuf __user *, tsops,
 	if (error)
 		goto out_unlock_free;
 
-	error = try_atomic_semop (sma, sops, nsops, un, task_tgid_vnr(current));
+	error = try_atomic_semop(sma, sops, nsops, un, task_tgid_vnr(current));
 	if (error <= 0) {
-		if (alter && error == 0)
-			update_queue (sma);
+		if (alter && !error) {
+			if (nsops == 1)
+				update_queue(sma, sops->sem_num);
+			else
+				update_queue(sma, -1);
+		}
 		goto out_unlock_free;
 	}
 
@@ -1189,6 +1221,18 @@ SYSCALL_DEFINE4(semtimedop, int, semid, struct sembuf __user *, tsops,
 		list_add_tail(&queue.list, &sma->sem_pending);
 	else
 		list_add(&queue.list, &sma->sem_pending);
+	if (nsops == 1) {
+		struct sem *curr;
+		curr = &sma->sem_base[sops->sem_num];
+
+		if (alter)
+			list_add_tail(&queue.list, &curr->sem_pending);
+		else
+			list_add(&queue.list, &curr->sem_pending);
+	} else {
+		INIT_LIST_HEAD(&queue.simple_list);
+		sma->complex_count++;
+	}
 
 	queue.status = -EINTR;
 	queue.sleeper = current;
@@ -1231,7 +1275,7 @@ SYSCALL_DEFINE4(semtimedop, int, semid, struct sembuf __user *, tsops,
 	 */
 	if (timeout && jiffies_left == 0)
 		error = -EAGAIN;
-	list_del(&queue.list);
+	unlink_queue(sma, &queue);
 
 out_unlock_free:
 	sem_unlock(sma);
@@ -1356,11 +1400,14 @@ void exit_sem(struct task_struct *tsk)
 				if (semaphore->semval > SEMVMX)
 					semaphore->semval = SEMVMX;
 				semaphore->sempid = task_tgid_vnr(current);
+				if (likely(!sma->complex_count))
+					update_queue(sma, i);
 			}
 		}
 		sma->sem_otime = get_seconds();
 		/* maybe some queued-up processes were waiting for this */
-		update_queue(sma);
+		if (unlikely(sma->complex_count))
+			update_queue(sma, -1);
 		sem_unlock(sma);
 
 		call_rcu(&un->rcu, free_un);
@@ -1374,7 +1421,7 @@ static int sysvipc_sem_proc_show(struct seq_file *s, void *it)
 	struct sem_array *sma = it;
 
 	return seq_printf(s,
-			  "%10d %10d  %4o %10lu %5u %5u %5u %5u %10lu %10lu\n",
+			  "%10d %10d  %4o %10u %5u %5u %5u %5u %10lu %10lu\n",
 			  sma->sem_perm.key,
 			  sma->sem_perm.id,
 			  sma->sem_perm.mode,
-- 
1.6.2.5


^ permalink raw reply related	[flat|nested] 13+ messages in thread

* Re: [PATCH] [patch 4a/4] ipc: sem optimise simple operations
  2009-08-14 19:16 [PATCH] [patch 4a/4] ipc: sem optimise simple operations Manfred Spraul
@ 2009-08-15  4:52 ` Nick Piggin
  2009-08-15 10:10   ` Manfred Spraul
  0 siblings, 1 reply; 13+ messages in thread
From: Nick Piggin @ 2009-08-15  4:52 UTC (permalink / raw)
  To: Manfred Spraul; +Cc: Andrew Morton, Nadia Derbey, Pierre Peiffer, linux-kernel

On Fri, Aug 14, 2009 at 09:16:00PM +0200, Manfred Spraul wrote:
> An alternative patch to Nick's proposal:
> http://lkml.org/lkml/2009/8/11/11
> The patch is based on Nick's patch.
> 
> The patch is on top of the first three patches from Nick.
> 
> Identical with Nick's patch:
> - per semaphore list to optimize single sop semop calls:
> This avoids that the semaphore operations have to scan through all waiting
> tasks, even for the tasks that are waiting on a different semaphore from
> the same array.
> 
> Differences:
> - same code for per-semaphore and global list.
> - one list for both wait-for-zero and decrement instead of two lists.
> - unlink_queue helper function
> 
> Open points:
> - further testing
> - understand all changes. e.g. why switch from "if (alter)" to
> 	"if (alter && !error)" in update_queue
> 
> From the diffstat, it's simpler: 50 new lines instead of 110 new lines.
> 
> Nick: What do you think? Could you try your test app?

The problem with using the same algorithm is that we don't need to
"restart scanning" the simple-op lists when one of them succeeds.
This is because if a negative op fails, then no amount of subsequent
simple negative ops will make it succeed.

With multi-ops, it can be adding and subtracting and waiting for
zero of different sems etc, so in order to try to stay strictly
FIFO, then we always have to recheck all previous failed ops after
one succeeds.

The other problem is that we have to scan all ops in the multi-op
list because we don't know what combination of sem values might
allow the op to succeed. With the single-op lists, we know if the
semval is 0, then we don't have to scan the negative list, and if
it is non-0 then we don't have to scan the zero list.

Combine these two problems and that is where the O(n^2) behaviour
comes in to the complex-op algorithm.


^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH] [patch 4a/4] ipc: sem optimise simple operations
  2009-08-15  4:52 ` Nick Piggin
@ 2009-08-15 10:10   ` Manfred Spraul
  2009-08-15 10:38     ` Nick Piggin
  0 siblings, 1 reply; 13+ messages in thread
From: Manfred Spraul @ 2009-08-15 10:10 UTC (permalink / raw)
  To: Nick Piggin; +Cc: Andrew Morton, Nadia Derbey, Pierre Peiffer, linux-kernel

[-- Attachment #1: Type: text/plain, Size: 2858 bytes --]

On 08/15/2009 06:52 AM, Nick Piggin wrote:
>
> The problem with using the same algorithm is that we don't need to
> "restart scanning" the simple-op lists when one of them succeeds.
> This is because if a negative op fails, then no amount of subsequent
> simple negative ops will make it succeed.
>
> With multi-ops, it can be adding and subtracting and waiting for
> zero of different sems etc, so in order to try to stay strictly
> FIFO, then we always have to recheck all previous failed ops after
> one succeeds.
>    
What prevents us from optimizing the complex algorithm?

Right now, the rule is "restart if q->alter".
What about "restart if at least one operation was an increment or if at 
least one
semaphore got 0"?

> The other problem is that we have to scan all ops in the multi-op
> list because we don't know what combination of sem values might
> allow the op to succeed. With the single-op lists, we know if the
> semval is 0, then we don't have to scan the negative list, and if
> it is non-0 then we don't have to scan the zero list.
>    
If semval is 0, then the negative list is empty - it doesn't cost 
anything to scan it, especially if the negative list is the first part 
of a joint list.

But you describe one difference between the simple and complex operations:
- complex "increment" operations can be in the queue.
- simple "increment" operations always succeed immediately.

Thus it is possible to stop scanning the simple queue if an "alter" 
operation is found and the semaphore value is 0.
For the complex queue, this optimization doesn't appear to be that simple.

AFAICS this is a realistic case:
- 1000 threads are waiting with "decrement by 1".
- semval is 0.
- an increment by 1 arrives.

I've optimized that case, too - it's just one test.

With both optimizations, I see only one case left where your algorithm 
is better:
- semaphore value 1.
- 1000 threads waiting for zero.
- an increment by 1 arrives.

My code would scan the 1000 entries, your code doesn't.
But: Does that exist in real life?

Btw,
- semaphore value 1
- 1000 threads with "decrement 2" are waiting
- a decrement by 1 arrives.
My code doesn't scan at all, AFAICS you would scan all 1000 entries.
The same argument applies:
I would bet that this case doesn't exist in real life.

Could you create a dump of the pending list from a test run?
Does your database use "wait for zero", or increment/decrement 
operations with offsets that are not +-1?
Do you see a common case where separate algorithms are necessary?

> Combine these two problems and that is where the O(n^2) behaviour
> comes in to the complex-op algorithm.
>    
Yes - the worst case remains O(n^2), but only if there are increments.
(to be fair: with the optimizations O(n*m) with n waiting tasks and m 
increments)

But OTOH neither of our patches solves that.

--
     Manfred

[-- Attachment #2: 0001--ipc-sem-optimise-simple-operations-v2.patch --]
[-- Type: text/plain, Size: 11461 bytes --]

From 563903cbc136b1b9c5b5f2b52d8acb0bbb6a8732 Mon Sep 17 00:00:00 2001
From: Manfred Spraul <manfred@colorfullife.com>
Date: Sat, 15 Aug 2009 11:48:21 +0200
Subject: [PATCH] [PATCH 4a/4] ipc: sem optimise simple operations, v2

An alternative patch to Nick's proposal:
http://lkml.org/lkml/2009/8/11/11
The patch is based on Nick's patch.

The patch is on top of the first three patches from Nick.

I've added two additional optimizations for two common cases:
- do not restart scanning for all "alter" operations, only if necessary.
- stop scanning decrements if the semaphore value is already 0.

Identical with Nick's patch:
- per semaphore list to optimize single sop semop calls:
This avoids that the semaphore operations have to scan through all waiting
tasks, even for the tasks that are waiting on a different semaphore from
the same array.

Differences:
- same code for per-semaphore and global list.
- both simple and complex operations are improved.
- one list for both wait-for-zero and decrement instead of two lists.
- unlink_queue helper function.
- do not use likely/unlikely in exit_sem(): complex_count depends on the
  application, the kernel doesn't know anything about the running app.
  And: it's a very rare path.

Open points:
- further testing

Signed-off-by: Manfred Spraul <manfred@colorfullife.com>
---
 include/linux/sem.h |    5 ++-
 ipc/sem.c           |  148 ++++++++++++++++++++++++++++++++++++++++----------
 2 files changed, 122 insertions(+), 31 deletions(-)

diff --git a/include/linux/sem.h b/include/linux/sem.h
index 1b191c1..ea03058 100644
--- a/include/linux/sem.h
+++ b/include/linux/sem.h
@@ -86,6 +86,7 @@ struct task_struct;
 struct sem {
 	int	semval;		/* current value */
 	int	sempid;		/* pid of last operation */
+	struct list_head	sem_pending;	/* pending operations to be processed */
 };
 
 /* One sem_array data structure for each set of semaphores in the system. */
@@ -96,11 +97,13 @@ struct sem_array {
 	struct sem		*sem_base;	/* ptr to first semaphore in array */
 	struct list_head	sem_pending;	/* pending operations to be processed */
 	struct list_head	list_id;	/* undo requests on this array */
-	unsigned long		sem_nsems;	/* no. of semaphores in array */
+	int			sem_nsems;	/* no. of semaphores in array */
+	int			complex_count;	/* pending complex operations */
 };
 
 /* One queue for each sleeping process in the system. */
 struct sem_queue {
+	struct list_head	simple_list; /* queue of pending operations */
 	struct list_head	list;	 /* queue of pending operations */
 	struct task_struct	*sleeper; /* this process */
 	struct sem_undo		*undo;	 /* undo structure */
diff --git a/ipc/sem.c b/ipc/sem.c
index 3629ef8..ee49b38 100644
--- a/ipc/sem.c
+++ b/ipc/sem.c
@@ -240,6 +240,7 @@ static int newary(struct ipc_namespace *ns, struct ipc_params *params)
 	key_t key = params->key;
 	int nsems = params->u.nsems;
 	int semflg = params->flg;
+	int i;
 
 	if (!nsems)
 		return -EINVAL;
@@ -272,6 +273,14 @@ static int newary(struct ipc_namespace *ns, struct ipc_params *params)
 	ns->used_sems += nsems;
 
 	sma->sem_base = (struct sem *) &sma[1];
+
+	for (i = 0; i < nsems; i++) {
+		struct sem *s = &sma->sem_base[i];
+
+		INIT_LIST_HEAD(&s->sem_pending);
+	}
+
+	sma->complex_count = 0;
 	INIT_LIST_HEAD(&sma->sem_pending);
 	INIT_LIST_HEAD(&sma->list_id);
 	sma->sem_nsems = nsems;
@@ -335,13 +344,16 @@ SYSCALL_DEFINE3(semget, key_t, key, int, nsems, int, semflg)
  * all at once. Return 0 if yes, 1 if need to sleep, else return error code.
  */
 
-static int try_atomic_semop (struct sem_array * sma, struct sembuf * sops,
-			     int nsops, struct sem_undo *un, int pid)
+static int try_atomic_semop(struct sem_array * sma, struct sembuf * sops,
+			     int nsops, struct sem_undo *un, int pid,
+			     int *phelps)
 {
-	int result, sem_op;
+	int result, sem_op, helps;
 	struct sembuf *sop;
 	struct sem * curr;
 
+	helps = 0;
+
 	for (sop = sops; sop < sops + nsops; sop++) {
 		curr = sma->sem_base + sop->sem_num;
 		sem_op = sop->sem_op;
@@ -363,6 +375,15 @@ static int try_atomic_semop (struct sem_array * sma, struct sembuf * sops,
 			if (undo < (-SEMAEM - 1) || undo > SEMAEM)
 				goto out_of_range;
 		}
+		/* If we incremented a semaphore, or decremented
+		 * a semaphore to 0, then this might allow other operations to
+		 * proceed. Remember that.
+		 * Note: helps is only an optimization, it doesn't matter that
+	 * it is a bit conservative (e.g. decrement by 3, increment
+		 * that sem by 2 doesn't help, but the code returns "helps=1").
+		 */
+		if (sem_op > 0 || (sem_op < 0 && result == 0))
+			helps = 1;
 		curr->semval = result;
 	}
 
@@ -375,6 +396,7 @@ static int try_atomic_semop (struct sem_array * sma, struct sembuf * sops,
 	}
 	
 	sma->sem_otime = get_seconds();
+	*phelps = helps;
 	return 0;
 
 out_of_range:
@@ -394,6 +416,8 @@ undo:
 		sop--;
 	}
 
+	/* a failed operation can never help other operations. */
+	*phelps = 0;
 	return result;
 }
 
@@ -418,40 +442,85 @@ static void wake_up_sem_queue(struct sem_queue *q, int error)
 	preempt_enable();
 }
 
-/* Go through the pending queue for the indicated semaphore
- * looking for tasks that can be completed.
+static void unlink_queue(struct sem_array *sma, struct sem_queue *q)
+{
+	list_del(&q->list);
+	if (q->nsops == 1) {
+		BUG_ON(list_empty(&q->simple_list));
+		list_del(&q->simple_list);
+	} else {
+		BUG_ON(!list_empty(&q->simple_list));
+		sma->complex_count--;
+	}
+}
+
+
+/**
+ * update_queue(sma, semnum): Look for tasks that can be completed.
+ * @sma: semaphore array.
+ * @semnum: semaphore that was modified.
+ *
+ * update_queue must be called after a semaphore in a semaphore array
+ * was modified. If multiple semaphore were modified, then @semnum
+ * must be set to -1.
  */
-static void update_queue (struct sem_array * sma)
+static void update_queue(struct sem_array *sma, int semnum)
 {
 	struct sem_queue *q, *tq;
+	struct list_head *pending_list;
+
+	/* if there are complex operations around, then knowing the semaphore
+	 * that was modified doesn't help us. Assume that multiple semaphores
+	 * were modified.
+	 */
+	if (sma->complex_count)
+		semnum = -1;
+
+	if (semnum == -1)
+		pending_list = &sma->sem_pending;
+	else
+		pending_list = &sma->sem_base[semnum].sem_pending;
 
 again:
-	list_for_each_entry_safe(q, tq, &sma->sem_pending, list) {
+	list_for_each_entry_safe(q, tq, pending_list, list) {
 		int error;
-		int alter;
+		int could_help;
+
+		/* If we are looking for only one semaphore and that semaphore
+		 * is 0, then it does not make sense to scan the "alter"
+		 * entries: simple increments that affect only one entry
+		 * succeed immediately and cannot be in the pending queue,
+		 * and decrements cannot succeed if the value is already 0.
+		 */
+		if (semnum != -1 && sma->sem_base[semnum].semval == 0 &&
+				q->alter)
+			break;
 
 		error = try_atomic_semop(sma, q->sops, q->nsops,
-					 q->undo, q->pid);
+					 q->undo, q->pid, &could_help);
 
-		/* Does q->sleeper still need to sleep? */
+		/* q still needs to sleep, nothing happened */
 		if (error > 0)
 			continue;
 
-		list_del(&q->list);
+		/* q must be woken up:
+		 * The semaphore operation was processed, either successfully
+		 * (error=0) or it failed (e.g. UNDO overflow).
+		 */
+		unlink_queue(sma, q);
+		wake_up_sem_queue(q, error);
 
 		/*
 		 * The next operation that must be checked depends on the type
 		 * of the completed operation:
-		 * - if the operation modified the array, then restart from the
-		 *   head of the queue and check for threads that might be
-		 *   waiting for semaphore values to become 0.
-		 * - if the operation didn't modify the array, then just
-		 *   continue.
+		 * - if we incremented something or if a semaphore value got 0,
+		 *   then restart from the head of pending list: there could
+		 *   be tasks that are waiting for this event.
 		 */
-		alter = q->alter;
-		wake_up_sem_queue(q, error);
-		if (alter)
+		if (could_help)
 			goto again;
+		/* - Otherwise: just continue scanning.
+		 */
 	}
 }
 
@@ -531,8 +600,7 @@ static void freeary(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp)
 
 	/* Wake up all pending processes and let them fail with EIDRM. */
 	list_for_each_entry_safe(q, tq, &sma->sem_pending, list) {
-		list_del(&q->list);
-
+		unlink_queue(sma, q);
 		wake_up_sem_queue(q, -EIDRM);
 	}
 
@@ -754,7 +822,7 @@ static int semctl_main(struct ipc_namespace *ns, int semid, int semnum,
 		}
 		sma->sem_ctime = get_seconds();
 		/* maybe some queued-up processes were waiting for this */
-		update_queue(sma);
+		update_queue(sma, -1);
 		err = 0;
 		goto out_unlock;
 	}
@@ -796,7 +864,7 @@ static int semctl_main(struct ipc_namespace *ns, int semid, int semnum,
 		curr->sempid = task_tgid_vnr(current);
 		sma->sem_ctime = get_seconds();
 		/* maybe some queued-up processes were waiting for this */
-		update_queue(sma);
+		update_queue(sma, semnum);
 		err = 0;
 		goto out_unlock;
 	}
@@ -1072,7 +1140,7 @@ SYSCALL_DEFINE4(semtimedop, int, semid, struct sembuf __user *, tsops,
 	struct sembuf fast_sops[SEMOPM_FAST];
 	struct sembuf* sops = fast_sops, *sop;
 	struct sem_undo *un;
-	int undos = 0, alter = 0, max;
+	int undos = 0, alter = 0, max, could_help;
 	struct sem_queue queue;
 	unsigned long jiffies_left = 0;
 	struct ipc_namespace *ns;
@@ -1169,10 +1237,15 @@ SYSCALL_DEFINE4(semtimedop, int, semid, struct sembuf __user *, tsops,
 	if (error)
 		goto out_unlock_free;
 
-	error = try_atomic_semop (sma, sops, nsops, un, task_tgid_vnr(current));
+	error = try_atomic_semop(sma, sops, nsops, un,
+					task_tgid_vnr(current), &could_help);
 	if (error <= 0) {
-		if (alter && error == 0)
-			update_queue (sma);
+		if (could_help) {
+			if (nsops == 1)
+				update_queue(sma, sops->sem_num);
+			else
+				update_queue(sma, -1);
+		}
 		goto out_unlock_free;
 	}
 
@@ -1189,6 +1262,18 @@ SYSCALL_DEFINE4(semtimedop, int, semid, struct sembuf __user *, tsops,
 		list_add_tail(&queue.list, &sma->sem_pending);
 	else
 		list_add(&queue.list, &sma->sem_pending);
+	if (nsops == 1) {
+		struct sem *curr;
+		curr = &sma->sem_base[sops->sem_num];
+
+		if (alter)
+			list_add_tail(&queue.list, &curr->sem_pending);
+		else
+			list_add(&queue.list, &curr->sem_pending);
+	} else {
+		INIT_LIST_HEAD(&queue.simple_list);
+		sma->complex_count++;
+	}
 
 	queue.status = -EINTR;
 	queue.sleeper = current;
@@ -1231,7 +1316,7 @@ SYSCALL_DEFINE4(semtimedop, int, semid, struct sembuf __user *, tsops,
 	 */
 	if (timeout && jiffies_left == 0)
 		error = -EAGAIN;
-	list_del(&queue.list);
+	unlink_queue(sma, &queue);
 
 out_unlock_free:
 	sem_unlock(sma);
@@ -1356,11 +1441,14 @@ void exit_sem(struct task_struct *tsk)
 				if (semaphore->semval > SEMVMX)
 					semaphore->semval = SEMVMX;
 				semaphore->sempid = task_tgid_vnr(current);
+				if (!sma->complex_count)
+					update_queue(sma, i);
 			}
 		}
 		sma->sem_otime = get_seconds();
 		/* maybe some queued-up processes were waiting for this */
-		update_queue(sma);
+		if (sma->complex_count)
+			update_queue(sma, -1);
 		sem_unlock(sma);
 
 		call_rcu(&un->rcu, free_un);
@@ -1374,7 +1462,7 @@ static int sysvipc_sem_proc_show(struct seq_file *s, void *it)
 	struct sem_array *sma = it;
 
 	return seq_printf(s,
-			  "%10d %10d  %4o %10lu %5u %5u %5u %5u %10lu %10lu\n",
+			  "%10d %10d  %4o %10u %5u %5u %5u %5u %10lu %10lu\n",
 			  sma->sem_perm.key,
 			  sma->sem_perm.id,
 			  sma->sem_perm.mode,
-- 
1.6.2.5


^ permalink raw reply related	[flat|nested] 13+ messages in thread

* Re: [PATCH] [patch 4a/4] ipc: sem optimise simple operations
  2009-08-15 10:10   ` Manfred Spraul
@ 2009-08-15 10:38     ` Nick Piggin
       [not found]       ` <4A86ABF0.2070207@colorfullife.com>
  0 siblings, 1 reply; 13+ messages in thread
From: Nick Piggin @ 2009-08-15 10:38 UTC (permalink / raw)
  To: Manfred Spraul; +Cc: Andrew Morton, Nadia Derbey, Pierre Peiffer, linux-kernel

On Sat, Aug 15, 2009 at 12:10:34PM +0200, Manfred Spraul wrote:
> On 08/15/2009 06:52 AM, Nick Piggin wrote:
> >
> >The problem with using the same algorithm is that we don't need to
> >"restart scanning" the simple-op lists when one of them succeeds.
> >This is because if a negative op fails, then no amount of subsequent
> >simple negative ops will make it succeed.
> >
> >With multi-ops, it can be adding and subtracting and waiting for
> >zero of different sems etc, so in order to try to stay strictly
> >FIFO, then we always have to recheck all previous failed ops after
> >one succeeds.
> >   
> What prevents us from optimizing the complex algorithm?
> 
> Right now, the rule is "restart if q->alter".
> What about "restart if at least one operation was an increment or if at 
> least one
> semaphore got 0"?

Well you could but it would still not be optimal for simple-op
case where you have outstanding -1 and 0 operations. Complex
operations are rare, so I preferred not to make it even more
complex by adding special cases but rather just add another
loop for doing simple ops which is much more obviously right.

I don't think it is a big deal to have multiple loops.


> >The other problem is that we have to scan all ops in the multi-op
> >list because we don't know what combination of sem values might
> >allow the op to succeed. With the single-op lists, we know if the
> >semval is 0, then we don't have to scan the negative list, and if
> >it is non-0 then we don't have to scan the zero list.
> >   
> If semval is 0, then the negative list is empty - it doesn't cost 
> anything to scan it, especially if the negative list is the first part 
> of a joint list.

No that's not the problem. If semval is 0, then you still have to
scan the complex list. If semval is 0, then you do not have to scan
the *negative* list. The problem is not the zero list.

 
> But you describe one difference between the simple and complex operations:
> - complex "increment" operations can be in the queue.
> - simple "increment" operations always succeed immediately.
> 
> Thus it is possible to stop scanning the simple queue if an "alter" 
> operation is found and the semaphore value is 0.

This is what I mean.


> For the complex queue, this optimization doesn't appear to be that simple.
> 
> AFAICS this is a realistic case:
> - 1000 threads are waiting with "decrement by 1".
> - semval is 0.
> - an increment by 1 arrives.
> 
> I've optimized that case, too - it's just one test.

If you have a significant number of wait-for-0 and the update goes to
non-zero it seems like it needlessly has to scan the wait-for-0
list. Subsequent decrements will cause the zero ops to be rescanned.

Also, your can_help logic is broken. 

> +		/* If we incremented a semaphore, or decremented
> +		 * a semaphore to 0, then this might allow other operations to
> +		 * proceed. Remember that.
> +		 * Note: helps is only an optimization, it doesn't matter that
> +	 * it is a bit conservative (e.g. decrement by 3, increment
> +		 * that sem by 2 doesn't help, but the code returns "helps=1").
> +		 */
> +		if (sem_op > 0 || (sem_op < 0 && result == 0))
> +			helps = 1;
>  		curr->semval = result;
>  	}
  
It is absolutely not true that decrements only help if they cause
the semaphore to reach 0.

Consider a multi-op semop which includes a decrement operation then
a wait-for-0 operation.

This, and...

> -static void update_queue (struct sem_array * sma)
> +static void update_queue(struct sem_array *sma, int semnum)
>  {
>  	struct sem_queue *q, *tq;
> +	struct list_head *pending_list;
> +
> +	/* if there are complex operations around, then knowing the semaphore
> +	 * that was modified doesn't help us. Assume that multiple semaphores
> +	 * were modified.
> +	 */
> +	if (sma->complex_count)
> +		semnum = -1;
> +
> +	if (semnum == -1)
> +		pending_list = &sma->sem_pending;
> +	else
> +		pending_list = &sma->sem_base[semnum].sem_pending;
>  
>  again:
> -	list_for_each_entry_safe(q, tq, &sma->sem_pending, list) {
> +	list_for_each_entry_safe(q, tq, pending_list, list) {
>  		int error;
> -		int alter;
> +		int could_help;
> +
> +		/* If we are looking for only one semaphore and that semaphore
> +		 * is 0, then it does not make sense to scan the "alter"
> +		 * entries: simple increments that affect only one entry
> +		 * succeed immediately and cannot be in the pending queue,
> +		 * and decrements cannot succeed if the value is already 0.
> +		 */
> +		if (semnum != -1 && sma->sem_base[semnum].semval == 0 &&
> +				q->alter)
> +			break;

These additional tests just seem to add complexity to me. Whereas my
code leaves complex list processing unchanged, and just adds some
simpler loops, yours adds additional complexity to an already complex
loop. I prefer my approach.


> With both optimizations, I see only one case left where your algorithm 
> is better:
> - semaphore value 1.
> - 1000 threads waiting for zero.
> - an increment by 1 arrives.
> 
> My code would scan the 1000 entries, your code doesn't.
> But: Does that exist in real life?

Don't know.

 
> Btw,
> - semaphore value 1
> - 1000 threads with "decrement 2" are waiting
> - a decrement by 1 arrives.
> My code doesn't scan at all, AFAICS you would scan all 1000 entries.
> The same argument applies:
> I would bet that this case doesn't exist in real life.

I don't see it. My code doesn't scan the negative queue at all if
the semaphore value is 0.

 
> Could you create a dump of the pending list from a test run?
> Does your database use "wait for zero", or increment/decrement 
> operations with offsets that are not +-1?

In this case I don't think it uses wait-for-zero. I don't have
access to the source code, I just created a really simple test
case. I just don't see the harm in optimizing more cases, and
with simpler loops.


> Do you see a common case where separate algorithms are necessary?
> 
> >Combine these two problems and that is where the O(n^2) behaviour
> >comes in to the complex-op algorithm.
> >   
> Yes - the worst case remains O(n^2), but only if there are increments.
> (to be fair: with the optimizations O(n*m) with n waiting tasks and m 
> increments)
> 
> But OTOH neither of our patches solves that.

Yeah I was talking about the O(n^2) remaining in your patch with
only single-op entries. Complex ops are tricky and I wasn't
concerned with optimising them for now.


^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH] [patch 4a/4] ipc: sem optimise simple operations
       [not found]       ` <4A86ABF0.2070207@colorfullife.com>
@ 2009-08-15 14:49         ` Nick Piggin
  2009-08-15 16:32           ` Manfred Spraul
  0 siblings, 1 reply; 13+ messages in thread
From: Nick Piggin @ 2009-08-15 14:49 UTC (permalink / raw)
  To: Manfred Spraul; +Cc: Andrew Morton, Nadia Derbey, Pierre Peiffer, linux-kernel

On Sat, Aug 15, 2009 at 02:37:04PM +0200, Manfred Spraul wrote:
> On 08/15/2009 12:38 PM, Nick Piggin wrote:
> >Also, your can_help logic is broken.
> >
> >   
> Good point: with multi-ops, it's possible to wait for arbitrary values.
> I missed that.

Yeah multi ops aren't simple. It's easy to miss things when changing
the algorithm for them.


> >These additional tests just seem to add complexity to me. Wheras my
> >code leaves complex list processing unchanged, and just adds some
> >simpler loops, yours adds additional complexity to an already complex
> >loop. I prefer my approach.
> >   
> It depends on the complexity metric:

I mean code complexity which is most important here IMO, considering
there hasn't been a problem shown with my approach (and despite
being simpler removes all O(n^2) cases from the simple ops algorithms).


> From the .text size or the number of lines, my approach is simpler.
>             1list     2list
> lines    +67     +110 lines
> .text    +398    +678 bytes
> 
> And it needs less runtime memory, too:
> 20 bytes per semaphore instead of 36 [on 64-bit].
> 
> Anyway:
> As far as I can see, you found two performance problems:
> - the semop() scans all pending tasks from the array, not just the tasks 
> that are waiting on the semaphore that was modified.
> - when multiple decrements are waiting, then the code scans all 
> decrements, even if the first decrement reduced semval to 0 and further 
> progress is impossible
> 
> Let's stick to these two points.

I don't see how you've argued that yours is better.

If you are worried about memory consumption, we can add _rcu variants
to hlists and use them. And if you are worried about text size, then
I would bet my version actually uses less icache in the case of
simple ops being used.


> Attached is my proposal.
> 
> I'll verify the patches, but probably only tomorrow.
> 
> --
>     Manfred

>  From bff62ee2f8bc25c807095ddb8a9fc8d28140c785 Mon Sep 17 00:00:00 2001
>  From: Manfred Spraul <manfred@colorfullife.com>
> Date: Sat, 15 Aug 2009 14:29:06 +0200
> Subject: [PATCH 1/2] [PATCH 1/2] ipc/sem.c: alternative list optimization
> 
> Part 1: create per-sem list.
> ---
>  include/linux/sem.h |    5 ++-
>  ipc/sem.c           |   83 +++++++++++++++++++++++++++++++++++++++++---------
>  2 files changed, 72 insertions(+), 16 deletions(-)
> 
> diff --git a/include/linux/sem.h b/include/linux/sem.h
> index 1b191c1..8a4adbe 100644
> --- a/include/linux/sem.h
> +++ b/include/linux/sem.h
> @@ -86,6 +86,7 @@ struct task_struct;
>  struct sem {
>  	int	semval;		/* current value */
>  	int	sempid;		/* pid of last operation */
> +	struct list_head sem_pending; /* pending single-sop operations */
>  };
>  
>  /* One sem_array data structure for each set of semaphores in the system. */
> @@ -96,11 +97,13 @@ struct sem_array {
>  	struct sem		*sem_base;	/* ptr to first semaphore in array */
>  	struct list_head	sem_pending;	/* pending operations to be processed */
>  	struct list_head	list_id;	/* undo requests on this array */
> -	unsigned long		sem_nsems;	/* no. of semaphores in array */
> +	int			sem_nsems;	/* no. of semaphores in array */
> +	int			complex_count;	/* pending complex operations */
>  };
>  
>  /* One queue for each sleeping process in the system. */
>  struct sem_queue {
> +	struct list_head	simple_list; /* queue of pending operations */
>  	struct list_head	list;	 /* queue of pending operations */
>  	struct task_struct	*sleeper; /* this process */
>  	struct sem_undo		*undo;	 /* undo structure */
> diff --git a/ipc/sem.c b/ipc/sem.c
> index 3629ef8..59adeda 100644
> --- a/ipc/sem.c
> +++ b/ipc/sem.c
> @@ -240,6 +240,7 @@ static int newary(struct ipc_namespace *ns, struct ipc_params *params)
>  	key_t key = params->key;
>  	int nsems = params->u.nsems;
>  	int semflg = params->flg;
> +	int i;
>  
>  	if (!nsems)
>  		return -EINVAL;
> @@ -272,6 +273,14 @@ static int newary(struct ipc_namespace *ns, struct ipc_params *params)
>  	ns->used_sems += nsems;
>  
>  	sma->sem_base = (struct sem *) &sma[1];
> +
> +	for (i = 0; i < nsems; i++) {
> +		struct sem *s = &sma->sem_base[i];
> +
> +		INIT_LIST_HEAD(&s->sem_pending);
> +	}
> +
> +	sma->complex_count = 0;
>  	INIT_LIST_HEAD(&sma->sem_pending);
>  	INIT_LIST_HEAD(&sma->list_id);
>  	sma->sem_nsems = nsems;
> @@ -418,17 +427,48 @@ static void wake_up_sem_queue(struct sem_queue *q, int error)
>  	preempt_enable();
>  }
>  
> -/* Go through the pending queue for the indicated semaphore
> - * looking for tasks that can be completed.
> +static void unlink_queue(struct sem_array *sma, struct sem_queue *q)
> +{
> +	list_del(&q->list);
> +	if (q->nsops == 1) {
> +		BUG_ON(list_empty(&q->simple_list));
> +		list_del(&q->simple_list);
> +	} else {
> +		BUG_ON(!list_empty(&q->simple_list));
> +		sma->complex_count--;
> +	}
> +}
> +
> +
> +/**
> + * update_queue(sma, semnum): Look for tasks that can be completed.
> + * @sma: semaphore array.
> + * @semnum: semaphore that was modified.
> + *
> + * update_queue must be called after a semaphore in a semaphore array
> + * was modified. If multiple semaphore were modified, then @semnum
> + * must be set to -1.
>   */
> -static void update_queue (struct sem_array * sma)
> +static void update_queue(struct sem_array *sma, int semnum)
>  {
>  	struct sem_queue *q, *tq;
> +	struct list_head *pending_list;
> +
> +	/* if there are complex operations around, then knowing the semaphore
> +	 * that was modified doesn't help us. Assume that multiple semaphores
> +	 * were modified.
> +	 */
> +	if (sma->complex_count)
> +		semnum = -1;
> +
> +	if (semnum == -1)
> +		pending_list = &sma->sem_pending;
> +	else
> +		pending_list = &sma->sem_base[semnum].sem_pending;
>  
>  again:
> -	list_for_each_entry_safe(q, tq, &sma->sem_pending, list) {
> -		int error;
> -		int alter;
> +	list_for_each_entry_safe(q, tq, pending_list, list) {
> +		int error, alter;
>  
>  		error = try_atomic_semop(sma, q->sops, q->nsops,
>  					 q->undo, q->pid);
> @@ -437,7 +477,7 @@ again:
>  		if (error > 0)
>  			continue;
>  
> -		list_del(&q->list);
> +		unlink_queue(sma, q);
>  
>  		/*
>  		 * The next operation that must be checked depends on the type
> @@ -531,8 +571,7 @@ static void freeary(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp)
>  
>  	/* Wake up all pending processes and let them fail with EIDRM. */
>  	list_for_each_entry_safe(q, tq, &sma->sem_pending, list) {
> -		list_del(&q->list);
> -
> +		unlink_queue(sma, q);
>  		wake_up_sem_queue(q, -EIDRM);
>  	}
>  
> @@ -754,7 +793,7 @@ static int semctl_main(struct ipc_namespace *ns, int semid, int semnum,
>  		}
>  		sma->sem_ctime = get_seconds();
>  		/* maybe some queued-up processes were waiting for this */
> -		update_queue(sma);
> +		update_queue(sma, -1);
>  		err = 0;
>  		goto out_unlock;
>  	}
> @@ -796,7 +835,7 @@ static int semctl_main(struct ipc_namespace *ns, int semid, int semnum,
>  		curr->sempid = task_tgid_vnr(current);
>  		sma->sem_ctime = get_seconds();
>  		/* maybe some queued-up processes were waiting for this */
> -		update_queue(sma);
> +		update_queue(sma, semnum);
>  		err = 0;
>  		goto out_unlock;
>  	}
> @@ -1172,7 +1211,8 @@ SYSCALL_DEFINE4(semtimedop, int, semid, struct sembuf __user *, tsops,
>  	error = try_atomic_semop (sma, sops, nsops, un, task_tgid_vnr(current));
>  	if (error <= 0) {
>  		if (alter && error == 0)
> -			update_queue (sma);
> +			update_queue(sma, (nsops == 1) ? sops[0].sem_num : -1);
> +
>  		goto out_unlock_free;
>  	}
>  
> @@ -1190,6 +1230,19 @@ SYSCALL_DEFINE4(semtimedop, int, semid, struct sembuf __user *, tsops,
>  	else
>  		list_add(&queue.list, &sma->sem_pending);
>  
> +	if (nsops == 1) {
> +		struct sem *curr;
> +		curr = &sma->sem_base[sops->sem_num];
> +
> +		if (alter)
> +			list_add_tail(&queue.list, &curr->sem_pending);
> +		else
> +			list_add(&queue.list, &curr->sem_pending);
> +	} else {
> +		INIT_LIST_HEAD(&queue.simple_list);
> +		sma->complex_count++;
> +	}
> +
>  	queue.status = -EINTR;
>  	queue.sleeper = current;
>  	current->state = TASK_INTERRUPTIBLE;
> @@ -1231,7 +1284,7 @@ SYSCALL_DEFINE4(semtimedop, int, semid, struct sembuf __user *, tsops,
>  	 */
>  	if (timeout && jiffies_left == 0)
>  		error = -EAGAIN;
> -	list_del(&queue.list);
> +	unlink_queue(sma, &queue);
>  
>  out_unlock_free:
>  	sem_unlock(sma);
> @@ -1360,7 +1413,7 @@ void exit_sem(struct task_struct *tsk)
>  		}
>  		sma->sem_otime = get_seconds();
>  		/* maybe some queued-up processes were waiting for this */
> -		update_queue(sma);
> +		update_queue(sma, -1);
>  		sem_unlock(sma);
>  
>  		call_rcu(&un->rcu, free_un);
> @@ -1374,7 +1427,7 @@ static int sysvipc_sem_proc_show(struct seq_file *s, void *it)
>  	struct sem_array *sma = it;
>  
>  	return seq_printf(s,
> -			  "%10d %10d  %4o %10lu %5u %5u %5u %5u %10lu %10lu\n",
> +			  "%10d %10d  %4o %10u %5u %5u %5u %5u %10lu %10lu\n",
>  			  sma->sem_perm.key,
>  			  sma->sem_perm.id,
>  			  sma->sem_perm.mode,
> -- 
> 1.6.2.5
> 

>  From db1d2efe71ddc982c96f87a65c1bed2ce6bf9944 Mon Sep 17 00:00:00 2001
>  From: Manfred Spraul <manfred@colorfullife.com>
> Date: Sat, 15 Aug 2009 14:31:12 +0200
> Subject: [PATCH 2/2] [PATCH 2/2] ipc/sem.c: optimize single-sop decrements
> 
> ---
>  ipc/sem.c |   11 +++++++++++
>  1 files changed, 11 insertions(+), 0 deletions(-)
> 
> diff --git a/ipc/sem.c b/ipc/sem.c
> index 59adeda..403ca0a 100644
> --- a/ipc/sem.c
> +++ b/ipc/sem.c
> @@ -470,6 +470,17 @@ again:
>  	list_for_each_entry_safe(q, tq, pending_list, list) {
>  		int error, alter;
>  
> +		/* If we are looking for only one semaphore and that semaphore
> +		 * is 0, then it does not make sense to scan the "alter"
> +		 * entries: simple increments that affect only one entry
> +		 * succeed immediately and cannot be in the single sop,
> +		 * per semaphore pending queue, and decrements cannot
> +		 * succeed if the value is already 0.
> +		 */
> +		if (semnum != -1 && sma->sem_base[semnum].semval == 0 &&
> +				q->alter)
> +			break;
> +
>  		error = try_atomic_semop(sma, q->sops, q->nsops,
>  					 q->undo, q->pid);
>  
> -- 
> 1.6.2.5
> 


^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH] [patch 4a/4] ipc: sem optimise simple operations
  2009-08-15 14:49         ` Nick Piggin
@ 2009-08-15 16:32           ` Manfred Spraul
  2009-08-16  4:53             ` Nick Piggin
  2009-08-16 10:31             ` Nick Piggin
  0 siblings, 2 replies; 13+ messages in thread
From: Manfred Spraul @ 2009-08-15 16:32 UTC (permalink / raw)
  To: Nick Piggin; +Cc: Andrew Morton, Nadia Derbey, Pierre Peiffer, linux-kernel

On 08/15/2009 04:49 PM, Nick Piggin wrote:
>
> I don't see how you've argued that yours is better.
>
>    
Lower number of new code lines,
Lower total code size increase.
Lower number of separate codepaths.
Lower runtime memory consumption.
Two separate patches for the two algorithm improvements.

The main advantage of your version is that you optimize more cases.
> If you are worried about memory consumption, we can add _rcu variants
> to hlists and use them.
There is no need for _rcu, the whole code runs under a spinlock.
Thus the wait_for_zero queue could be converted to a hlist immediately.

Hmm: Did you track my proposals for your version?

- exit_sem() is not a hot path.
I would propose to treat every exit_sem as update_queue, not an 
update_queue_simple for every individual UNDO.

- create an unlink_queue() helper that contains the updates to q->lists 
and sma->complex_count.
Three copies ask for errors.

- now: use a hlist for the zero queue.

>   And if you are worried about text size, then
> I would bet my version actually uses less icache in the case of
> simple ops being used.
>    
It depends. After disabling inlining, including all helper functions 
that differ:

My proposal: 301 bytes for update_queue.

"simple", only negv: 226 bytes
"simple", negv+zero: 354 bytes
simple+complex: 526 bytes.

Thus with only +-1 simple ops, your version uses less icache. If both 
+-1 and 0 ops are used, your version uses more icache.

Could you please send me your benchmark app?

--
     Manfred

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH] [patch 4a/4] ipc: sem optimise simple operations
  2009-08-15 16:32           ` Manfred Spraul
@ 2009-08-16  4:53             ` Nick Piggin
  2009-08-16  5:12               ` Nick Piggin
  2009-08-16 10:31             ` Nick Piggin
  1 sibling, 1 reply; 13+ messages in thread
From: Nick Piggin @ 2009-08-16  4:53 UTC (permalink / raw)
  To: Manfred Spraul; +Cc: Andrew Morton, Nadia Derbey, Pierre Peiffer, linux-kernel

On Sat, Aug 15, 2009 at 06:32:14PM +0200, Manfred Spraul wrote:
> On 08/15/2009 04:49 PM, Nick Piggin wrote:
> >
> >I don't see how you've argued that yours is better.
> >
> >   

OK, but I'll add some context too.

> Lower number of new code lines,

Downside is you further complicate the already complex path.

> Lower total code size increase.

Downside is simple ops run in the complex path too so icache
footprint could be higher

> Lower number of seperate codepaths.

But they are independently all simpler. Combining them doesn't
add complexity together.

> Lower runtime memory consumption.

I'll fix this with hlists.

> Two seperate patches for the two algorithm improvements.
> 
> The main advantage of your version is that you optimize more cases.

And it is simpler.

> >If you are worried about memory consumption, we can add _rcu variants
> >to hlists and use them.
> There is no need for _rcu, the whole code runs under a spinlock.

Ah, yeah I was thinking of the undo list I think. Great, then that
will be easy.

> Thus the wait_for_zero queue could be converted to a hlist immediately.

Both queues can be.

> 
> Hmm: Did you track my proposals for your version?
> 
> - exit_sem() is not a hot path.
> I would propose to tread every exit_sem as update_queue, not an 
> update_queue_simple for every individual UNDO.

I don't think it matters too much, but ok. 

> - create an unlink_queue() helper that contains the updates to q->lists 
> and sma->complex_count.
> Three copies ask for errors.

Yes this is a good idea.

> - now: use a hlist for the zero queue.
> 
> >  And if you are worried about text size, then
> >I would bet my version actually uses less icache in the case of
> >simple ops being used.
> >   
> It depends. After disabling inlining, including all helper functions 
> that differ:
> 
> My proposal: 301 bytes for update_queue.
> 
> "simple", only negv: 226 bytes
> "simple, negv+zero: 354 bytes
> simple+complex: 526 bytes.
> 
> Thus with only +-1 simple ops, your version uses less icache. If both 
> +-1 and 0 ops are used, your version uses more icache.

I'll get rid of some of the BUG_ONs too, they're mostly there just
to verify correctness when I was developing it.

> Could you please send me your benchmark app?

Yeah I'll dig it out. It iterally was just lock, spin, unlock with
lots of processes.


^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH] [patch 4a/4] ipc: sem optimise simple operations
  2009-08-16  4:53             ` Nick Piggin
@ 2009-08-16  5:12               ` Nick Piggin
  0 siblings, 0 replies; 13+ messages in thread
From: Nick Piggin @ 2009-08-16  5:12 UTC (permalink / raw)
  To: Manfred Spraul; +Cc: Andrew Morton, Nadia Derbey, Pierre Peiffer, linux-kernel

On Sun, Aug 16, 2009 at 06:53:16AM +0200, Nick Piggin wrote:
> On Sat, Aug 15, 2009 at 06:32:14PM +0200, Manfred Spraul wrote:
> > Lower runtime memory consumption.
> 
> I'll fix this with hlists.

Oh right no the negv list can't easily be made an hlist. So
yes yours can save a bit of memory there.


^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH] [patch 4a/4] ipc: sem optimise simple operations
  2009-08-15 16:32           ` Manfred Spraul
  2009-08-16  4:53             ` Nick Piggin
@ 2009-08-16 10:31             ` Nick Piggin
  2009-08-16 11:29               ` Manfred Spraul
  1 sibling, 1 reply; 13+ messages in thread
From: Nick Piggin @ 2009-08-16 10:31 UTC (permalink / raw)
  To: Manfred Spraul; +Cc: Andrew Morton, Nadia Derbey, Pierre Peiffer, linux-kernel

On Sat, Aug 15, 2009 at 06:32:14PM +0200, Manfred Spraul wrote:
> It depends. After disabling inlining, including all helper functions 
> that differ:
> 
> My proposal: 301 bytes for update_queue.
> 
> "simple", only negv: 226 bytes
> "simple, negv+zero: 354 bytes
> simple+complex: 526 bytes.
> 
> Thus with only +-1 simple ops, your version uses less icache. If both 
> +-1 and 0 ops are used, your version uses more icache.

Don't forget that in that case, your version is badly suboptimal
due to the algorithmic complexity.

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH] [patch 4a/4] ipc: sem optimise simple operations
  2009-08-16 10:31             ` Nick Piggin
@ 2009-08-16 11:29               ` Manfred Spraul
  2009-08-17  6:44                 ` Nick Piggin
  0 siblings, 1 reply; 13+ messages in thread
From: Manfred Spraul @ 2009-08-16 11:29 UTC (permalink / raw)
  To: Nick Piggin; +Cc: Andrew Morton, Nadia Derbey, Pierre Peiffer, linux-kernel

On 08/16/2009 12:31 PM, Nick Piggin wrote:
> On Sat, Aug 15, 2009 at 06:32:14PM +0200, Manfred Spraul wrote:
>    
>> It depends. After disabling inlining, including all helper functions
>> that differ:
>>
>> My proposal: 301 bytes for update_queue.
>>
>> "simple", only negv: 226 bytes
>> "simple, negv+zero: 354 bytes
>> simple+complex: 526 bytes.
>>
>> Thus with only +-1 simple ops, your version uses less icache. If both
>> +-1 and 0 ops are used, your version uses more icache.
>>      
> Don't forget that in that case, your version is badly suboptimal
> due to the algorithmic complexity.
>    
I know, I mentioned it in the change log:
Waking up one "decrement by one" task is O(1) with your code and 
O(1+<number of waiting-for-zero tasks>) with my code.

This is the price paid for saving memory:
Your version keeps three pointers per semaphore (waiting-for-zero, 
oldest decrement, newest decrement)
My version keeps only two pointers (newest decrement, waiting-for-zero). 
The "oldest decrement" is reconstructed on the fly, and that operation 
is O(<number of waiting-for-zero tasks>).

--
     Manfred

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH] [patch 4a/4] ipc: sem optimise simple operations
  2009-08-16 11:29               ` Manfred Spraul
@ 2009-08-17  6:44                 ` Nick Piggin
  2009-08-17 13:02                   ` Manfred Spraul
  0 siblings, 1 reply; 13+ messages in thread
From: Nick Piggin @ 2009-08-17  6:44 UTC (permalink / raw)
  To: Manfred Spraul; +Cc: Andrew Morton, Nadia Derbey, Pierre Peiffer, linux-kernel

On Sun, Aug 16, 2009 at 01:29:23PM +0200, Manfred Spraul wrote:
> On 08/16/2009 12:31 PM, Nick Piggin wrote:
> >On Sat, Aug 15, 2009 at 06:32:14PM +0200, Manfred Spraul wrote:
> >   
> >>It depends. After disabling inlining, including all helper functions
> >>that differ:
> >>
> >>My proposal: 301 bytes for update_queue.
> >>
> >>"simple", only negv: 226 bytes
> >>"simple, negv+zero: 354 bytes
> >>simple+complex: 526 bytes.
> >>
> >>Thus with only +-1 simple ops, your version uses less icache. If both
> >>+-1 and 0 ops are used, your version uses more icache.
> >>     
> >Don't forget that in that case, your version is badly suboptimal
> >due to the algorithmic complexity.
> >   
> I know, I mentioned it in the change log:
> Waking up one "decrement by one" task is O(1) with your code and 
> O(1+<number of waiting-for-zero tasks>) with my code.
> 
> This is the price paid for saving memory:
> Your version keeps three pointers per semaphore (waiting-for-zero, 
> oldest decrement, newest decrement)
> My version keeps only two pointers (newest decrement, waiting-for-zero). 
> The "oldest decrement" is reconstructed on the fly, and that operation 
> is O(<number of waiting-for-zero tasks>).

OK, well let's just get something in.

I kind of liked to optimise both cases because it took this long to
uncover this suboptimal behaviour (and the problem isn't even
completely due to suboptimal semaphore but also userspace contention),
so even if there are some workloads with a little problem I don't know
if they will ever get reported/diagnosed.

That said, I'm not too unhappy with your version if you feel strongly
about it.

Thanks for reviewing the other patches too.


^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH] [patch 4a/4] ipc: sem optimise simple operations
  2009-08-17  6:44                 ` Nick Piggin
@ 2009-08-17 13:02                   ` Manfred Spraul
  2009-08-17 13:10                     ` Nick Piggin
  0 siblings, 1 reply; 13+ messages in thread
From: Manfred Spraul @ 2009-08-17 13:02 UTC (permalink / raw)
  To: Nick Piggin; +Cc: Andrew Morton, Nadia Derbey, Pierre Peiffer, linux-kernel

On 08/17/2009 08:44 AM, Nick Piggin wrote:
> OK, well let's just get something in.
>    
Good, I would propose that the following 7 patches should be merged:

http://lkml.org/lkml/2009/8/11/59
http://lkml.org/lkml/2009/8/11/9
http://lkml.org/lkml/2009/8/11/14
http://lkml.org/lkml/2009/8/15/163
http://lkml.org/lkml/2009/8/15/164
http://lkml.org/lkml/2009/8/15/167
http://lkml.org/lkml/2009/8/15/168

Nick: is that ok from your point of view?

> That said, I'm not too unhappy with your version if you feel strongly
> about it.
I would prefer it:
We simply don't know if a wait-for-zero list, only for single sop operations, is a step in the right direction.

Postgres uses single sop operations with just +-1 on one semaphore.
You wrote that your SAP workload also uses +-1.
According to google codesearch, apache, mozilla, mpich all use +-1.

Thus: Who uses single sop, wait for zero?
I'm just afraid that we optimize for the wrong case.

--
	Manfred


^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH] [patch 4a/4] ipc: sem optimise simple operations
  2009-08-17 13:02                   ` Manfred Spraul
@ 2009-08-17 13:10                     ` Nick Piggin
  0 siblings, 0 replies; 13+ messages in thread
From: Nick Piggin @ 2009-08-17 13:10 UTC (permalink / raw)
  To: Manfred Spraul; +Cc: Andrew Morton, Nadia Derbey, Pierre Peiffer, linux-kernel

On Mon, Aug 17, 2009 at 03:02:48PM +0200, Manfred Spraul wrote:
> On 08/17/2009 08:44 AM, Nick Piggin wrote:
> >OK, well let's just get something in.
> >   
> Good, I would propose the that the following 7 patches should be merged:
> 
> http://lkml.org/lkml/2009/8/11/59
> http://lkml.org/lkml/2009/8/11/9
> http://lkml.org/lkml/2009/8/11/14
> http://lkml.org/lkml/2009/8/15/163
> http://lkml.org/lkml/2009/8/15/164
> http://lkml.org/lkml/2009/8/15/167
> http://lkml.org/lkml/2009/8/15/168
> 
> Nick: is that ok from your point of view?

Well I don't know if they need to be broken up so much... the complex
list one is just broken out of my patch, no? I don't think it really
is more reviewable if you just add it without doing anything to it...
but whatever.

 
> >That said, I'm not too unhappy with your version if you feel strongly
> >about it.
> I would prefer it:
> We simply don't know if a wait-for-zero list, only for single sop 
> operations, is a step in the right direction.
> 
> Postgres uses single sop operations with just +-1 on one semaphore.
> You wrote that your SAP workload also uses +-1.
> According to google codesearch, apache, mozilla, mpich all use +-1.
> 
> Thus: Who uses single sop, wait for zero?

Oracle. Arguably it is also better behaviour for fairness to wake in
FIFO order too in that case.

> I'm just afraid that we optimize for the wrong case.

My point is that there is very little downside, and it is actually
going via simpler code paths (and less icache). So I think it makes
sense, but anyway.


^ permalink raw reply	[flat|nested] 13+ messages in thread

end of thread, other threads:[~2009-08-17 13:10 UTC | newest]

Thread overview: 13+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2009-08-14 19:16 [PATCH] [patch 4a/4] ipc: sem optimise simple operations Manfred Spraul
2009-08-15  4:52 ` Nick Piggin
2009-08-15 10:10   ` Manfred Spraul
2009-08-15 10:38     ` Nick Piggin
     [not found]       ` <4A86ABF0.2070207@colorfullife.com>
2009-08-15 14:49         ` Nick Piggin
2009-08-15 16:32           ` Manfred Spraul
2009-08-16  4:53             ` Nick Piggin
2009-08-16  5:12               ` Nick Piggin
2009-08-16 10:31             ` Nick Piggin
2009-08-16 11:29               ` Manfred Spraul
2009-08-17  6:44                 ` Nick Piggin
2009-08-17 13:02                   ` Manfred Spraul
2009-08-17 13:10                     ` Nick Piggin

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.