linux-kernel.vger.kernel.org archive mirror
* [tbench regression fixes]: digging out smelly deadmen.
@ 2008-10-09 23:17 Evgeniy Polyakov
  2008-10-10  5:40 ` Peter Zijlstra
                   ` (2 more replies)
  0 siblings, 3 replies; 94+ messages in thread
From: Evgeniy Polyakov @ 2008-10-09 23:17 UTC (permalink / raw)
  To: netdev; +Cc: linux-kernel, Peter Zijlstra, Ingo Molnar, David Miller

[-- Attachment #1: Type: text/plain, Size: 2472 bytes --]


Hi.

It was reported recently that tbench has a long history of regressions,
starting at least from the 2.6.23 kernel. I verified that in my test
environment tbench 'lost' more than 100 MB/s, from 470 down to 355,
between at least 2.6.24 and 2.6.27. The 2.6.26-2.6.27 performance regression
on my machines roughly corresponds to 375 down to 355 MB/s.

I spent several days on various tests and bisections (unfortunately
bisect cannot always point to the 'right' commit), and found the following
problems.

First, related to the network, as lots of people expected: TSO/GSO over
loopback with the tbench workload eats about 5-10 MB/s, since the TSO/GSO frame
creation overhead is not repaid by the optimized super-frame processing
gains. Since it brings a really impressive improvement in big-packet
workloads, it was (likely) decided not to add a patch for this; instead
one can disable TSO/GSO via ethtool. This change was added in the
2.6.27 window, so it has its part in that regression.
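
For reference, turning those offloads off for the loopback case would be
something like this (assuming an ethtool recent enough to expose the tso/gso
knobs; 'ethtool -k lo' shows the current offload state):

  ethtool -K lo tso off gso off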

The second part of the 2.6.26-2.6.27 window regression (about 20
MB/s, as a reminder) is related to the scheduler changes, which another
group of people expected. I tracked it down to the
a7be37ac8e1565e00880531f4e2aff421a21c803 commit, which, if
reverted, returns 2.6.27 tbench performance to the highest (for
2.6.26-2.6.27) 365 MB/s mark. I also tested a tree stopped at the above
commit itself, i.e. not 2.6.27, and got 373 MB/s, so other changes in that
merge likely ate a couple of megabytes. A patch against 2.6.27 is attached.

A curious reader may ask where we lost the other 100 MB/s. This small
issue was not detected (or at least not reported in netdev@ with a provocative
enough subject), and it happens to live somewhere in the 2.6.24-2.6.25 changes.
I was lucky enough to 'guess' (after just a couple of hundred compilations)
that it corresponds to the 8f4d37ec073c17e2d4aa8851df5837d798606d6f commit about
high-resolution timers; the attached patch against 2.6.25 brings tbench
performance for the 2.6.25 kernel tree to 455 MB/s.

There are still about 20 MB/s missing, but 2.6.24 has 475 MB/s, so the
bug likely lives between 2.6.24 and the above 8f4d37ec073 commit.

I can test your patches for the 2.6.27 tree tomorrow (the most interesting
attached one does not apply cleanly to the current tree); it is
past 3 A.M. in Moscow.

P.S. I'm not currently subscribed to any of the mentioned lists (and am writing
from a long-unused email address), so I could not find the thread with the
appropriate subject and reply into it.

-- 
	Evgeniy Polyakov

[-- Attachment #2: return-10mb-2.6.27.diff --]
[-- Type: text/x-diff, Size: 5994 bytes --]

diff --git a/kernel/sched.c b/kernel/sched.c
index 13dd2db..70eb173 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1243,9 +1243,6 @@ static void resched_task(struct task_struct *p)
  */
 #define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y))
 
-/*
- * delta *= weight / lw
- */
 static unsigned long
 calc_delta_mine(unsigned long delta_exec, unsigned long weight,
 		struct load_weight *lw)
@@ -1273,6 +1270,12 @@ calc_delta_mine(unsigned long delta_exec, unsigned long weight,
 	return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX);
 }
 
+static inline unsigned long
+calc_delta_fair(unsigned long delta_exec, struct load_weight *lw)
+{
+	return calc_delta_mine(delta_exec, NICE_0_LOAD, lw);
+}
+
 static inline void update_load_add(struct load_weight *lw, unsigned long inc)
 {
 	lw->weight += inc;
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index fb8994c..3597a3c 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -334,34 +334,6 @@ int sched_nr_latency_handler(struct ctl_table *table, int write,
 #endif
 
 /*
- * delta *= w / rw
- */
-static inline unsigned long
-calc_delta_weight(unsigned long delta, struct sched_entity *se)
-{
-	for_each_sched_entity(se) {
-		delta = calc_delta_mine(delta,
-				se->load.weight, &cfs_rq_of(se)->load);
-	}
-
-	return delta;
-}
-
-/*
- * delta *= rw / w
- */
-static inline unsigned long
-calc_delta_fair(unsigned long delta, struct sched_entity *se)
-{
-	for_each_sched_entity(se) {
-		delta = calc_delta_mine(delta,
-				cfs_rq_of(se)->load.weight, &se->load);
-	}
-
-	return delta;
-}
-
-/*
  * The idea is to set a period in which each task runs once.
  *
  * When there are too many tasks (sysctl_sched_nr_latency) we have to stretch
@@ -390,80 +362,47 @@ static u64 __sched_period(unsigned long nr_running)
  */
 static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
-	return calc_delta_weight(__sched_period(cfs_rq->nr_running), se);
+	u64 slice = __sched_period(cfs_rq->nr_running);
+
+	for_each_sched_entity(se) {
+		cfs_rq = cfs_rq_of(se);
+
+		slice *= se->load.weight;
+		do_div(slice, cfs_rq->load.weight);
+	}
+
+
+	return slice;
 }
 
 /*
  * We calculate the vruntime slice of a to be inserted task
  *
- * vs = s*rw/w = p
+ * vs = s/w = p/rw
  */
 static u64 sched_vslice_add(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
 	unsigned long nr_running = cfs_rq->nr_running;
+	unsigned long weight;
+	u64 vslice;
 
 	if (!se->on_rq)
 		nr_running++;
 
-	return __sched_period(nr_running);
-}
-
-/*
- * The goal of calc_delta_asym() is to be asymmetrically around NICE_0_LOAD, in
- * that it favours >=0 over <0.
- *
- *   -20         |
- *               |
- *     0 --------+-------
- *             .'
- *    19     .'
- *
- */
-static unsigned long
-calc_delta_asym(unsigned long delta, struct sched_entity *se)
-{
-	struct load_weight lw = {
-		.weight = NICE_0_LOAD,
-		.inv_weight = 1UL << (WMULT_SHIFT-NICE_0_SHIFT)
-	};
+	vslice = __sched_period(nr_running);
 
 	for_each_sched_entity(se) {
-		struct load_weight *se_lw = &se->load;
-		unsigned long rw = cfs_rq_of(se)->load.weight;
-
-#ifdef CONFIG_FAIR_SCHED_GROUP
-		struct cfs_rq *cfs_rq = se->my_q;
-		struct task_group *tg = NULL
-
-		if (cfs_rq)
-			tg = cfs_rq->tg;
-
-		if (tg && tg->shares < NICE_0_LOAD) {
-			/*
-			 * scale shares to what it would have been had
-			 * tg->weight been NICE_0_LOAD:
-			 *
-			 *   weight = 1024 * shares / tg->weight
-			 */
-			lw.weight *= se->load.weight;
-			lw.weight /= tg->shares;
-
-			lw.inv_weight = 0;
-
-			se_lw = &lw;
-			rw += lw.weight - se->load.weight;
-		} else
-#endif
+		cfs_rq = cfs_rq_of(se);
 
-		if (se->load.weight < NICE_0_LOAD) {
-			se_lw = &lw;
-			rw += NICE_0_LOAD - se->load.weight;
-		}
+		weight = cfs_rq->load.weight;
+		if (!se->on_rq)
+			weight += se->load.weight;
 
-		delta = calc_delta_mine(delta, rw, se_lw);
+		vslice *= NICE_0_LOAD;
+		do_div(vslice, weight);
 	}
 
-	return delta;
+	return vslice;
 }
 
 /*
@@ -480,7 +419,11 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr,
 
 	curr->sum_exec_runtime += delta_exec;
 	schedstat_add(cfs_rq, exec_clock, delta_exec);
-	delta_exec_weighted = calc_delta_fair(delta_exec, curr);
+	delta_exec_weighted = delta_exec;
+	if (unlikely(curr->load.weight != NICE_0_LOAD)) {
+		delta_exec_weighted = calc_delta_fair(delta_exec_weighted,
+							&curr->load);
+	}
 	curr->vruntime += delta_exec_weighted;
 }
 
@@ -687,17 +630,8 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
 
 	if (!initial) {
 		/* sleeps upto a single latency don't count. */
-		if (sched_feat(NEW_FAIR_SLEEPERS)) {
-			unsigned long thresh = sysctl_sched_latency;
-
-			/*
-			 * convert the sleeper threshold into virtual time
-			 */
-			if (sched_feat(NORMALIZED_SLEEPER))
-				thresh = calc_delta_fair(thresh, se);
-
-			vruntime -= thresh;
-		}
+		if (sched_feat(NEW_FAIR_SLEEPERS))
+			vruntime -= sysctl_sched_latency;
 
 		/* ensure we never gain time by being placed backwards. */
 		vruntime = max_vruntime(se->vruntime, vruntime);
@@ -1277,13 +1211,11 @@ static unsigned long wakeup_gran(struct sched_entity *se)
 	unsigned long gran = sysctl_sched_wakeup_granularity;
 
 	/*
-	 * More easily preempt - nice tasks, while not making it harder for
-	 * + nice tasks.
+	 * More easily preempt - nice tasks, while not making
+	 * it harder for + nice tasks.
 	 */
-	if (sched_feat(ASYM_GRAN))
-		gran = calc_delta_asym(sysctl_sched_wakeup_granularity, se);
-	else
-		gran = calc_delta_fair(sysctl_sched_wakeup_granularity, se);
+	if (unlikely(se->load.weight > NICE_0_LOAD))
+		gran = calc_delta_fair(gran, &se->load);
 
 	return gran;
 }
diff --git a/kernel/sched_features.h b/kernel/sched_features.h
index 9353ca7..34ef70f 100644
--- a/kernel/sched_features.h
+++ b/kernel/sched_features.h
@@ -1,5 +1,4 @@
 SCHED_FEAT(NEW_FAIR_SLEEPERS, 1)
-SCHED_FEAT(NORMALIZED_SLEEPER, 1)
 SCHED_FEAT(WAKEUP_PREEMPT, 1)
 SCHED_FEAT(START_DEBIT, 1)
 SCHED_FEAT(AFFINE_WAKEUPS, 1)

[-- Attachment #3: return-80mb-2.6.25.diff --]
[-- Type: text/x-diff, Size: 17898 bytes --]

diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index c20c9e7..e06ecab 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -295,8 +295,8 @@ sysret_careful:
 	/* Handle a signal */ 
 sysret_signal:
 	TRACE_IRQS_ON
-	ENABLE_INTERRUPTS(CLBR_NONE)
-	testl $_TIF_DO_NOTIFY_MASK,%edx
+	sti
+	testl $(_TIF_SIGPENDING|_TIF_SINGLESTEP|_TIF_MCE_NOTIFY),%edx
 	jz    1f
 
 	/* Really a signal */
@@ -390,7 +390,7 @@ int_very_careful:
 	jmp int_restore_rest
 	
 int_signal:
-	testl $_TIF_DO_NOTIFY_MASK,%edx
+	testl $(_TIF_SIGPENDING|_TIF_SINGLESTEP|_TIF_MCE_NOTIFY),%edx
 	jz 1f
 	movq %rsp,%rdi		# &ptregs -> arg1
 	xorl %esi,%esi		# oldset -> arg2
@@ -637,7 +637,7 @@ retint_careful:
 	jmp retint_check
 	
 retint_signal:
-	testl $_TIF_DO_NOTIFY_MASK,%edx
+	testl $(_TIF_SIGPENDING|_TIF_SINGLESTEP|_TIF_MCE_NOTIFY),%edx
 	jz    retint_swapgs
 	TRACE_IRQS_ON
 	ENABLE_INTERRUPTS(CLBR_NONE)
diff --git a/arch/x86/kernel/signal_32.c b/arch/x86/kernel/signal_32.c
index 0157a6f..1b085d2 100644
--- a/arch/x86/kernel/signal_32.c
+++ b/arch/x86/kernel/signal_32.c
@@ -667,9 +667,6 @@ void do_notify_resume(struct pt_regs *regs, void *_unused,
 	/* deal with pending signal delivery */
 	if (thread_info_flags & (_TIF_SIGPENDING | _TIF_RESTORE_SIGMASK))
 		do_signal(regs);
-
-	if (thread_info_flags & _TIF_HRTICK_RESCHED)
-		hrtick_resched();
 	
 	clear_thread_flag(TIF_IRET);
 }
diff --git a/arch/x86/kernel/signal_64.c b/arch/x86/kernel/signal_64.c
index 1c83e51..9691bb8 100644
--- a/arch/x86/kernel/signal_64.c
+++ b/arch/x86/kernel/signal_64.c
@@ -504,9 +504,6 @@ do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags)
 	/* deal with pending signal delivery */
 	if (thread_info_flags & (_TIF_SIGPENDING|_TIF_RESTORE_SIGMASK))
 		do_signal(regs);
-
-	if (thread_info_flags & _TIF_HRTICK_RESCHED)
-		hrtick_resched();
 }
 
 void signal_fault(struct pt_regs *regs, void __user *frame, char *where)
diff --git a/include/asm-x86/thread_info_32.h b/include/asm-x86/thread_info_32.h
index 5bd5082..0514e3b 100644
--- a/include/asm-x86/thread_info_32.h
+++ b/include/asm-x86/thread_info_32.h
@@ -132,7 +132,6 @@ static inline struct thread_info *current_thread_info(void)
 #define TIF_SYSCALL_AUDIT	6	/* syscall auditing active */
 #define TIF_SECCOMP		7	/* secure computing */
 #define TIF_RESTORE_SIGMASK	8	/* restore signal mask in do_signal() */
-#define TIF_HRTICK_RESCHED	9	/* reprogram hrtick timer */
 #define TIF_MEMDIE		16
 #define TIF_DEBUG		17	/* uses debug registers */
 #define TIF_IO_BITMAP		18	/* uses I/O bitmap */
@@ -152,7 +151,6 @@ static inline struct thread_info *current_thread_info(void)
 #define _TIF_SYSCALL_AUDIT	(1<<TIF_SYSCALL_AUDIT)
 #define _TIF_SECCOMP		(1<<TIF_SECCOMP)
 #define _TIF_RESTORE_SIGMASK	(1<<TIF_RESTORE_SIGMASK)
-#define _TIF_HRTICK_RESCHED	(1<<TIF_HRTICK_RESCHED)
 #define _TIF_DEBUG		(1<<TIF_DEBUG)
 #define _TIF_IO_BITMAP		(1<<TIF_IO_BITMAP)
 #define _TIF_FREEZE		(1<<TIF_FREEZE)
diff --git a/include/asm-x86/thread_info_64.h b/include/asm-x86/thread_info_64.h
index 6c9b214..35ec680 100644
--- a/include/asm-x86/thread_info_64.h
+++ b/include/asm-x86/thread_info_64.h
@@ -112,7 +112,6 @@ static inline struct thread_info *stack_thread_info(void)
 #define TIF_SECCOMP		8	/* secure computing */
 #define TIF_RESTORE_SIGMASK	9	/* restore signal mask in do_signal */
 #define TIF_MCE_NOTIFY		10	/* notify userspace of an MCE */
-#define TIF_HRTICK_RESCHED	11	/* reprogram hrtick timer */
 /* 16 free */
 #define TIF_IA32		17	/* 32bit process */ 
 #define TIF_FORK		18	/* ret_from_fork */
@@ -135,7 +134,6 @@ static inline struct thread_info *stack_thread_info(void)
 #define _TIF_SECCOMP		(1<<TIF_SECCOMP)
 #define _TIF_RESTORE_SIGMASK	(1<<TIF_RESTORE_SIGMASK)
 #define _TIF_MCE_NOTIFY		(1<<TIF_MCE_NOTIFY)
-#define _TIF_HRTICK_RESCHED	(1<<TIF_HRTICK_RESCHED)
 #define _TIF_IA32		(1<<TIF_IA32)
 #define _TIF_FORK		(1<<TIF_FORK)
 #define _TIF_ABI_PENDING	(1<<TIF_ABI_PENDING)
@@ -153,9 +151,6 @@ static inline struct thread_info *stack_thread_info(void)
 /* work to do on any return to user space */
 #define _TIF_ALLWORK_MASK (0x0000FFFF & ~_TIF_SECCOMP)
 
-#define _TIF_DO_NOTIFY_MASK \
-	(_TIF_SIGPENDING|_TIF_SINGLESTEP|_TIF_MCE_NOTIFY|_TIF_HRTICK_RESCHED)
-
 /* flags to check in __switch_to() */
 #define _TIF_WORK_CTXSW \
     (_TIF_IO_BITMAP|_TIF_DEBUGCTLMSR|_TIF_DS_AREA_MSR|_TIF_BTS_TRACE_TS)
diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h
index 1ad56a7..386600e 100644
--- a/include/linux/hrtimer.h
+++ b/include/linux/hrtimer.h
@@ -214,11 +214,6 @@ static inline ktime_t hrtimer_cb_get_time(struct hrtimer *timer)
 	return timer->base->get_time();
 }
 
-static inline int hrtimer_is_hres_active(struct hrtimer *timer)
-{
-	return timer->base->cpu_base->hres_active;
-}
-
 /*
  * The resolution of the clocks. The resolution value is returned in
  * the clock_getres() system call to give application programmers an
@@ -253,10 +248,6 @@ static inline ktime_t hrtimer_cb_get_time(struct hrtimer *timer)
 	return timer->base->softirq_time;
 }
 
-static inline int hrtimer_is_hres_active(struct hrtimer *timer)
-{
-	return 0;
-}
 #endif
 
 extern ktime_t ktime_get(void);
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 6a1e7af..7b7905d 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -285,7 +285,6 @@ extern void trap_init(void);
 extern void account_process_tick(struct task_struct *task, int user);
 extern void update_process_times(int user);
 extern void scheduler_tick(void);
-extern void hrtick_resched(void);
 
 extern void sched_show_task(struct task_struct *p);
 
@@ -887,7 +886,7 @@ struct sched_class {
 #endif
 
 	void (*set_curr_task) (struct rq *rq);
-	void (*task_tick) (struct rq *rq, struct task_struct *p, int queued);
+	void (*task_tick) (struct rq *rq, struct task_struct *p);
 	void (*task_new) (struct rq *rq, struct task_struct *p);
 	void (*set_cpus_allowed)(struct task_struct *p, cpumask_t *newmask);
 
diff --git a/kernel/Kconfig.hz b/kernel/Kconfig.hz
index 526128a..4af1580 100644
--- a/kernel/Kconfig.hz
+++ b/kernel/Kconfig.hz
@@ -54,5 +54,3 @@ config HZ
 	default 300 if HZ_300
 	default 1000 if HZ_1000
 
-config SCHED_HRTICK
-	def_bool HIGH_RES_TIMERS && X86
diff --git a/kernel/sched.c b/kernel/sched.c
index 8dcdec6..c481937 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -65,7 +65,6 @@
 #include <linux/reciprocal_div.h>
 #include <linux/unistd.h>
 #include <linux/pagemap.h>
-#include <linux/hrtimer.h>
 
 #include <asm/tlb.h>
 #include <asm/irq_regs.h>
@@ -452,12 +451,6 @@ struct rq {
 	struct list_head migration_queue;
 #endif
 
-#ifdef CONFIG_SCHED_HRTICK
-	unsigned long hrtick_flags;
-	ktime_t hrtick_expire;
-	struct hrtimer hrtick_timer;
-#endif
-
 #ifdef CONFIG_SCHEDSTATS
 	/* latency stats */
 	struct sched_info rq_sched_info;
@@ -594,16 +587,16 @@ enum {
 	SCHED_FEAT_NEW_FAIR_SLEEPERS	= 1,
 	SCHED_FEAT_WAKEUP_PREEMPT	= 2,
 	SCHED_FEAT_START_DEBIT		= 4,
-	SCHED_FEAT_HRTICK		= 8,
-	SCHED_FEAT_DOUBLE_TICK		= 16,
+	SCHED_FEAT_TREE_AVG		= 8,
+	SCHED_FEAT_APPROX_AVG		= 16,
 };
 
 const_debug unsigned int sysctl_sched_features =
 		SCHED_FEAT_NEW_FAIR_SLEEPERS	* 1 |
 		SCHED_FEAT_WAKEUP_PREEMPT	* 1 |
 		SCHED_FEAT_START_DEBIT		* 1 |
-		SCHED_FEAT_HRTICK		* 1 |
-		SCHED_FEAT_DOUBLE_TICK		* 0;
+		SCHED_FEAT_TREE_AVG		* 0 |
+		SCHED_FEAT_APPROX_AVG		* 0;
 
 #define sched_feat(x) (sysctl_sched_features & SCHED_FEAT_##x)
 
@@ -841,173 +834,6 @@ void sched_clock_idle_wakeup_event(u64 delta_ns)
 }
 EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event);
 
-static void __resched_task(struct task_struct *p, int tif_bit);
-
-static inline void resched_task(struct task_struct *p)
-{
-	__resched_task(p, TIF_NEED_RESCHED);
-}
-
-#ifdef CONFIG_SCHED_HRTICK
-/*
- * Use HR-timers to deliver accurate preemption points.
- *
- * Its all a bit involved since we cannot program an hrt while holding the
- * rq->lock. So what we do is store a state in in rq->hrtick_* and ask for a
- * reschedule event.
- *
- * When we get rescheduled we reprogram the hrtick_timer outside of the
- * rq->lock.
- */
-static inline void resched_hrt(struct task_struct *p)
-{
-	__resched_task(p, TIF_HRTICK_RESCHED);
-}
-
-static inline void resched_rq(struct rq *rq)
-{
-	unsigned long flags;
-
-	spin_lock_irqsave(&rq->lock, flags);
-	resched_task(rq->curr);
-	spin_unlock_irqrestore(&rq->lock, flags);
-}
-
-enum {
-	HRTICK_SET,		/* re-programm hrtick_timer */
-	HRTICK_RESET,		/* not a new slice */
-};
-
-/*
- * Use hrtick when:
- *  - enabled by features
- *  - hrtimer is actually high res
- */
-static inline int hrtick_enabled(struct rq *rq)
-{
-	if (!sched_feat(HRTICK))
-		return 0;
-	return hrtimer_is_hres_active(&rq->hrtick_timer);
-}
-
-/*
- * Called to set the hrtick timer state.
- *
- * called with rq->lock held and irqs disabled
- */
-static void hrtick_start(struct rq *rq, u64 delay, int reset)
-{
-	assert_spin_locked(&rq->lock);
-
-	/*
-	 * preempt at: now + delay
-	 */
-	rq->hrtick_expire =
-		ktime_add_ns(rq->hrtick_timer.base->get_time(), delay);
-	/*
-	 * indicate we need to program the timer
-	 */
-	__set_bit(HRTICK_SET, &rq->hrtick_flags);
-	if (reset)
-		__set_bit(HRTICK_RESET, &rq->hrtick_flags);
-
-	/*
-	 * New slices are called from the schedule path and don't need a
-	 * forced reschedule.
-	 */
-	if (reset)
-		resched_hrt(rq->curr);
-}
-
-static void hrtick_clear(struct rq *rq)
-{
-	if (hrtimer_active(&rq->hrtick_timer))
-		hrtimer_cancel(&rq->hrtick_timer);
-}
-
-/*
- * Update the timer from the possible pending state.
- */
-static void hrtick_set(struct rq *rq)
-{
-	ktime_t time;
-	int set, reset;
-	unsigned long flags;
-
-	WARN_ON_ONCE(cpu_of(rq) != smp_processor_id());
-
-	spin_lock_irqsave(&rq->lock, flags);
-	set = __test_and_clear_bit(HRTICK_SET, &rq->hrtick_flags);
-	reset = __test_and_clear_bit(HRTICK_RESET, &rq->hrtick_flags);
-	time = rq->hrtick_expire;
-	clear_thread_flag(TIF_HRTICK_RESCHED);
-	spin_unlock_irqrestore(&rq->lock, flags);
-
-	if (set) {
-		hrtimer_start(&rq->hrtick_timer, time, HRTIMER_MODE_ABS);
-		if (reset && !hrtimer_active(&rq->hrtick_timer))
-			resched_rq(rq);
-	} else
-		hrtick_clear(rq);
-}
-
-/*
- * High-resolution timer tick.
- * Runs from hardirq context with interrupts disabled.
- */
-static enum hrtimer_restart hrtick(struct hrtimer *timer)
-{
-	struct rq *rq = container_of(timer, struct rq, hrtick_timer);
-
-	WARN_ON_ONCE(cpu_of(rq) != smp_processor_id());
-
-	spin_lock(&rq->lock);
-	__update_rq_clock(rq);
-	rq->curr->sched_class->task_tick(rq, rq->curr, 1);
-	spin_unlock(&rq->lock);
-
-	return HRTIMER_NORESTART;
-}
-
-static inline void init_rq_hrtick(struct rq *rq)
-{
-	rq->hrtick_flags = 0;
-	hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
-	rq->hrtick_timer.function = hrtick;
-	rq->hrtick_timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ;
-}
-
-void hrtick_resched(void)
-{
-	struct rq *rq;
-	unsigned long flags;
-
-	if (!test_thread_flag(TIF_HRTICK_RESCHED))
-		return;
-
-	local_irq_save(flags);
-	rq = cpu_rq(smp_processor_id());
-	hrtick_set(rq);
-	local_irq_restore(flags);
-}
-#else
-static inline void hrtick_clear(struct rq *rq)
-{
-}
-
-static inline void hrtick_set(struct rq *rq)
-{
-}
-
-static inline void init_rq_hrtick(struct rq *rq)
-{
-}
-
-void hrtick_resched(void)
-{
-}
-#endif
-
 /*
  * resched_task - mark a task 'to be rescheduled now'.
  *
@@ -1021,16 +847,16 @@ void hrtick_resched(void)
 #define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG)
 #endif
 
-static void __resched_task(struct task_struct *p, int tif_bit)
+static void resched_task(struct task_struct *p)
 {
 	int cpu;
 
 	assert_spin_locked(&task_rq(p)->lock);
 
-	if (unlikely(test_tsk_thread_flag(p, tif_bit)))
+	if (unlikely(test_tsk_thread_flag(p, TIF_NEED_RESCHED)))
 		return;
 
-	set_tsk_thread_flag(p, tif_bit);
+	set_tsk_thread_flag(p, TIF_NEED_RESCHED);
 
 	cpu = task_cpu(p);
 	if (cpu == smp_processor_id())
@@ -1096,10 +922,10 @@ void wake_up_idle_cpu(int cpu)
 #endif
 
 #else
-static void __resched_task(struct task_struct *p, int tif_bit)
+static inline void resched_task(struct task_struct *p)
 {
 	assert_spin_locked(&task_rq(p)->lock);
-	set_tsk_thread_flag(p, tif_bit);
+	set_tsk_need_resched(p);
 }
 #endif
 
@@ -3766,8 +3592,8 @@ void scheduler_tick(void)
 	}
 	rq->tick_timestamp = rq->clock;
 	update_cpu_load(rq);
-	curr->sched_class->task_tick(rq, curr, 0);
-	update_sched_rt_period(rq);
+	if (curr != rq->idle) /* FIXME: needed? */
+		curr->sched_class->task_tick(rq, curr);
 	spin_unlock(&rq->lock);
 
 #ifdef CONFIG_SMP
@@ -3913,8 +3739,6 @@ need_resched_nonpreemptible:
 
 	schedule_debug(prev);
 
-	hrtick_clear(rq);
-
 	/*
 	 * Do the rq-clock update outside the rq lock:
 	 */
@@ -3952,20 +3776,14 @@ need_resched_nonpreemptible:
 		++*switch_count;
 
 		context_switch(rq, prev, next); /* unlocks the rq */
-		/*
-		 * the context switch might have flipped the stack from under
-		 * us, hence refresh the local variables.
-		 */
-		cpu = smp_processor_id();
-		rq = cpu_rq(cpu);
 	} else
 		spin_unlock_irq(&rq->lock);
 
-	hrtick_set(rq);
-
-	if (unlikely(reacquire_kernel_lock(current) < 0))
+	if (unlikely(reacquire_kernel_lock(current) < 0)) {
+		cpu = smp_processor_id();
+		rq = cpu_rq(cpu);
 		goto need_resched_nonpreemptible;
-
+	}
 	preempt_enable_no_resched();
 	if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))
 		goto need_resched;
@@ -7248,7 +7066,6 @@ void __init sched_init(void)
 		INIT_LIST_HEAD(&rq->migration_queue);
 		rq_attach_root(rq, &def_root_domain);
 #endif
-		init_rq_hrtick(rq);
 		atomic_set(&rq->nr_iowait, 0);
 		highest_cpu = i;
 	}
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 0080968..14a5af5 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -677,29 +677,13 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
 	cfs_rq->curr = NULL;
 }
 
-static void
-entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
+static void entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
 {
 	/*
 	 * Update run-time statistics of the 'current'.
 	 */
 	update_curr(cfs_rq);
 
-#ifdef CONFIG_SCHED_HRTICK
-	/*
-	 * queued ticks are scheduled to match the slice, so don't bother
-	 * validating it and just reschedule.
-	 */
-	if (queued)
-		return resched_task(rq_of(cfs_rq)->curr);
-	/*
-	 * don't let the period tick interfere with the hrtick preemption
-	 */
-	if (!sched_feat(DOUBLE_TICK) &&
-			hrtimer_active(&rq_of(cfs_rq)->hrtick_timer))
-		return;
-#endif
-
 	if (cfs_rq->nr_running > 1 || !sched_feat(WAKEUP_PREEMPT))
 		check_preempt_tick(cfs_rq, curr);
 }
@@ -803,43 +787,6 @@ static inline struct sched_entity *parent_entity(struct sched_entity *se)
 
 #endif	/* CONFIG_FAIR_GROUP_SCHED */
 
-#ifdef CONFIG_SCHED_HRTICK
-static void hrtick_start_fair(struct rq *rq, struct task_struct *p)
-{
-	int requeue = rq->curr == p;
-	struct sched_entity *se = &p->se;
-	struct cfs_rq *cfs_rq = cfs_rq_of(se);
-
-	WARN_ON(task_rq(p) != rq);
-
-	if (hrtick_enabled(rq) && cfs_rq->nr_running > 1) {
-		u64 slice = sched_slice(cfs_rq, se);
-		u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime;
-		s64 delta = slice - ran;
-
-		if (delta < 0) {
-			if (rq->curr == p)
-				resched_task(p);
-			return;
-		}
-
-		/*
-		 * Don't schedule slices shorter than 10000ns, that just
-		 * doesn't make sense. Rely on vruntime for fairness.
-		 */
-		if (!requeue)
-			delta = max(10000LL, delta);
-
-		hrtick_start(rq, delta, requeue);
-	}
-}
-#else
-static inline void
-hrtick_start_fair(struct rq *rq, struct task_struct *p)
-{
-}
-#endif
-
 /*
  * The enqueue_task method is called before nr_running is
  * increased. Here we update the fair scheduling stats and
@@ -857,8 +804,6 @@ static void enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup)
 		enqueue_entity(cfs_rq, se, wakeup);
 		wakeup = 1;
 	}
-
-	hrtick_start_fair(rq, rq->curr);
 }
 
 /*
@@ -879,8 +824,6 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int sleep)
 			break;
 		sleep = 1;
 	}
-
-	hrtick_start_fair(rq, rq->curr);
 }
 
 /*
@@ -1152,7 +1095,6 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p)
 
 static struct task_struct *pick_next_task_fair(struct rq *rq)
 {
-	struct task_struct *p;
 	struct cfs_rq *cfs_rq = &rq->cfs;
 	struct sched_entity *se;
 
@@ -1164,10 +1106,7 @@ static struct task_struct *pick_next_task_fair(struct rq *rq)
 		cfs_rq = group_cfs_rq(se);
 	} while (cfs_rq);
 
-	p = task_of(se);
-	hrtick_start_fair(rq, p);
-
-	return p;
+	return task_of(se);
 }
 
 /*
@@ -1322,14 +1261,14 @@ move_one_task_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
 /*
  * scheduler tick hitting a task of our scheduling class:
  */
-static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
+static void task_tick_fair(struct rq *rq, struct task_struct *curr)
 {
 	struct cfs_rq *cfs_rq;
 	struct sched_entity *se = &curr->se;
 
 	for_each_sched_entity(se) {
 		cfs_rq = cfs_rq_of(se);
-		entity_tick(cfs_rq, se, queued);
+		entity_tick(cfs_rq, se);
 	}
 }
 
diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c
index 2bcafa3..ef7a266 100644
--- a/kernel/sched_idletask.c
+++ b/kernel/sched_idletask.c
@@ -61,7 +61,7 @@ move_one_task_idle(struct rq *this_rq, int this_cpu, struct rq *busiest,
 }
 #endif
 
-static void task_tick_idle(struct rq *rq, struct task_struct *curr, int queued)
+static void task_tick_idle(struct rq *rq, struct task_struct *curr)
 {
 }
 
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 0a6d2e5..7f67b1a 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -1149,7 +1149,7 @@ static void watchdog(struct rq *rq, struct task_struct *p)
 	}
 }
 
-static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued)
+static void task_tick_rt(struct rq *rq, struct task_struct *p)
 {
 	update_curr_rt(rq);
 


* Re: [tbench regression fixes]: digging out smelly deadmen.
  2008-10-09 23:17 [tbench regression fixes]: digging out smelly deadmen Evgeniy Polyakov
@ 2008-10-10  5:40 ` Peter Zijlstra
  2008-10-10  8:09 ` Evgeniy Polyakov
  2008-10-10 10:13 ` [tbench regression fixes]: digging out smelly deadmen Mike Galbraith
  2 siblings, 0 replies; 94+ messages in thread
From: Peter Zijlstra @ 2008-10-10  5:40 UTC (permalink / raw)
  To: Evgeniy Polyakov; +Cc: netdev, linux-kernel, Ingo Molnar, David Miller

On Fri, 2008-10-10 at 03:17 +0400, Evgeniy Polyakov wrote:

> I was lucky enough to 'guess' (after just a couple of hundred compilations)
> that it corresponds to the 8f4d37ec073c17e2d4aa8851df5837d798606d6f commit about
> high-resolution timers; the attached patch against 2.6.25 brings tbench
> performance for the 2.6.25 kernel tree to 455 MB/s.

can you try

echo NO_HRTICK > /debug/sched_features

on .27-like kernels?
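
(This assumes debugfs is mounted at /debug; if it is not, something like

  mount -t debugfs none /debug
  cat /debug/sched_features

should mount it and show which features are currently enabled.)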

Also, what clocksource do those machines use?

cat /sys/devices/system/clocksource/clocksource0/current_clocksource

As to a7be37ac8e1565e00880531f4e2aff421a21c803, could you try
tip/master? I reworked some of the wakeup preemption code in there.

Thanks for looking into this issue!



* Re: [tbench regression fixes]: digging out smelly deadmen.
  2008-10-09 23:17 [tbench regression fixes]: digging out smelly deadmen Evgeniy Polyakov
  2008-10-10  5:40 ` Peter Zijlstra
@ 2008-10-10  8:09 ` Evgeniy Polyakov
  2008-10-10  9:15   ` Ingo Molnar
  2008-10-10 10:13 ` [tbench regression fixes]: digging out smelly deadmen Mike Galbraith
  2 siblings, 1 reply; 94+ messages in thread
From: Evgeniy Polyakov @ 2008-10-10  8:09 UTC (permalink / raw)
  To: Peter Zijlstra; +Cc: linux-kernel, netdev, Ingo Molnar, David Miller

Hi Peter.

I've enabled the kernel hacking option and scheduler debugging, turned
off hrticks, and performance jumped to 382 MB/s:

vanilla 27: 347.222
no TSO/GSO: 357.331
no hrticks: 382.983

I use the tsc clocksource; acpi_pm and jiffies are also available.
With acpi_pm performance is even lower (I stopped the test after it dropped
below the 340 MB/s mark), and jiffies do not work at all: it looks like sockets
get stuck in the time_wait state when this clocksource is used, although that
may be a different issue.
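
For reference, switching between these clocksources at runtime goes through
the usual sysfs knobs, e.g. something like:

  cat /sys/devices/system/clocksource/clocksource0/available_clocksource
  echo acpi_pm > /sys/devices/system/clocksource/clocksource0/current_clocksource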

So I think hrticks are guilty, but this is still not as good as the .25 tree
without the mentioned changes (455 MB/s) or .24 (475 MB/s).

-- 
	Evgeniy Polyakov


* Re: [tbench regression fixes]: digging out smelly deadmen.
  2008-10-10  8:09 ` Evgeniy Polyakov
@ 2008-10-10  9:15   ` Ingo Molnar
  2008-10-10 11:31     ` Evgeniy Polyakov
  0 siblings, 1 reply; 94+ messages in thread
From: Ingo Molnar @ 2008-10-10  9:15 UTC (permalink / raw)
  To: Evgeniy Polyakov
  Cc: Peter Zijlstra, linux-kernel, netdev, David Miller, Mike Galbraith


hi Evgeniy,

* Evgeniy Polyakov <s0mbre@tservice.net.ru> wrote:

> Hi Peter.
> 
> I've enabled the kernel hacking option and scheduler debugging, turned
> off hrticks, and performance jumped to 382 MB/s:
> 
> vanilla 27: 347.222
> no TSO/GSO: 357.331
> no hrticks: 382.983
> 
> I use the tsc clocksource; acpi_pm and jiffies are also available.
> With acpi_pm performance is even lower (I stopped the test after it dropped
> below the 340 MB/s mark), and jiffies do not work at all: it looks like sockets
> get stuck in the time_wait state when this clocksource is used, although that
> may be a different issue.
> 
> So I think hrticks are guilty, but this is still not as good as the .25 tree
> without the mentioned changes (455 MB/s) or .24 (475 MB/s).

i'm glad that you are looking into this! That is an SMP box, right? If 
yes then could you try this sched-domains tuning utility i have written 
yesterday (incidentally):

  http://redhat.com/~mingo/cfs-scheduler/tune-sched-domains

just run it without options to see the current sched-domains options. On 
a test system i have, it displays this:

# tune-sched-domains
usage: tune-sched-domains <val>
current val on cpu0/domain0:
SD flag: 47
+   1: SD_LOAD_BALANCE:          Do load balancing on this domain
+   2: SD_BALANCE_NEWIDLE:       Balance when about to become idle
+   4: SD_BALANCE_EXEC:          Balance on exec
+   8: SD_BALANCE_FORK:          Balance on fork, clone
-  16: SD_WAKE_IDLE:             Wake to idle CPU on task wakeup
+  32: SD_WAKE_AFFINE:           Wake task to waking CPU
-  64: SD_WAKE_BALANCE:          Perform balancing at task wakeup

then could you check what effects it has if you turn off 
SD_BALANCE_NEWIDLE? On my box i did it via:

# tune-sched-domains $[47-2]
changed /proc/sys/kernel/sched_domain/cpu0/domain0/flags: 47 => 45
SD flag: 45
+   1: SD_LOAD_BALANCE:          Do load balancing on this domain
-   2: SD_BALANCE_NEWIDLE:       Balance when about to become idle
+   4: SD_BALANCE_EXEC:          Balance on exec
+   8: SD_BALANCE_FORK:          Balance on fork, clone
-  16: SD_WAKE_IDLE:             Wake to idle CPU on task wakeup
+  32: SD_WAKE_AFFINE:           Wake task to waking CPU
-  64: SD_WAKE_BALANCE:          Perform balancing at task wakeup
changed /proc/sys/kernel/sched_domain/cpu0/domain1/flags: 1101 => 45
SD flag: 45
+   1: SD_LOAD_BALANCE:          Do load balancing on this domain
-   2: SD_BALANCE_NEWIDLE:       Balance when about to become idle
+   4: SD_BALANCE_EXEC:          Balance on exec
+   8: SD_BALANCE_FORK:          Balance on fork, clone
-  16: SD_WAKE_IDLE:             Wake to idle CPU on task wakeup
+  32: SD_WAKE_AFFINE:           Wake task to waking CPU
-  64: SD_WAKE_BALANCE:          Perform balancing at task wakeup

and please, when tuning such scheduler bits, could you run latest 
tip/master:

   http://people.redhat.com/mingo/tip.git/README

and you need to have CONFIG_SCHED_DEBUG=y enabled for the tuning knobs.

so that it's all in sync with upcoming scheduler changes/tunings/fixes.

It will also make it much easier for us to apply any fix patches you 
might send :-)

For advanced tuners: you can specify two or more domain flags options as 
well on the command line - that will be put into domain1/domain2/etc. I 
usually tune these flags via something like:

  tune-sched-domains $[1*1+1*2+1*4+1*8+0*16+1*32+1*64]

that makes it easy to set/clear each of the flags.
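
For example, the flags value 45 used above decomposes as SD_LOAD_BALANCE +
SD_BALANCE_EXEC + SD_BALANCE_FORK + SD_WAKE_AFFINE, i.e.:

  tune-sched-domains $[1*1+0*2+1*4+1*8+0*16+1*32+0*64]   # == 45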

	Ingo


* Re: [tbench regression fixes]: digging out smelly deadmen.
  2008-10-09 23:17 [tbench regression fixes]: digging out smelly deadmen Evgeniy Polyakov
  2008-10-10  5:40 ` Peter Zijlstra
  2008-10-10  8:09 ` Evgeniy Polyakov
@ 2008-10-10 10:13 ` Mike Galbraith
  2008-10-11 13:13   ` Evgeniy Polyakov
  2 siblings, 1 reply; 94+ messages in thread
From: Mike Galbraith @ 2008-10-10 10:13 UTC (permalink / raw)
  To: Evgeniy Polyakov
  Cc: netdev, linux-kernel, Peter Zijlstra, Ingo Molnar, David Miller

On Fri, 2008-10-10 at 03:17 +0400, Evgeniy Polyakov wrote: 
> Hi.

Greetings.  Glad to see someone pursuing this.

> It was reported recently that tbench has a long history of regressions,
> starting at least from the 2.6.23 kernel. I verified that in my test
> environment tbench 'lost' more than 100 MB/s, from 470 down to 355,
> between at least 2.6.24 and 2.6.27. The 2.6.26-2.6.27 performance regression
> on my machines roughly corresponds to 375 down to 355 MB/s.
> 
> I spent several days on various tests and bisections (unfortunately
> bisect cannot always point to the 'right' commit), and found the following
> problems.
> 
> First, related to the network, as lots of people expected: TSO/GSO over
> loopback with the tbench workload eats about 5-10 MB/s, since the TSO/GSO frame
> creation overhead is not repaid by the optimized super-frame processing
> gains. Since it brings a really impressive improvement in big-packet
> workloads, it was (likely) decided not to add a patch for this; instead
> one can disable TSO/GSO via ethtool. This change was added in the
> 2.6.27 window, so it has its part in that regression.

Partly; disabling TSO/GSO doesn't do enough here.  See the test log below.

> The second part of the 2.6.26-2.6.27 window regression (about 20
> MB/s, as a reminder) is related to the scheduler changes, which another
> group of people expected. I tracked it down to the
> a7be37ac8e1565e00880531f4e2aff421a21c803 commit, which, if
> reverted, returns 2.6.27 tbench performance to the highest (for
> 2.6.26-2.6.27) 365 MB/s mark. I also tested a tree stopped at the above
> commit itself, i.e. not 2.6.27, and got 373 MB/s, so other changes in that
> merge likely ate a couple of megabytes. A patch against 2.6.27 is attached.

a7be37a adds some math overhead, calls to calc_delta_mine() per
wakeup/context switch for all weight tasks, whereas previously these
calls were only made for tasks which were not nice 0.  It also shifts
performance a bit in favor of loads which dislike wakeup preemption,
this effect lessens as task count increases.  Per testing, overhead is
not the primary factor in throughput loss.  I believe clock accuracy to
be a more important factor than overhead by a very large margin.

Reverting a7be37a (and the two asym fixes) didn't do a whole lot for me
either.  I'm still ~8% down from 2.6.26 for netperf, and ~3% for tbench,
and the 2.6.26 numbers are gcc-4.1, which are a little lower than
gcc-4.3.  Along the way, I've reverted 100% of scheduler and ilk 26->27
and been unable to recover throughput.  (Too bad I didn't know about
that TSO/GSO thingy, would have been nice.)

I can achieve nearly the same improvement for tbench with a little
tinker, and _more_ for netperf than reverting these changes delivers,
see last log entry, experiment cut math overhead by less than 1/3.

For the full cfs history, even with those three reverts, I'm ~6% down on
tbench, and ~14% for netperf, and haven't found out where it went.

> A curious reader may ask where we lost the other 100 MB/s. This small
> issue was not detected (or at least not reported in netdev@ with a provocative
> enough subject), and it happens to live somewhere in the 2.6.24-2.6.25 changes.
> I was lucky enough to 'guess' (after just a couple of hundred compilations)
> that it corresponds to the 8f4d37ec073c17e2d4aa8851df5837d798606d6f commit about
> high-resolution timers; the attached patch against 2.6.25 brings tbench
> performance for the 2.6.25 kernel tree to 455 MB/s.

I have highres timers disabled in my kernels because per testing it does
cost a lot at high frequency, but primarily because it's not available
throughout the test group, same for nohz.  A patchlet went into 2.6.27 to
neutralize the cost of hrtick when it's not active.  Per re-test,
2.6.27 should be zero impact with hrtick disabled.

> There are still about 20 MB/s missing, but 2.6.24 has 475 MB/s, so the
> bug likely lives between 2.6.24 and the above 8f4d37ec073 commit.

I lost some at 24, got it back at 25 etc.  Some of it is fairness /
preemption differences, but there's a bunch I can't find, and massive
amounts of time spent bisecting were a waste of time.

My annotated test log.  File under fwiw.

Note:  2.6.23 cfs was apparently a bad-hair day for high frequency
switchers.  Anyone entering the way-back-machine to test 2.6.23 should
probably use cfs-24.1, which is the 2.6.24 scheduler minus one line that is
zero impact for nice 0 loads.

-------------------------------------------------------------------------
UP config, no nohz or highres timers except as noted.

60 sec localhost network tests, tbench 1 and 1 netperf TCP_RR pair.
use ring-test -t 2 -w 0 -s 0 to see roughly how heavy the full ~0 work
fast path is, vmstat 10 ctx/s fed to bc (close enough for gvt work). 
ring-test args: -t NR tasks -w work_ms -s sleep_ms

sched_wakeup_granularity_ns always set to 0 for all tests to maximize
context switches.
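
That is the usual CONFIG_SCHED_DEBUG=y knob, set with something like:

  echo 0 > /proc/sys/kernel/sched_wakeup_granularity_ns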

Why?  O(1) preempts very aggressively with dissimilar task loads, as
both tbench and netperf are.  With O(1), sleepier component preempts
less sleepy component on each and every wakeup.  CFS preempts based on
lag (sleepiness) as well, but it's short vs long term.  Granularity of
zero was as close to apple/apple as I could get.. apple/pineapple.

2.6.22.19-up
ring-test   - 1.204 us/cycle  = 830 KHz  (gcc-4.1)
ring-test   - doorstop                   (gcc-4.3)
netperf     - 147798.56 rr/s  = 295 KHz  (hmm, a bit unstable, 140K..147K rr/s)
tbench      - 374.573 MB/sec

2.6.22.19-cfs-v24.1-up
ring-test   - 1.098 us/cycle  = 910 KHz  (gcc-4.1)
ring-test   - doorstop                   (gcc-4.3)
netperf     - 140039.03 rr/s  = 280 KHz = 3.57us - 1.10us sched = 2.47us/packet network
tbench      - 364.191 MB/sec

2.6.23.17-up
ring-test   - 1.252 us/cycle  = 798 KHz  (gcc-4.1)
ring-test   - 1.235 us/cycle  = 809 KHz  (gcc-4.3)
netperf     - 123736.40 rr/s  = 247 KHz  sb 268 KHZ / 134336.37 rr/s
tbench      - 355.906 MB/sec

2.6.23.17-cfs-v24.1-up
ring-test   - 1.100 us/cycle  = 909 KHz  (gcc-4.1)
ring-test   - 1.074 us/cycle  = 931 KHz  (gcc-4.3)
netperf     - 135847.14 rr/s  = 271 KHz  sb 280 KHz / 140039.03 rr/s
tbench      - 364.511 MB/sec

2.6.24.7-up
ring-test   - 1.100 us/cycle  = 909 KHz  (gcc-4.1)
ring-test   - 1.068 us/cycle  = 936 KHz  (gcc-4.3)
netperf     - 122300.66 rr/s  = 244 KHz  sb 280 KHz / 140039.03 rr/s
tbench      - 341.523 MB/sec

2.6.25.17-up
ring-test   - 1.163 us/cycle  = 859 KHz  (gcc-4.1)
ring-test   - 1.129 us/cycle  = 885 KHz  (gcc-4.3)
netperf     - 132102.70 rr/s  = 264 KHz  sb 275 KHz / 137627.30 rr/s
tbench      - 361.71 MB/sec

retest 2.6.25.18-up, gcc = 4.3

2.6.25.18-up
push patches/revert_hrtick.diff
ring-test   - 1.127 us/cycle  = 887 KHz
netperf     - 132123.42 rr/s
tbench      - 358.964 361.538 361.164 MB/sec
(all is well, zero impact as expected, enable highres timers)

2.6.25.18-up
pop patches/revert_hrtick.diff
push patches/hrtick.diff (cut overhead when hrtick disabled patchlet in .27)

echo 7 > sched_features = nohrtick
ring-test   - 1.183 us/cycle  = 845 KHz
netperf     - 131976.23 rr/s
tbench      - 361.17 360.468 361.721 MB/sec

echo 15 > sched_features = default = hrtick
ring-test   - 1.333 us/cycle  = 750 KHz        - .887
netperf     - 120520.67 rr/s                   - .913
tbench      - 344.092 344.569 344.839 MB/sec   - .953

(yeah, why i turned highres timers off while testing high frequency throughput)

2.6.26.5-up
ring-test   - 1.195 us/cycle  = 836 KHz  (gcc-4.1)
ring-test   - 1.179 us/cycle  = 847 KHz  (gcc-4.3)
netperf     - 131289.73 rr/s  = 262 KHZ  sb 272 KHz / 136425.64 rr/s
tbench      - 354.07 MB/sec

2.6.27-rc8-up
ring-test   - 1.225 us/cycle  = 816 KHz  (gcc-4.1)
ring-test   - 1.196 us/cycle  = 836 KHz  (gcc-4.3)
netperf     - 118090.27 rr/s  = 236 KHz  sb 270 KHz / 135317.99 rr/s
tbench      - 329.856 MB/sec

retest of 2.6.27-final-up, gcc = 4.3.  tbench/netperf numbers above here
are all gcc-4.1 except for 2.6.25 retest.

2.6.27-final-up
ring-test   - 1.193 us/cycle  = 838 KHz  (gcc-4.3)
tbench      - 337.377 MB/sec           tso/gso on
tbench      - 340.362 MB/sec           tso/gso off
netperf     - TCP_RR 120751.30 rr/s    tso/gso on
netperf     - TCP_RR 121293.48 rr/s    tso/gso off

2.6.27-final-up
push revert_weight_and_asym_stuff.diff
ring-test   - 1.133 us/cycle  = 882 KHz  (gcc-4.3)
tbench      - 340.481 MB/sec           tso/gso on
tbench      - 343.472 MB/sec           tso/gso off
netperf     - 119486.14 rr/s           tso/gso on
netperf     - 121035.56 rr/s           tso/gso off

2.6.27-final-up-tinker
ring-test   - 1.141 us/cycle  = 876 KHz  (gcc-4.3)
tbench      - 339.095 MB/sec           tso/gso on
tbench      - 340.507 MB/sec           tso/gso off
netperf     - 122371.59 rr/s           tso/gso on
netperf     - 124650.09 rr/s           tso/gso off




* Re: [tbench regression fixes]: digging out smelly deadmen.
  2008-10-10  9:15   ` Ingo Molnar
@ 2008-10-10 11:31     ` Evgeniy Polyakov
  2008-10-10 11:40       ` Ingo Molnar
  2008-10-10 11:42       ` Ingo Molnar
  0 siblings, 2 replies; 94+ messages in thread
From: Evgeniy Polyakov @ 2008-10-10 11:31 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Peter Zijlstra, linux-kernel, netdev, David Miller, Mike Galbraith

Hi Ingo.

On Fri, Oct 10, 2008 at 11:15:11AM +0200, Ingo Molnar (mingo@elte.hu) wrote:

> > 
> > I use the tsc clocksource; acpi_pm and jiffies are also available.
> > With acpi_pm performance is even lower (I stopped the test after it dropped
> > below the 340 MB/s mark), and jiffies do not work at all: it looks like sockets
> > get stuck in the time_wait state when this clocksource is used, although that
> > may be a different issue.
> > 
> > So I think hrticks are guilty, but this is still not as good as the .25 tree
> > without the mentioned changes (455 MB/s) or .24 (475 MB/s).
> 
> i'm glad that you are looking into this! That is an SMP box, right? If 
> yes then could you try this sched-domains tuning utility i have written 
> yesterday (incidentally):
> 
>   http://redhat.com/~mingo/cfs-scheduler/tune-sched-domains

I've removed SD_BALANCE_NEWIDLE:
# ./tune-sched-domains $[191-2]
changed /proc/sys/kernel/sched_domain/cpu0/domain0/flags: 191 => 189
SD flag: 189
+   1: SD_LOAD_BALANCE:          Do load balancing on this domain
-   2: SD_BALANCE_NEWIDLE:       Balance when about to become idle
+   4: SD_BALANCE_EXEC:          Balance on exec
+   8: SD_BALANCE_FORK:          Balance on fork, clone
+  16: SD_WAKE_IDLE:             Wake to idle CPU on task wakeup
+  32: SD_WAKE_AFFINE:           Wake task to waking CPU
-  64: SD_WAKE_BALANCE:          Perform balancing at task wakeup
+ 128: SD_SHARE_CPUPOWER:        Domain members share cpu power
changed /proc/sys/kernel/sched_domain/cpu0/domain1/flags: 47 => 189
SD flag: 189
+   1: SD_LOAD_BALANCE:          Do load balancing on this domain
-   2: SD_BALANCE_NEWIDLE:       Balance when about to become idle
+   4: SD_BALANCE_EXEC:          Balance on exec
+   8: SD_BALANCE_FORK:          Balance on fork, clone
+  16: SD_WAKE_IDLE:             Wake to idle CPU on task wakeup
+  32: SD_WAKE_AFFINE:           Wake task to waking CPU
-  64: SD_WAKE_BALANCE:          Perform balancing at task wakeup
+ 128: SD_SHARE_CPUPOWER:        Domain members share cpu power

And got a noticeable improvement (each new line has the fixes from the previous ones):

vanilla 27: 347.222
no TSO/GSO: 357.331
no hrticks: 382.983
no balance: 389.802

> and please, when tuning such scheduler bits, could you run latest 
> tip/master:
> 
>    http://people.redhat.com/mingo/tip.git/README
> 
> and you need to have CONFIG_SCHED_DEBUG=y enabled for the tuning knobs.
> 
> so that it's all in sync with upcoming scheduler changes/tunings/fixes.

Ok, I've started to pull it down, I will reply back when things are
ready.

-- 
	Evgeniy Polyakov


* Re: [tbench regression fixes]: digging out smelly deadmen.
  2008-10-10 11:31     ` Evgeniy Polyakov
@ 2008-10-10 11:40       ` Ingo Molnar
  2008-10-10 13:25         ` Evgeniy Polyakov
  2008-10-10 11:42       ` Ingo Molnar
  1 sibling, 1 reply; 94+ messages in thread
From: Ingo Molnar @ 2008-10-10 11:40 UTC (permalink / raw)
  To: Evgeniy Polyakov
  Cc: Peter Zijlstra, linux-kernel, netdev, David Miller,
	Mike Galbraith, Nick Piggin


* Evgeniy Polyakov <s0mbre@tservice.net.ru> wrote:

> Hi Ingo.
> 
> On Fri, Oct 10, 2008 at 11:15:11AM +0200, Ingo Molnar (mingo@elte.hu) wrote:
> 
> > > 
> > > I use the tsc clocksource; acpi_pm and jiffies are also available.
> > > With acpi_pm performance is even lower (I stopped the test after it dropped
> > > below the 340 MB/s mark), and jiffies do not work at all: it looks like sockets
> > > get stuck in the time_wait state when this clocksource is used, although that
> > > may be a different issue.
> > > 
> > > So I think hrticks are guilty, but this is still not as good as the .25 tree
> > > without the mentioned changes (455 MB/s) or .24 (475 MB/s).
> > 
> > i'm glad that you are looking into this! That is an SMP box, right? If 
> > yes then could you try this sched-domains tuning utility i have written 
> > yesterday (incidentally):
> > 
> >   http://redhat.com/~mingo/cfs-scheduler/tune-sched-domains
> 
> I've removed SD_BALANCE_NEWIDLE:
> # ./tune-sched-domains $[191-2]
> changed /proc/sys/kernel/sched_domain/cpu0/domain0/flags: 191 => 189
> SD flag: 189
> +   1: SD_LOAD_BALANCE:          Do load balancing on this domain
> -   2: SD_BALANCE_NEWIDLE:       Balance when about to become idle
> +   4: SD_BALANCE_EXEC:          Balance on exec
> +   8: SD_BALANCE_FORK:          Balance on fork, clone
> +  16: SD_WAKE_IDLE:             Wake to idle CPU on task wakeup
> +  32: SD_WAKE_AFFINE:           Wake task to waking CPU
> -  64: SD_WAKE_BALANCE:          Perform balancing at task wakeup
> + 128: SD_SHARE_CPUPOWER:        Domain members share cpu power
> changed /proc/sys/kernel/sched_domain/cpu0/domain1/flags: 47 => 189
> SD flag: 189
> +   1: SD_LOAD_BALANCE:          Do load balancing on this domain
> -   2: SD_BALANCE_NEWIDLE:       Balance when about to become idle
> +   4: SD_BALANCE_EXEC:          Balance on exec
> +   8: SD_BALANCE_FORK:          Balance on fork, clone
> +  16: SD_WAKE_IDLE:             Wake to idle CPU on task wakeup
> +  32: SD_WAKE_AFFINE:           Wake task to waking CPU
> -  64: SD_WAKE_BALANCE:          Perform balancing at task wakeup
> + 128: SD_SHARE_CPUPOWER:        Domain members share cpu power
> 
> And got a noticeable improvement (each new line has the fixes from the previous ones):
> 
> vanilla 27: 347.222
> no TSO/GSO: 357.331
> no hrticks: 382.983
> no balance: 389.802
> 
> > and please, when tuning such scheduler bits, could you run latest 
> > tip/master:
> > 
> >    http://people.redhat.com/mingo/tip.git/README
> > 
> > and you need to have CONFIG_SCHED_DEBUG=y enabled for the tuning knobs.
> > 
> > so that it's all in sync with upcoming scheduler changes/tunings/fixes.
> 
> Ok, I've started to pull it down, I will reply back when things are
> ready.

make sure you have this fix in tip/master already:

  5b7dba4: sched_clock: prevent scd->clock from moving backwards

Note: Mike is 100% correct in suggesting that a very good cpu_clock() is 
needed for precise scheduling.

i've also Cc:-ed Nick.

	Ingo


* Re: [tbench regression fixes]: digging out smelly deadmen.
  2008-10-10 11:31     ` Evgeniy Polyakov
  2008-10-10 11:40       ` Ingo Molnar
@ 2008-10-10 11:42       ` Ingo Molnar
  2008-10-10 11:55         ` Evgeniy Polyakov
  1 sibling, 1 reply; 94+ messages in thread
From: Ingo Molnar @ 2008-10-10 11:42 UTC (permalink / raw)
  To: Evgeniy Polyakov
  Cc: Peter Zijlstra, linux-kernel, netdev, David Miller, Mike Galbraith


* Evgeniy Polyakov <s0mbre@tservice.net.ru> wrote:

> > i'm glad that you are looking into this! That is an SMP box, right? 
> > If yes then could you try this sched-domains tuning utility i have 
> > written yesterday (incidentally):
> > 
> >   http://redhat.com/~mingo/cfs-scheduler/tune-sched-domains
> 
> I've removed SD_BALANCE_NEWIDLE:
> # ./tune-sched-domains $[191-2]

> And got a noticeable improvement (each new line has the fixes from the previous ones):
> 
> vanilla 27: 347.222
> no TSO/GSO: 357.331
> no hrticks: 382.983
> no balance: 389.802

okay. The target is 470 MB/sec, right? (Assuming the workload is sane 
and 'fixing' it does not mean we have to schedule worse.)

We are still way off from 470 MB/sec.

	Ingo


* Re: [tbench regression fixes]: digging out smelly deadmen.
  2008-10-10 11:42       ` Ingo Molnar
@ 2008-10-10 11:55         ` Evgeniy Polyakov
  2008-10-10 11:57           ` Ingo Molnar
  0 siblings, 1 reply; 94+ messages in thread
From: Evgeniy Polyakov @ 2008-10-10 11:55 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Peter Zijlstra, linux-kernel, netdev, David Miller, Mike Galbraith

On Fri, Oct 10, 2008 at 01:42:45PM +0200, Ingo Molnar (mingo@elte.hu) wrote:
> > vanilla 27: 347.222
> > no TSO/GSO: 357.331
> > no hrticks: 382.983
> > no balance: 389.802
> 
> okay. The target is 470 MB/sec, right? (Assuming the workload is sane 
> and 'fixing' it does not mean we have to schedule worse.)

Well, that's where I started/stopped, so maybe we will even move
further? :)

-- 
	Evgeniy Polyakov


* Re: [tbench regression fixes]: digging out smelly deadmen.
  2008-10-10 11:55         ` Evgeniy Polyakov
@ 2008-10-10 11:57           ` Ingo Molnar
  2008-10-24 22:25             ` Rafael J. Wysocki
  0 siblings, 1 reply; 94+ messages in thread
From: Ingo Molnar @ 2008-10-10 11:57 UTC (permalink / raw)
  To: Evgeniy Polyakov
  Cc: Peter Zijlstra, linux-kernel, netdev, David Miller, Mike Galbraith


* Evgeniy Polyakov <s0mbre@tservice.net.ru> wrote:

> On Fri, Oct 10, 2008 at 01:42:45PM +0200, Ingo Molnar (mingo@elte.hu) wrote:
> > > vanilla 27: 347.222
> > > no TSO/GSO: 357.331
> > > no hrticks: 382.983
> > > no balance: 389.802
> > 
> > okay. The target is 470 MB/sec, right? (Assuming the workload is sane 
> > and 'fixing' it does not mean we have to schedule worse.)
> 
> Well, that's where I started/stopped, so maybe we will even move
> further? :)

that's the right attitude ;)

	Ingo


* Re: [tbench regression fixes]: digging out smelly deadmen.
  2008-10-10 11:40       ` Ingo Molnar
@ 2008-10-10 13:25         ` Evgeniy Polyakov
  0 siblings, 0 replies; 94+ messages in thread
From: Evgeniy Polyakov @ 2008-10-10 13:25 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Peter Zijlstra, linux-kernel, netdev, David Miller,
	Mike Galbraith, Nick Piggin

On Fri, Oct 10, 2008 at 01:40:42PM +0200, Ingo Molnar (mingo@elte.hu) wrote:
> make sure you have this fix in tip/master already:
> 
>   5b7dba4: sched_clock: prevent scd->clock from moving backwards
> 
> Note: Mike is 100% correct in suggesting that a very good cpu_clock() is 
> needed for precise scheduling.

The last commit is 5dc64a3442b98eaa and the aforementioned changeset is included.
The result is quite bad:

vanilla 27: 	347.222
no TSO/GSO:	357.331
no hrticks:	382.983
no balance:	389.802
tip:		365.576

-- 
	Evgeniy Polyakov


* Re: [tbench regression fixes]: digging out smelly deadmen.
  2008-10-10 10:13 ` [tbench regression fixes]: digging out smelly deadmen Mike Galbraith
@ 2008-10-11 13:13   ` Evgeniy Polyakov
  2008-10-11 14:39     ` Peter Zijlstra
  0 siblings, 1 reply; 94+ messages in thread
From: Evgeniy Polyakov @ 2008-10-11 13:13 UTC (permalink / raw)
  To: Mike Galbraith
  Cc: netdev, linux-kernel, Peter Zijlstra, Ingo Molnar, David Miller

Hi Mike.

On Fri, Oct 10, 2008 at 12:13:43PM +0200, Mike Galbraith (efault@gmx.de) wrote:
> a7be37a adds some math overhead, calls to calc_delta_mine() per
> wakeup/context switch for all weight tasks, whereas previously these
> calls were only made for tasks which were not nice 0.  It also shifts
> performance a bit in favor of loads which dislike wakeup preemption,

I believe everyone dislikes this :)

> this effect lessens as task count increases.  Per testing, overhead is
> not the primary factor in throughput loss.  I believe clock accuracy to
> be a more important factor than overhead by a very large margin.

In my tests it was not just overhead, it was a disaster.
Stopping just before this commit gained back 20 MB/s of the 30 MB/s lost
in the 26-27 window. No matter what accuracy it brings, it is just wrong
to assume that such a performance drop in some workloads is justified.
What is this accuracy needed for?

> Reverting a7be37a (and the two asym fixes) didn't do a whole lot for me
> either.  I'm still ~8% down from 2.6.26 for netperf, and ~3% for tbench,
> and the 2.6.26 numbers are gcc-4.1, which are a little lower than
> gcc-4.3.  Along the way, I've reverted 100% of scheduler and ilk 26->27
> and been unable to recover throughput.  (Too bad I didn't know about
> that TSO/GSO thingy, would have been nice.)
> 
> I can achieve nearly the same improvement for tbench with a little
> tinker, and _more_ for netperf than reverting these changes delivers,
> see last log entry, experiment cut math overhead by less than 1/3.

Yeah, that's what I like :)

> For the full cfs history, even with those three reverts, I'm ~6% down on
> tbench, and ~14% for netperf, and haven't found out where it went.
> 
> > A curious reader may ask where we lost the other 100 MB/s. This small
> > issue was not detected (or at least not reported in netdev@ with a provocative
> > enough subject), and it happens to live somewhere in the 2.6.24-2.6.25 changes.
> > I was lucky enough to 'guess' (after just a couple of hundred compilations)
> > that it corresponds to the 8f4d37ec073c17e2d4aa8851df5837d798606d6f commit about
> > high-resolution timers; the attached patch against 2.6.25 brings tbench
> > performance for the 2.6.25 kernel tree to 455 MB/s.
> 
> I have highres timers disabled in my kernels because per testing it does
> cost a lot at high frequency, but primarily because it's not available
> throughout the test group, same for nohz.  A patchlet went into 2.6.27 to
> neutralize the cost of hrtick when it's not active.  Per re-test,
> 2.6.27 should be zero impact with hrtick disabled.

Well, yes, disabling it should bring performance back, but since they
are actually enabled everywhere and the trick with debugfs is not widely
known, this is actually a red flag.

> > There are still about 20 MB/s missing, but 2.6.24 has 475 MB/s, so the
> > bug likely lives between 2.6.24 and the above 8f4d37ec073 commit.
> 
> I lost some at 24, got it back at 25 etc.  Some of it is fairness /
> preemption differences, but there's a bunch I can't find, and massive
> amounts of time spent bisecting were a waste of time.

Yup, but since I slacked off with a bit of beer after the POHMELFS release I did
not regret it too much :)

> My annotated test log.  File under fwiw.
> 
> Note:  2.6.23 cfs was apparently a bad-hair day for high frequency
> switchers.  Anyone entering the way-back-machine to test 2.6.23 should
> probably use cfs-24.1, which is the 2.6.24 scheduler minus one line that
> has zero impact for nice-0 loads.
> 
> -------------------------------------------------------------------------
> UP config, no nohz or highres timers except as noted.

UP may actually explain the difference in our results: I have a 4-way box
(2 physical and 2 logical (HT enabled) CPUs) with old 32-bit Xeons and
highmem enabled. I also tried low-latency preemption and no preemption
(server) without much difference.

-- 
	Evgeniy Polyakov

^ permalink raw reply	[flat|nested] 94+ messages in thread

* Re: [tbench regression fixes]: digging out smelly deadmen.
  2008-10-11 13:13   ` Evgeniy Polyakov
@ 2008-10-11 14:39     ` Peter Zijlstra
  2008-10-11 18:13       ` Mike Galbraith
  0 siblings, 1 reply; 94+ messages in thread
From: Peter Zijlstra @ 2008-10-11 14:39 UTC (permalink / raw)
  To: Evgeniy Polyakov
  Cc: Mike Galbraith, netdev, linux-kernel, Ingo Molnar, David Miller

On Sat, 2008-10-11 at 17:13 +0400, Evgeniy Polyakov wrote:
> Hi Mike.
> 
> On Fri, Oct 10, 2008 at 12:13:43PM +0200, Mike Galbraith (efault@gmx.de) wrote:
> > a7be37a adds some math overhead, calls to calc_delta_mine() per
> > wakeup/context switch for all weight tasks, whereas previously these
> > calls were only made for tasks which were not nice 0.  It also shifts
> > performance a bit in favor of loads which dislike wakeup preemption,
> 
> I believe everyone dislikes this :)
> 
> > this effect lessens as task count increases.  Per testing, overhead is
> > not the primary factor in throughput loss.  I believe clock accuracy to
> > be a more important factor than overhead by a very large margin.
> 
> In my tests it was not just overhead, it was a disaster.
> Stopping just before this commit recovered 20 MB/s of the 30 MB/s lost
> in the 26-27 window. No matter what accuracy it brings, it is just wrong
> to assume that such a performance drop in some workloads is justified.
> What is this accuracy needed for?

a7be37a's purpose is group scheduling, where it provides the means to
calculate things in a uniform metric.

If you take the following scenario:

    R
   /|\
  A 1 B
 /|\  |
2 3 4 5

Where letters denote supertasks/groups and digits are tasks.

We used to look at a single level only, so if you want to compute a
task's ideal runtime, you'd take:

  runtime_i = period * w_i / \Sum_j w_j

So, in the above example, assuming all entries have an equal weight,
we'd want to run A for p/3. But then we'd also want to run 2 for p/3.
IOW, A's tasks alone would consume the whole of p.

Which is contrary to the expectation that all tasks in the scenario
together would run in p.

So what the patch does is change the calculation to:

  runtime_i = period * \Prod_l ( w_{l,i} / \Sum_j w_{l,j} )

Which would, for task 2, end up being: p * 1/3 * 1/3 = p/9.

Now, the reason for the extra math in the !group case is that, for the
single-level case, we can avoid doing that division by the sum, because
the sum is equal for all tasks (we then compensate for it at some other
place).

However, for the nested case, we cannot do that.

That said, we can probably still avoid the division for the top level
stuff, because the sum of the top level weights is still invariant
between all tasks.

I'll have a stab at doing so... I initially didn't do this because my
first try gave some real ugly code, but we'll see - these numbers are a
very convincing reason to try again.
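
(To make the arithmetic concrete: a toy userspace sketch of that nested
slice calculation, for the tree above with all weights equal to 1.  The
struct, period value and function names are purely illustrative and are
not the kernel implementation.)

#include <stdio.h>

/* one entry per level on the path from the task up to the root:
 * the entity's weight at that level and the total weight of the level */
struct level { double weight, level_sum; };

static double slice(double period, const struct level *path, int depth)
{
        int l;

        /* runtime_i = period * \Prod_l ( w_{l,i} / \Sum_j w_{l,j} ) */
        for (l = 0; l < depth; l++)
                period *= path[l].weight / path[l].level_sum;
        return period;
}

int main(void)
{
        double p = 9.0;                         /* arbitrary period */
        struct level task2[] = {                /* task 2, inside group A */
                { 1.0, 3.0 },                   /* 2 among {2,3,4} */
                { 1.0, 3.0 },                   /* A among {A,1,B} */
        };
        struct level task1[] = {                /* task 1, directly below R */
                { 1.0, 3.0 },                   /* 1 among {A,1,B} */
        };

        printf("task 2: %.2f (p/9)\n", slice(p, task2, 2));     /* 1.00 */
        printf("task 1: %.2f (p/3)\n", slice(p, task1, 1));     /* 3.00 */
        return 0;
}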


^ permalink raw reply	[flat|nested] 94+ messages in thread

* Re: [tbench regression fixes]: digging out smelly deadmen.
  2008-10-11 14:39     ` Peter Zijlstra
@ 2008-10-11 18:13       ` Mike Galbraith
  2008-10-12  6:02         ` Mike Galbraith
  0 siblings, 1 reply; 94+ messages in thread
From: Mike Galbraith @ 2008-10-11 18:13 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Evgeniy Polyakov, netdev, linux-kernel, Ingo Molnar, David Miller

On Sat, 2008-10-11 at 16:39 +0200, Peter Zijlstra wrote:

> That said, we can probably still avoid the division for the top level
> stuff, because the sum of the top level weights is still invariant
> between all tasks.

Less math would be nice of course...

> I'll have a stab at doing so... I initially didn't do this because my
> first try gave some real ugly code, but we'll see - these numbers are a
> very convincing reason to try again.

...but the numbers I get on Q6600 don't pin the tail on the math donkey.

Update to UP test log.

2.6.27-final-up
ring-test   - 1.193 us/cycle  = 838 KHz  (gcc-4.3)
tbench      - 337.377 MB/sec           tso/gso on
tbench      - 340.362 MB/sec           tso/gso off
netperf     - 120751.30 rr/s           tso/gso on
netperf     - 121293.48 rr/s           tso/gso off

2.6.27-final-up
patches/revert_weight_and_asym_stuff.diff
ring-test   - 1.133 us/cycle  = 882 KHz  (gcc-4.3)
tbench      - 340.481 MB/sec           tso/gso on
tbench      - 343.472 MB/sec           tso/gso off
netperf     - 119486.14 rr/s           tso/gso on
netperf     - 121035.56 rr/s           tso/gso off

2.6.28-up
ring-test   - 1.149 us/cycle  = 870 KHz  (gcc-4.3)
tbench      - 343.681 MB/sec           tso/gso off
netperf     - 122812.54 rr/s           tso/gso off

My SMP log, updated to account for TSO/GSO monkey-wrench.

(<bleep> truckload of time <bleep> wasted chasing unbisectable
<bleepity-bleep> tso gizmo. <bleep!>)

SMP config, same as UP kernels tested, except SMP.

tbench -t 60 4 localhost followed by four 60 sec netperf
TCP_RR pairs, each pair on its own core of my Q6600.

2.6.22.19

Throughput 1250.73 MB/sec 4 procs                  1.00

16384  87380  1        1       60.01    111272.55  1.00
16384  87380  1        1       60.00    104689.58
16384  87380  1        1       60.00    110733.05
16384  87380  1        1       60.00    110748.88

2.6.22.19-cfs-v24.1

Throughput 1213.21 MB/sec 4 procs                  .970

16384  87380  1        1       60.01    108569.27  .992
16384  87380  1        1       60.01    108541.04
16384  87380  1        1       60.00    108579.63
16384  87380  1        1       60.01    108519.09

2.6.23.17

Throughput 1200.46 MB/sec 4 procs                  .959

16384  87380  1        1       60.01    95987.66   .866
16384  87380  1        1       60.01    92819.98
16384  87380  1        1       60.01    95454.00
16384  87380  1        1       60.01    94834.84

2.6.23.17-cfs-v24.1

Throughput 1238.68 MB/sec 4 procs                  .990

16384  87380  1        1       60.01    105871.52  .969
16384  87380  1        1       60.01    105813.11
16384  87380  1        1       60.01    106106.31
16384  87380  1        1       60.01    106310.20

2.6.24.7

Throughput 1204 MB/sec 4 procs                     .962

16384  87380  1        1       60.00    99599.27   .910
16384  87380  1        1       60.00    99439.95
16384  87380  1        1       60.00    99556.38
16384  87380  1        1       60.00    99500.45

2.6.25.17

Throughput 1223.16 MB/sec 4 procs                  .977
16384  87380  1        1       60.00    101768.95  .930
16384  87380  1        1       60.00    101888.46
16384  87380  1        1       60.01    101608.21
16384  87380  1        1       60.01    101833.05

2.6.26.5

Throughput 1183.47 MB/sec 4 procs                  .945

16384  87380  1        1       60.00    100837.12  .922
16384  87380  1        1       60.00    101230.12
16384  87380  1        1       60.00    100868.45
16384  87380  1        1       60.00    100491.41

numbers above here are gcc-4.1, below gcc-4.3

2.6.26.6

Throughput 1177.18 MB/sec 4 procs

16384  87380  1        1       60.00    100896.10
16384  87380  1        1       60.00    100028.16
16384  87380  1        1       60.00    101729.44
16384  87380  1        1       60.01    100341.26

TSO/GSO off

2.6.27-final

Throughput 1177.39 MB/sec 4 procs

16384  87380  1        1       60.00    98830.65
16384  87380  1        1       60.00    98722.47
16384  87380  1        1       60.00    98565.17
16384  87380  1        1       60.00    98633.03

2.6.27-final
patches/revert_weight_and_asym_stuff.diff

Throughput 1167.67 MB/sec 4 procs

16384  87380  1        1       60.00    97003.05
16384  87380  1        1       60.00    96758.42
16384  87380  1        1       60.00    96432.01
16384  87380  1        1       60.00    97060.98

2.6.28.git

Throughput 1173.14 MB/sec 4 procs

16384  87380  1        1       60.00    98449.33
16384  87380  1        1       60.00    98484.92
16384  87380  1        1       60.00    98657.98
16384  87380  1        1       60.00    98467.39
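
(For the curious: the actual ring-test source isn't posted in this thread,
but the "us/cycle" numbers above are the kind of thing a minimal pipe
ping-pong produces.  The sketch below is an assumption about the shape of
such a test, not the real benchmark; iteration count and output format are
made up.)

#include <stdio.h>
#include <unistd.h>
#include <sys/time.h>
#include <sys/wait.h>

#define ITERS 1000000L

int main(void)
{
        int ab[2], ba[2];
        struct timeval t0, t1;
        char c = 0;
        double us;
        long i;

        if (pipe(ab) || pipe(ba))
                return 1;

        if (fork() == 0) {                      /* child: echo the token back */
                for (i = 0; i < ITERS; i++) {
                        if (read(ab[0], &c, 1) != 1)
                                break;
                        if (write(ba[1], &c, 1) != 1)
                                break;
                }
                _exit(0);
        }

        gettimeofday(&t0, NULL);
        for (i = 0; i < ITERS; i++) {           /* parent: send, wait for echo */
                if (write(ab[1], &c, 1) != 1 || read(ba[0], &c, 1) != 1)
                        break;
        }
        gettimeofday(&t1, NULL);
        wait(NULL);

        us = (t1.tv_sec - t0.tv_sec) * 1e6 + (t1.tv_usec - t0.tv_usec);
        printf("%.3f us/cycle = %.0f KHz\n", us / ITERS, 1e3 * ITERS / us);
        return 0;
}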




^ permalink raw reply	[flat|nested] 94+ messages in thread

* Re: [tbench regression fixes]: digging out smelly deadmen.
  2008-10-11 18:13       ` Mike Galbraith
@ 2008-10-12  6:02         ` Mike Galbraith
  2008-10-12  6:33           ` Mike Galbraith
  0 siblings, 1 reply; 94+ messages in thread
From: Mike Galbraith @ 2008-10-12  6:02 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Evgeniy Polyakov, netdev, linux-kernel, Ingo Molnar, David Miller

[-- Attachment #1: Type: text/plain, Size: 2093 bytes --]

On Sat, 2008-10-11 at 20:13 +0200, Mike Galbraith wrote:
> On Sat, 2008-10-11 at 16:39 +0200, Peter Zijlstra wrote:
> 
> > That said, we can probably still avoid the division for the top level
> > stuff, because the sum of the top level weights is still invariant
> > between all tasks.
> 
> Less math would be nice of course...
> 
> > I'll have a stab at doing so... I initially didn't do this because my
> > first try gave some real ugly code, but we'll see - these numbers are a
> > very convincing reason to try again.
> 
> ...but the numbers I get on Q6600 don't pin the tail on the math donkey.

Since I showed the rest of my numbers, I may as well show freshly
generated oltp numbers too.  Chart attached.  2.6.27.rev is 2.6.27 with
weight/asym changes reverted.

Data:

read/write requests/sec per client count
                            1       2       4       8      16      32      64     128     256  
2.6.26.6.mysql		 7978	19856	37238	36652	34399	33054	31608	27983	23411
2.6.27.mysql		 9618	18329	37128	36504	33590	31846	30719	27685	21299
2.6.27.rev.mysql	10944	19544	37349	36582	33793	31744	29161	25719	21026
2.6.28.git.mysql	 9518	18031	30418	33571	33330	32797	31353	29139	25793
									
2.6.26.6.pgsql		14165	27516	53883	53679	51960	49694	44377	35361	32879
2.6.27.pgsql		14146	27519	53797	53739	52850	47633	39976	30552	28741
2.6.27.rev.pgsql	14168	27561	53973	54043	53150	47900	39906	31987	28034
2.6.28.git.pgsql	14404	28318	55124	55010	55002	54890	53745	53519	52215

Fewer cycles spent on math is still better of course, but my box seems to
care a lot more about preemption timing than math, regardless of load.

	-Mike

Aside:  Don't pay too much attention to mysql client < cores (4)
numbers, these jitter considerably.

Aside2:  Inquiring minds may wonder about the pgsql numbers - preempting
the holder of a nasty userland spinlock hurts like heck.  It's triggered by
short-term fairness; too much of it, and you pay.  Easy to cure, but you pay
for the cure too.  Moral of the story: if you're running a heavily loaded
server, turn the preemption knobs; you'll lose peak, but scale better.

[-- Attachment #2: zzz.pdf --]
[-- Type: application/pdf, Size: 29786 bytes --]

^ permalink raw reply	[flat|nested] 94+ messages in thread

* Re: [tbench regression fixes]: digging out smelly deadmen.
  2008-10-12  6:02         ` Mike Galbraith
@ 2008-10-12  6:33           ` Mike Galbraith
  0 siblings, 0 replies; 94+ messages in thread
From: Mike Galbraith @ 2008-10-12  6:33 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Evgeniy Polyakov, netdev, linux-kernel, Ingo Molnar, David Miller

On Sun, 2008-10-12 at 08:02 +0200, Mike Galbraith wrote:

> Data:
> 
> read/write requests/sec per client count
>                             1       2       4       8      16      32      64     128     256  
> 2.6.26.6.mysql		 7978	19856	37238	36652	34399	33054	31608	27983	23411
> 2.6.27.mysql		 9618	18329	37128	36504	33590	31846	30719	27685	21299
> 2.6.27.rev.mysql	10944	19544	37349	36582	33793	31744	29161	25719	21026
> 2.6.28.git.mysql	 9518	18031	30418	33571	33330	32797	31353	29139	25793
> 									
> 2.6.26.6.pgsql		14165	27516	53883	53679	51960	49694	44377	35361	32879
> 2.6.27.pgsql		14146	27519	53797	53739	52850	47633	39976	30552	28741
> 2.6.27.rev.pgsql	14168	27561	53973	54043	53150	47900	39906	31987	28034
> 2.6.28.git.pgsql	14404	28318	55124	55010	55002	54890	53745	53519	52215

P.S.  all knobs stock, TSO/GSO off.


^ permalink raw reply	[flat|nested] 94+ messages in thread

* Re: [tbench regression fixes]: digging out smelly deadmen.
  2008-10-10 11:57           ` Ingo Molnar
@ 2008-10-24 22:25             ` Rafael J. Wysocki
  2008-10-24 23:31               ` David Miller
                                 ` (2 more replies)
  0 siblings, 3 replies; 94+ messages in thread
From: Rafael J. Wysocki @ 2008-10-24 22:25 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Evgeniy Polyakov, Peter Zijlstra, linux-kernel, netdev,
	David Miller, Mike Galbraith

On Friday, 10 of October 2008, Ingo Molnar wrote:
> 
> * Evgeniy Polyakov <s0mbre@tservice.net.ru> wrote:
> 
> > On Fri, Oct 10, 2008 at 01:42:45PM +0200, Ingo Molnar (mingo@elte.hu) wrote:
> > > > vanilla 27: 347.222
> > > > no TSO/GSO: 357.331
> > > > no hrticks: 382.983
> > > > no balance: 389.802
> > > 
> > > okay. The target is 470 MB/sec, right? (Assuming the workload is sane 
> > > and 'fixing' it does not mean we have to schedule worse.)
> > 
> > Well, that's where I started/stopped, so maybe we will even move
> > further? :)
> 
> that's the right attitude ;)

Can anyone please tell me if there was any conclusion of this thread?

Thanks,
Rafael

^ permalink raw reply	[flat|nested] 94+ messages in thread

* Re: [tbench regression fixes]: digging out smelly deadmen.
  2008-10-24 22:25             ` Rafael J. Wysocki
@ 2008-10-24 23:31               ` David Miller
  2008-10-25  4:05                 ` Mike Galbraith
  2008-10-25 11:13                 ` Rafael J. Wysocki
  2008-10-25  3:37               ` Mike Galbraith
  2008-10-26 11:29               ` Evgeniy Polyakov
  2 siblings, 2 replies; 94+ messages in thread
From: David Miller @ 2008-10-24 23:31 UTC (permalink / raw)
  To: rjw; +Cc: mingo, s0mbre, a.p.zijlstra, linux-kernel, netdev, efault

From: "Rafael J. Wysocki" <rjw@sisk.pl>
Date: Sat, 25 Oct 2008 00:25:34 +0200

> On Friday, 10 of October 2008, Ingo Molnar wrote:
> > 
> > * Evgeniy Polyakov <s0mbre@tservice.net.ru> wrote:
> > 
> > > On Fri, Oct 10, 2008 at 01:42:45PM +0200, Ingo Molnar (mingo@elte.hu) wrote:
> > > > > vanilla 27: 347.222
> > > > > no TSO/GSO: 357.331
> > > > > no hrticks: 382.983
> > > > > no balance: 389.802
> > > > 
> > > > okay. The target is 470 MB/sec, right? (Assuming the workload is sane 
> > > > and 'fixing' it does not mean we have to schedule worse.)
> > > 
> > > Well, that's where I started/stopped, so maybe we will even move
> > > further? :)
> > 
> > that's the right attitude ;)
> 
> Can anyone please tell me if there was any conclusion of this thread?

I made some more analysis in private with Ingo and Peter Z. and found
that the tbench decreases correlate pretty much directly with the
ongoing increasing cpu cost of wake_up() and friends in the fair
scheduler.

The largest increase in the computational cost of wakeups came in 2.6.27,
when the hrtimer bits got added; it more than tripled the cost of a wakeup.
In 2.6.28-rc1 the hrtimer feature has been disabled, and I think that
should be backported into the 2.6.27-stable branch.

Meanwhile, I'm spending some time in the background trying to replace the
fair scheduler's RB tree crud with something faster, so that maybe at some
point we can recover all of the regressions in this area caused by the
CFS code.

^ permalink raw reply	[flat|nested] 94+ messages in thread

* Re: [tbench regression fixes]: digging out smelly deadmen.
  2008-10-24 22:25             ` Rafael J. Wysocki
  2008-10-24 23:31               ` David Miller
@ 2008-10-25  3:37               ` Mike Galbraith
  2008-10-25  5:16                 ` David Miller
  2008-10-26 11:29               ` Evgeniy Polyakov
  2 siblings, 1 reply; 94+ messages in thread
From: Mike Galbraith @ 2008-10-25  3:37 UTC (permalink / raw)
  To: Rafael J. Wysocki
  Cc: Ingo Molnar, Evgeniy Polyakov, Peter Zijlstra, linux-kernel,
	netdev, David Miller

On Sat, 2008-10-25 at 00:25 +0200, Rafael J. Wysocki wrote:
> On Friday, 10 of October 2008, Ingo Molnar wrote:
> > 
> > * Evgeniy Polyakov <s0mbre@tservice.net.ru> wrote:
> > 
> > > On Fri, Oct 10, 2008 at 01:42:45PM +0200, Ingo Molnar (mingo@elte.hu) wrote:
> > > > > vanilla 27: 347.222
> > > > > no TSO/GSO: 357.331
> > > > > no hrticks: 382.983
> > > > > no balance: 389.802
> > > > 
> > > > okay. The target is 470 MB/sec, right? (Assuming the workload is sane 
> > > > and 'fixing' it does not mean we have to schedule worse.)
> > > 
> > > Well, that's where I started/stopped, so maybe we will even move
> > > further? :)
> > 
> > that's the right attitude ;)
> 
> Can anyone please tell me if there was any conclusion of this thread?

Part of the .27 regression was added scheduler overhead going from .26
to .27.  The scheduler overhead is now gone, but an unidentified source
of localhost throughput loss remains for both SMP and UP configs.

	-Mike

My last test data, updated to reflect recent commits:

Legend:
clock  = v2.6.26..5052696 + 5052696..v2.6.27-rc7 sched clock changes
weight = a7be37a + c9c294a + ced8aa1 (adds math overhead)
buddy  = 103638d (adds math overhead)
buddy_overhead = b0aa51b (removes math overhead of buddy)
revert_to_per_rq_vruntime = f9c0b09 (+2 lines, removes math overhead of weight)

2.6.26.6-up virgin
ring-test   - 1.169 us/cycle  = 855 KHz                                 1.000
netperf     - 130967.54 131143.75 130914.96 rr/s    avg 131008.75 rr/s  1.000
tbench      - 357.593 355.455 356.048 MB/sec        avg 356.365 MB/sec  1.000

2.6.26.6-up + clock + buddy + weight (== .27 scheduler)
ring-test   - 1.234 us/cycle  = 810 KHz                                  .947 [cmp1]
netperf     - 128026.62 128118.48 127973.54 rr/s    avg 128039.54 rr/s   .977
tbench      - 342.011 345.307 343.535 MB/sec        avg 343.617 MB/sec   .964

2.6.26.6-up + clock + buddy + weight + revert_to_per_rq_vruntime + buddy_overhead
ring-test   - 1.174 us/cycle  = 851 KHz                                  .995 [cmp2]
netperf     - 133928.03 134265.41 134297.06 rr/s    avg 134163.50 rr/s  1.024
tbench      - 358.049 359.529 358.342 MB/sec        avg 358.640 MB/sec  1.006

                                                       versus .26 counterpart
2.6.27-up virgin
ring-test   - 1.193 us/cycle  = 838 KHz                                 1.034 [vs cmp1]
netperf     - 121293.48 121700.96 120716.98 rr/s    avg 121237.14 rr/s   .946
tbench      - 340.362 339.780 341.353 MB/sec        avg 340.498 MB/sec   .990

2.6.27-up + revert_to_per_rq_vruntime + buddy_overhead
ring-test   - 1.122 us/cycle  = 891 KHz                                 1.047 [vs cmp2]
netperf     - 119353.27 118600.98 119719.12 rr/s    avg 119224.45 rr/s   .900
tbench      - 338.701 338.508 338.562 MB/sec        avg 338.590 MB/sec   .951

SMP config

2.6.26.6-smp virgin
ring-test   - 1.575 us/cycle  = 634 KHz                                 1.000
netperf     - 400487.72 400321.98 404165.10 rr/s    avg 401658.26 rr/s  1.000
tbench      - 1178.27 1177.18 1184.61 MB/sec        avg 1180.02 MB/sec  1.000

2.6.26.6-smp + clock + buddy + weight + revert_to_per_rq_vruntime + buddy_overhead
ring-test   - 1.575 us/cycle  = 634 KHz                                 1.000
netperf     - 412191.70 411873.15 414638.27 rr/s    avg 412901.04 rr/s  1.027
tbench      - 1193.18 1200.93 1199.61 MB/sec        avg 1197.90 MB/sec  1.015

                                                             versus 26.6 plus
2.6.27-smp virgin
ring-test   - 1.674 us/cycle  = 597 KHz                                  .941
netperf     - 382536.26 380931.29 380552.82 rr/s    avg 381340.12 rr/s   .923
tbench      - 1151.47 1143.21 1154.17 MB/sec        avg 1149.616 MB/sec  .959

2.6.27-smp + revert_to_per_rq_vruntime + buddy_overhead
ring-test   - 1.570 us/cycle  = 636 KHz                                 1.003
netperf     - 386487.91 389858.00 388180.91 rr/s    avg 388175.60 rr/s   .940
tbench      - 1179.52 1184.25 1180.18 MB/sec        avg 1181.31 MB/sec   .986




^ permalink raw reply	[flat|nested] 94+ messages in thread

* Re: [tbench regression fixes]: digging out smelly deadmen.
  2008-10-24 23:31               ` David Miller
@ 2008-10-25  4:05                 ` Mike Galbraith
  2008-10-25  5:15                   ` David Miller
  2008-10-25 11:13                 ` Rafael J. Wysocki
  1 sibling, 1 reply; 94+ messages in thread
From: Mike Galbraith @ 2008-10-25  4:05 UTC (permalink / raw)
  To: David Miller; +Cc: rjw, mingo, s0mbre, a.p.zijlstra, linux-kernel, netdev

On Fri, 2008-10-24 at 16:31 -0700, David Miller wrote:
> From: "Rafael J. Wysocki" <rjw@sisk.pl>
> Date: Sat, 25 Oct 2008 00:25:34 +0200
> 
> > On Friday, 10 of October 2008, Ingo Molnar wrote:
> > > 
> > > * Evgeniy Polyakov <s0mbre@tservice.net.ru> wrote:
> > > 
> > > > On Fri, Oct 10, 2008 at 01:42:45PM +0200, Ingo Molnar (mingo@elte.hu) wrote:
> > > > > > vanilla 27: 347.222
> > > > > > no TSO/GSO: 357.331
> > > > > > no hrticks: 382.983
> > > > > > no balance: 389.802
> > > > > 
> > > > > okay. The target is 470 MB/sec, right? (Assuming the workload is sane 
> > > > > and 'fixing' it does not mean we have to schedule worse.)
> > > > 
> > > > Well, that's where I started/stopped, so maybe we will even move
> > > > further? :)
> > > 
> > > that's the right attitude ;)
> > 
> > Can anyone please tell me if there was any conclusion of this thread?
> 
> I made some more analysis in private with Ingo and Peter Z. and found
> that the tbench decreases correlate pretty much directly with the
> ongoing increasing cpu cost of wake_up() and friends in the fair
> scheduler.
> 
> The largest increase in the computational cost of wakeups came in 2.6.27,
> when the hrtimer bits got added; it more than tripled the cost of a wakeup.
> In 2.6.28-rc1 the hrtimer feature has been disabled, and I think that
> should be backported into the 2.6.27-stable branch.
> 
> Meanwhile, I'm spending some time in the background trying to replace the
> fair scheduler's RB tree crud with something faster, so that maybe at some
> point we can recover all of the regressions in this area caused by the
> CFS code.

My test data indicates (to me anyway) that there is another source of
localhost throughput loss in .27.  In that data, there is no hrtick
overhead since I didn't have highres timers enabled, and computational
costs added in .27 were removed.  Dunno where it lives, but it does
appear to exist.

	-Mike


^ permalink raw reply	[flat|nested] 94+ messages in thread

* Re: [tbench regression fixes]: digging out smelly deadmen.
  2008-10-25  4:05                 ` Mike Galbraith
@ 2008-10-25  5:15                   ` David Miller
  2008-10-25  5:53                     ` Mike Galbraith
  0 siblings, 1 reply; 94+ messages in thread
From: David Miller @ 2008-10-25  5:15 UTC (permalink / raw)
  To: efault; +Cc: rjw, mingo, s0mbre, a.p.zijlstra, linux-kernel, netdev

From: Mike Galbraith <efault@gmx.de>
Date: Sat, 25 Oct 2008 06:05:01 +0200

> My test data indicates (to me anyway) that there is another source of
> localhost throughput loss in .27.  In that data, there is no hrtick
> overhead since I didn't have highres timers enabled, and computational
> costs added in .27 were removed.  Dunno where it lives, but it does
> appear to exist.

Disabling TSO on loopback doesn't fix that bit for you?


^ permalink raw reply	[flat|nested] 94+ messages in thread

* Re: [tbench regression fixes]: digging out smelly deadmen.
  2008-10-25  3:37               ` Mike Galbraith
@ 2008-10-25  5:16                 ` David Miller
  2008-10-25  5:58                   ` Mike Galbraith
  0 siblings, 1 reply; 94+ messages in thread
From: David Miller @ 2008-10-25  5:16 UTC (permalink / raw)
  To: efault; +Cc: rjw, mingo, s0mbre, a.p.zijlstra, linux-kernel, netdev

From: Mike Galbraith <efault@gmx.de>
Date: Sat, 25 Oct 2008 05:37:28 +0200

> Part of the .27 regression was added scheduler overhead going from .26
> to .27.  The scheduler overhead is now gone, but an unidentified source
> of localhost throughput loss remains for both SMP and UP configs.

It has to be the TSO thingy Evgeniy hit too, right?

If not, please bisect this.

^ permalink raw reply	[flat|nested] 94+ messages in thread

* Re: [tbench regression fixes]: digging out smelly deadmen.
  2008-10-25  5:15                   ` David Miller
@ 2008-10-25  5:53                     ` Mike Galbraith
  0 siblings, 0 replies; 94+ messages in thread
From: Mike Galbraith @ 2008-10-25  5:53 UTC (permalink / raw)
  To: David Miller; +Cc: rjw, mingo, s0mbre, a.p.zijlstra, linux-kernel, netdev

On Fri, 2008-10-24 at 22:15 -0700, David Miller wrote:
> From: Mike Galbraith <efault@gmx.de>
> Date: Sat, 25 Oct 2008 06:05:01 +0200
> 
> > My test data indicates (to me anyway) that there is another source of
> > localhost throughput loss in .27.  In that data, there is no hrtick
> > overhead since I didn't have highres timers enabled, and computational
> > costs added in .27 were removed.  Dunno where it lives, but it does
> > appear to exist.
> 
> Disabling TSO on loopback doesn't fix that bit for you?

No.  Those numbers are with TSO/GSO disabled.

I did a manual 100% revert of sched and everything related back to the 26
scheduler, and had ~the same result as these numbers.  27 with the 100%
revert actually performed a bit _worse_ for me than 27 with its
overhead.. which puzzles me greatly.

	-Mike


^ permalink raw reply	[flat|nested] 94+ messages in thread

* Re: [tbench regression fixes]: digging out smelly deadmen.
  2008-10-25  5:16                 ` David Miller
@ 2008-10-25  5:58                   ` Mike Galbraith
  2008-10-25  6:53                     ` Mike Galbraith
  2008-10-25  7:19                     ` David Miller
  0 siblings, 2 replies; 94+ messages in thread
From: Mike Galbraith @ 2008-10-25  5:58 UTC (permalink / raw)
  To: David Miller; +Cc: rjw, mingo, s0mbre, a.p.zijlstra, linux-kernel, netdev

On Fri, 2008-10-24 at 22:16 -0700, David Miller wrote:
> From: Mike Galbraith <efault@gmx.de>
> Date: Sat, 25 Oct 2008 05:37:28 +0200
> 
> > Part of the .27 regression was added scheduler overhead going from .26
> > to .27.  The scheduler overhead is now gone, but an unidentified source
> > of localhost throughput loss remains for both SMP and UP configs.
> 
> It has to be the TSO thingy Evgeniy hit too, right?

Dunno.

> If not, please bisect this.

(oh my <fword> gawd:)

I spent long days, man-weeks of them, trying to bisect and whatnot.  It's
immune to my feeble efforts, and my git-foo.

	-Mike


^ permalink raw reply	[flat|nested] 94+ messages in thread

* Re: [tbench regression fixes]: digging out smelly deadmen.
  2008-10-25  5:58                   ` Mike Galbraith
@ 2008-10-25  6:53                     ` Mike Galbraith
  2008-10-25  7:24                       ` David Miller
  2008-10-25  7:19                     ` David Miller
  1 sibling, 1 reply; 94+ messages in thread
From: Mike Galbraith @ 2008-10-25  6:53 UTC (permalink / raw)
  To: David Miller; +Cc: rjw, mingo, s0mbre, a.p.zijlstra, linux-kernel, netdev

On Sat, 2008-10-25 at 07:58 +0200, Mike Galbraith wrote:
> On Fri, 2008-10-24 at 22:16 -0700, David Miller wrote:

> > If not, please bisect this.
> 
> (oh my <fword> gawd:)
> 
> I spent long days, man-weeks of them, trying to bisect and whatnot.  It's
> immune to my feeble efforts, and my git-foo.

but..

(tbench/netperf numbers were measured with gcc-4.1 at this point in the
log; I went back and re-measured ring-test because I switched compilers)

2.6.22.19-up
ring-test   - 1.204 us/cycle  = 830 KHz  (gcc-4.1)
ring-test   - doorstop                   (gcc-4.3)
netperf     - 147798.56 rr/s  = 295 KHz  (hmm, a bit unstable, 140K..147K rr/s)
tbench      - 374.573 MB/sec

2.6.22.19-cfs-v24.1-up
ring-test   - 1.098 us/cycle  = 910 KHz  (gcc-4.1)
ring-test   - doorstop                   (gcc-4.3)
netperf     - 140039.03 rr/s  = 280 KHz = 3.57us - 1.10us sched = 2.47us/packet network
tbench      - 364.191 MB/sec

2.6.23.17-up
ring-test   - 1.252 us/cycle  = 798 KHz  (gcc-4.1)
ring-test   - 1.235 us/cycle  = 809 KHz  (gcc-4.3)
netperf     - 123736.40 rr/s  = 247 KHz  sb 268 KHZ / 134336.37 rr/s
tbench      - 355.906 MB/sec

2.6.23.17-cfs-v24.1-up
ring-test   - 1.100 us/cycle  = 909 KHz  (gcc-4.1)
ring-test   - 1.074 us/cycle  = 931 KHz  (gcc-4.3)
netperf     - 135847.14 rr/s  = 271 KHz  sb 280 KHz / 140039.03 rr/s
tbench      - 364.511 MB/sec

2.6.24.7-up
ring-test   - 1.100 us/cycle  = 909 KHz  (gcc-4.1)
ring-test   - 1.068 us/cycle  = 936 KHz  (gcc-4.3)
netperf     - 122300.66 rr/s  = 244 KHz  sb 280 KHz / 140039.03 rr/s
tbench      - 341.523 MB/sec

2.6.25.17-up
ring-test   - 1.163 us/cycle  = 859 KHz  (gcc-4.1)
ring-test   - 1.129 us/cycle  = 885 KHz  (gcc-4.3)
netperf     - 132102.70 rr/s  = 264 KHz  sb 275 KHz / 137627.30 rr/s
tbench      - 361.71 MB/sec

..in 25, something happened that dropped my max context switch rate from
~930 KHz to ~885 KHz.  Maybe I'll have better luck trying to find that.
Added to to-do list.  Benchmark mysteries I'm going to have to leave
alone, they've kicked my little butt quite thoroughly ;-)

	-Mike


^ permalink raw reply	[flat|nested] 94+ messages in thread

* Re: [tbench regression fixes]: digging out smelly deadmen.
  2008-10-25  5:58                   ` Mike Galbraith
  2008-10-25  6:53                     ` Mike Galbraith
@ 2008-10-25  7:19                     ` David Miller
  2008-10-25  7:33                       ` Mike Galbraith
  1 sibling, 1 reply; 94+ messages in thread
From: David Miller @ 2008-10-25  7:19 UTC (permalink / raw)
  To: efault; +Cc: rjw, mingo, s0mbre, a.p.zijlstra, linux-kernel, netdev

From: Mike Galbraith <efault@gmx.de>
Date: Sat, 25 Oct 2008 07:58:53 +0200

> I spent long days, man-weeks of them, trying to bisect and whatnot.  It's
> immune to my feeble efforts, and my git-foo.

I understand, this is what happened to me when I tried to look into
the gradual tbench regressions since 2.6.22

I guess the only way to attack these things is to analyze the code and
make some debugging hacks to get some measurements and numbers.

^ permalink raw reply	[flat|nested] 94+ messages in thread

* Re: [tbench regression fixes]: digging out smelly deadmen.
  2008-10-25  6:53                     ` Mike Galbraith
@ 2008-10-25  7:24                       ` David Miller
  2008-10-25  7:52                         ` Mike Galbraith
  2008-10-25 23:10                         ` Jiri Kosina
  0 siblings, 2 replies; 94+ messages in thread
From: David Miller @ 2008-10-25  7:24 UTC (permalink / raw)
  To: efault; +Cc: rjw, mingo, s0mbre, a.p.zijlstra, linux-kernel, netdev

From: Mike Galbraith <efault@gmx.de>
Date: Sat, 25 Oct 2008 08:53:43 +0200

> On Sat, 2008-10-25 at 07:58 +0200, Mike Galbraith wrote:
> 2.6.24.7-up
> ring-test   - 1.100 us/cycle  = 909 KHz  (gcc-4.1)
> ring-test   - 1.068 us/cycle  = 936 KHz  (gcc-4.3)
> netperf     - 122300.66 rr/s  = 244 KHz  sb 280 KHz / 140039.03 rr/s
> tbench      - 341.523 MB/sec
> 
> 2.6.25.17-up
> ring-test   - 1.163 us/cycle  = 859 KHz  (gcc-4.1)
> ring-test   - 1.129 us/cycle  = 885 KHz  (gcc-4.3)
> netperf     - 132102.70 rr/s  = 264 KHz  sb 275 KHz / 137627.30 rr/s
> tbench      - 361.71 MB/sec
> 
> ..in 25, something happened that dropped my max context switch rate from
> ~930 KHz to ~885 KHz.  Maybe I'll have better luck trying to find that.
> Added to to-do list.  Benchmark mysteries I'm going to have to leave
> alone, they've kicked my little butt quite thoroughly ;-)

But note that tbench performance improved a bit in 2.6.25.

In my tests I noticed a similar effect, but from 2.6.23 to 2.6.24,
weird.

Just for the public record here are the numbers I got in my testing.
Each entry was run purely on the latest 2.6.X-stable tree for each
release.  First is the tbench score and then there are 40 numbers
which are sparc64 cpu cycle counts of default_wake_function().

v2.6.22:

	Throughput 173.677 MB/sec  2 clients  2 procs  max_latency=38.192 ms

	1636 1483 1552 1560 1534 1522 1472 1530 1518 1468
	1534 1402 1468 1656 1383 1362 1516 1336 1392 1472
	1652 1522 1486 1363 1430 1334 1382 1398 1448 1439
	1662 1540 1526 1472 1539 1434 1452 1492 1502 1432

v2.6.23: This is when CFS got added to the tree.

	Throughput 167.933 MB/sec  2 clients  2 procs  max_latency=25.428 ms

	3435 3363 3165 3304 3401 3189 3280 3243 3156 3295
	3439 3375 2950 2945 2727 3383 3560 3417 3221 3271
	3595 3293 3323 3283 3267 3279 3343 3293 3203 3341
	3413 3268 3107 3361 3245 3195 3079 3184 3405 3191

v2.6.24:

	Throughput 170.314 MB/sec  2 clients  2 procs  max_latency=22.121 ms

	2136 1886 2030 1929 2021 1941 2009 2067 1895 2019
	2072 1985 1992 1986 2031 2085 2014 2103 1825 1705
	2018 2034 1921 2079 1901 1989 1976 2035 2053 1971
	2144 2059 2025 2024 2029 1932 1980 1947 1956 2008

v2.6.25:

	Throughput 165.294 MB/sec  2 clients  2 procs  max_latency=108.869 ms

	2551 2707 2674 2771 2641 2727 2647 2865 2800 2796
	2793 2745 2609 2753 2674 2618 2671 2668 2641 2744
	2727 2616 2897 2720 2682 2737 2551 2677 2687 2603
	2725 2717 2510 2682 2658 2581 2713 2608 2619 2586

v2.6.26:

	Throughput 160.759 MB/sec  2 clients  2 procs  max_latency=31.420 ms

	2576 2492 2556 2517 2496 2473 2620 2464 2535 2494
	2800 2297 2183 2634 2546 2579 2488 2455 2632 2540
	2566 2540 2536 2496 2432 2453 2462 2568 2406 2522
	2565 2620 2532 2416 2434 2452 2524 2440 2424 2412

v2.6.27:

	Throughput 143.776 MB/sec  2 clients  2 procs  max_latency=31.279 ms

	4783 4710 27307 4955 5363 4270 4514 4469 3949 4422
	4177 4424 4510 18290 4380 3956 4293 4368 3919 4283
	4607 3960 4294 3842 18957 3942 4402 4488 3988 5157
	4604 4219 4186 22628 4289 4149 4089 4543 4217 4075
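
(For reference: a minimal sketch of one way per-wakeup cycle deltas can be
sampled on a kernel of this vintage.  It is purely illustrative; the
function below is made up, would be installed on a wait queue of interest
via init_waitqueue_func_entry(), and is not necessarily how the numbers
above were collected.)

#include <linux/wait.h>
#include <linux/sched.h>
#include <linux/timex.h>

#define NSAMPLES 40

static cycles_t wake_cost[NSAMPLES];
static unsigned int wake_idx;

/* wraps default_wake_function() and records the cycle cost of each
 * wakeup it performs into a small buffer that can be dumped later */
static int timed_wake_function(wait_queue_t *curr, unsigned mode,
                               int sync, void *key)
{
        cycles_t t0 = get_cycles();
        int ret = default_wake_function(curr, mode, sync, key);

        if (wake_idx < NSAMPLES)
                wake_cost[wake_idx++] = get_cycles() - t0;
        return ret;
}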

^ permalink raw reply	[flat|nested] 94+ messages in thread

* Re: [tbench regression fixes]: digging out smelly deadmen.
  2008-10-25  7:19                     ` David Miller
@ 2008-10-25  7:33                       ` Mike Galbraith
  2008-10-27 17:26                         ` Rick Jones
  0 siblings, 1 reply; 94+ messages in thread
From: Mike Galbraith @ 2008-10-25  7:33 UTC (permalink / raw)
  To: David Miller; +Cc: rjw, mingo, s0mbre, a.p.zijlstra, linux-kernel, netdev

On Sat, 2008-10-25 at 00:19 -0700, David Miller wrote:
> From: Mike Galbraith <efault@gmx.de>
> Date: Sat, 25 Oct 2008 07:58:53 +0200
> 
> > I spent long day manweeks trying to bisect and whatnot.  It's immune to
> > my feeble efforts, and my git-foo.
> 
> I understand, this is what happened to me when I tried to look into
> the gradual tbench regressions since 2.6.22

That's exactly what I've been trying to look into, but combined with
netperf.  The thing is an incredibly twisted maze of _this_ affects
_that_... sometimes involving magic and/or mythical creatures.

Very very annoying.

	-Mike


^ permalink raw reply	[flat|nested] 94+ messages in thread

* Re: [tbench regression fixes]: digging out smelly deadmen.
  2008-10-25  7:24                       ` David Miller
@ 2008-10-25  7:52                         ` Mike Galbraith
  2008-10-25 23:10                         ` Jiri Kosina
  1 sibling, 0 replies; 94+ messages in thread
From: Mike Galbraith @ 2008-10-25  7:52 UTC (permalink / raw)
  To: David Miller; +Cc: rjw, mingo, s0mbre, a.p.zijlstra, linux-kernel, netdev

On Sat, 2008-10-25 at 00:24 -0700, David Miller wrote:
> From: Mike Galbraith <efault@gmx.de>
> Date: Sat, 25 Oct 2008 08:53:43 +0200
> 
> > On Sat, 2008-10-25 at 07:58 +0200, Mike Galbraith wrote:
> > 2.6.24.7-up
> > ring-test   - 1.100 us/cycle  = 909 KHz  (gcc-4.1)
> > ring-test   - 1.068 us/cycle  = 936 KHz  (gcc-4.3)
> > netperf     - 122300.66 rr/s  = 244 KHz  sb 280 KHz / 140039.03 rr/s
> > tbench      - 341.523 MB/sec
> > 
> > 2.6.25.17-up
> > ring-test   - 1.163 us/cycle  = 859 KHz  (gcc-4.1)
> > ring-test   - 1.129 us/cycle  = 885 KHz  (gcc-4.3)
> > netperf     - 132102.70 rr/s  = 264 KHz  sb 275 KHz / 137627.30 rr/s
> > tbench      - 361.71 MB/sec
> > 
> > ..in 25, something happened that dropped my max context switch rate from
> > ~930 KHz to ~885 KHz.  Maybe I'll have better luck trying to find that.
> > Added to to-do list.  Benchmark mysteries I'm going to have to leave
> > alone, they've kicked my little butt quite thoroughly ;-)
> 
> But note that tbench performance improved a bit in 2.6.25.

Yeah, netperf too.

> In my tests I noticed a similar effect, but from 2.6.23 to 2.6.24,
> weird.

23->24 I can understand.  In my testing, 23 CFS was not a wonderful
experience for rapid switchers.  24 is cfs-24.1.
 
> Just for the public record here are the numbers I got in my testing.
> Each entry was run purely on the latest 2.6.X-stable tree for each
> release.  First is the tbench score and then there are 40 numbers
> which are sparc64 cpu cycle counts of default_wake_function().

Your numbers seem to ~agree with mine.  And yeah, that hrtick is damned
expensive.  I didn't realize _how_ expensive until I trimmed my config
way way down from distro.  Just having highres timers enabled makes a
very large difference here, even without hrtick enabled, and with the
overhead of a disabled hrtick removed.

	-Mike


^ permalink raw reply	[flat|nested] 94+ messages in thread

* Re: [tbench regression fixes]: digging out smelly deadmen.
  2008-10-24 23:31               ` David Miller
  2008-10-25  4:05                 ` Mike Galbraith
@ 2008-10-25 11:13                 ` Rafael J. Wysocki
  2008-10-26  3:55                   ` David Miller
  1 sibling, 1 reply; 94+ messages in thread
From: Rafael J. Wysocki @ 2008-10-25 11:13 UTC (permalink / raw)
  To: David Miller; +Cc: mingo, s0mbre, a.p.zijlstra, linux-kernel, netdev, efault

On Saturday, 25 of October 2008, David Miller wrote:
> From: "Rafael J. Wysocki" <rjw@sisk.pl>
> Date: Sat, 25 Oct 2008 00:25:34 +0200
> 
> > On Friday, 10 of October 2008, Ingo Molnar wrote:
> > > 
> > > * Evgeniy Polyakov <s0mbre@tservice.net.ru> wrote:
> > > 
> > > > On Fri, Oct 10, 2008 at 01:42:45PM +0200, Ingo Molnar (mingo@elte.hu) wrote:
> > > > > > vanilla 27: 347.222
> > > > > > no TSO/GSO: 357.331
> > > > > > no hrticks: 382.983
> > > > > > no balance: 389.802
> > > > > 
> > > > > okay. The target is 470 MB/sec, right? (Assuming the workload is sane 
> > > > > and 'fixing' it does not mean we have to schedule worse.)
> > > > 
> > > > Well, that's where I started/stopped, so maybe we will even move
> > > > further? :)
> > > 
> > > that's the right attitude ;)
> > 
> > Can anyone please tell me if there was any conclusion of this thread?
> 
> I made some more analysis in private with Ingo and Peter Z. and found
> that the tbench decreases correlate pretty much directly with the
> ongoing increasing cpu cost of wake_up() and friends in the fair
> scheduler.
> 
> The largest increase in the computational cost of wakeups came in 2.6.27,
> when the hrtimer bits got added; it more than tripled the cost of a wakeup.
> In 2.6.28-rc1 the hrtimer feature has been disabled, and I think that
> should be backported into the 2.6.27-stable branch.

Thanks a lot for the info.

Could you please give me a pointer to the commit disabling the hrtimer feature?

Rafael

^ permalink raw reply	[flat|nested] 94+ messages in thread

* Re: [tbench regression fixes]: digging out smelly deadmen.
  2008-10-25  7:24                       ` David Miller
  2008-10-25  7:52                         ` Mike Galbraith
@ 2008-10-25 23:10                         ` Jiri Kosina
  2008-10-26  8:46                           ` Mike Galbraith
  1 sibling, 1 reply; 94+ messages in thread
From: Jiri Kosina @ 2008-10-25 23:10 UTC (permalink / raw)
  To: David Miller
  Cc: efault, rjw, Ingo Molnar, s0mbre, a.p.zijlstra, linux-kernel, netdev

On Sat, 25 Oct 2008, David Miller wrote:

> But note that tbench performance improved a bit in 2.6.25.
> In my tests I noticed a similar effect, but from 2.6.23 to 2.6.24,
> weird.
> Just for the public record here are the numbers I got in my testing.

I have recently been looking at a very similar-looking issue. For the
public record, here are the numbers we have been able to come up with so
far (measured with dbench, so the absolute values are slightly different,
but they still show a similar pattern)

208.4 MB/sec  -- vanilla 2.6.16.60
201.6 MB/sec  -- vanilla 2.6.20.1
172.9 MB/sec  -- vanilla 2.6.22.19
74.2 MB/sec   -- vanilla 2.6.23
 46.1 MB/sec  -- vanilla 2.6.24.2
 30.6 MB/sec  -- vanilla 2.6.26.1

I.e. huge drop for 2.6.23 (this was with default configs for each 
respective kernel).
2.6.23-rc1 shows 80.5 MB/s, i.e. a few % better than final 2.6.23, but 
still pretty bad. 

I have gone through the commits that went into -rc1 and tried to figure 
out which one could be responsible. Here are the numbers:

 85.3 MB/s for 2ba2d00363  (just before on-demand readahead was merged)
 82.7 MB/s for 45426812d6  (before cond_resched() was added into the page
                            invalidation code)
187.7 MB/s for c1e4fe711a4 (just before the CFS scheduler was merged)

So the current biggest suspect is CFS, but I don't have enough numbers yet
to be able to point a finger at it with 100% certainty. Hopefully soon.

Just my $0.02

-- 
Jiri Kosina
SUSE Labs


^ permalink raw reply	[flat|nested] 94+ messages in thread

* Re: [tbench regression fixes]: digging out smelly deadmen.
  2008-10-25 11:13                 ` Rafael J. Wysocki
@ 2008-10-26  3:55                   ` David Miller
  2008-10-26 11:33                     ` Rafael J. Wysocki
  0 siblings, 1 reply; 94+ messages in thread
From: David Miller @ 2008-10-26  3:55 UTC (permalink / raw)
  To: rjw; +Cc: mingo, s0mbre, a.p.zijlstra, linux-kernel, netdev, efault

From: "Rafael J. Wysocki" <rjw@sisk.pl>
Date: Sat, 25 Oct 2008 13:13:20 +0200

> Could you please give me a pointer to the commit disabling the hrtimer feature?

Here it is:

commit 0c4b83da58ec2e96ce9c44c211d6eac5f9dae478
Author: Ingo Molnar <mingo@elte.hu>
Date:   Mon Oct 20 14:27:43 2008 +0200

    sched: disable the hrtick for now
    
    David Miller reported that hrtick update overhead has tripled the
    wakeup overhead on Sparc64.
    
    That is too much - disable the HRTICK feature for now by default,
    until a faster implementation is found.
    
    Reported-by: David Miller <davem@davemloft.net>
    Acked-by: Peter Zijlstra <peterz@infradead.org>
    Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/kernel/sched_features.h b/kernel/sched_features.h
index 7c9e8f4..fda0162 100644
--- a/kernel/sched_features.h
+++ b/kernel/sched_features.h
@@ -5,7 +5,7 @@ SCHED_FEAT(START_DEBIT, 1)
 SCHED_FEAT(AFFINE_WAKEUPS, 1)
 SCHED_FEAT(CACHE_HOT_BUDDY, 1)
 SCHED_FEAT(SYNC_WAKEUPS, 1)
-SCHED_FEAT(HRTICK, 1)
+SCHED_FEAT(HRTICK, 0)
 SCHED_FEAT(DOUBLE_TICK, 0)
 SCHED_FEAT(ASYM_GRAN, 1)
 SCHED_FEAT(LB_BIAS, 1)

^ permalink raw reply related	[flat|nested] 94+ messages in thread

* Re: [tbench regression fixes]: digging out smelly deadmen.
  2008-10-25 23:10                         ` Jiri Kosina
@ 2008-10-26  8:46                           ` Mike Galbraith
  2008-10-26  9:00                             ` Peter Zijlstra
  0 siblings, 1 reply; 94+ messages in thread
From: Mike Galbraith @ 2008-10-26  8:46 UTC (permalink / raw)
  To: Jiri Kosina
  Cc: David Miller, rjw, Ingo Molnar, s0mbre, a.p.zijlstra,
	linux-kernel, netdev

On Sun, 2008-10-26 at 01:10 +0200, Jiri Kosina wrote:
> On Sat, 25 Oct 2008, David Miller wrote:
> 
> > But note that tbench performance improved a bit in 2.6.25.
> > In my tests I noticed a similar effect, but from 2.6.23 to 2.6.24,
> > weird.
> > Just for the public record here are the numbers I got in my testing.
> 
> I have recently been looking at a very similar-looking issue. For the
> public record, here are the numbers we have been able to come up with so
> far (measured with dbench, so the absolute values are slightly different,
> but they still show a similar pattern)
> 
> 208.4 MB/sec  -- vanilla 2.6.16.60
> 201.6 MB/sec  -- vanilla 2.6.20.1
> 172.9 MB/sec  -- vanilla 2.6.22.19
> 74.2 MB/sec   -- vanilla 2.6.23
>  46.1 MB/sec  -- vanilla 2.6.24.2
>  30.6 MB/sec  -- vanilla 2.6.26.1
> 
> I.e. huge drop for 2.6.23 (this was with default configs for each 
> respective kernel).
> 2.6.23-rc1 shows 80.5 MB/s, i.e. a few % better than final 2.6.23, but 
> still pretty bad. 
> 
> I have gone through the commits that went into -rc1 and tried to figure 
> out which one could be responsible. Here are the numbers:
> 
>  85.3 MB/s for 2ba2d00363  (just before on-demand readahead was merged)
>  82.7 MB/s for 45426812d6  (before cond_resched() was added into the page
>                             invalidation code)
> 187.7 MB/s for c1e4fe711a4 (just before the CFS scheduler was merged)
> 
> So the current biggest suspect is CFS, but I don't have enough numbers yet
> to be able to point a finger at it with 100% certainty. Hopefully soon.

Hi,

High client count, right?

I reproduced this on my Q6600 box.  However, I also reproduced it with
2.6.22.19.  What I think you're seeing is just dbench creating a massive
train wreck.  With CFS, the wreck appears more likely to _sustain_ from
start to end, but the wreckage is present in O(1) scheduler runs as well,
and will sustain from start to end there too.

2.6.22.19-smp           Throughput 967.933 MB/sec 16 procs Throughput 147.879 MB/sec 160 procs
                        Throughput 950.325 MB/sec 16 procs Throughput 349.959 MB/sec 160 procs
                        Throughput 953.382 MB/sec 16 procs Throughput 126.821 MB/sec 160 procs <== massive jitter
2.6.22.19-cfs-v24.1-smp Throughput 978.047 MB/sec 16 procs Throughput 170.662 MB/sec 160 procs
                        Throughput 943.254 MB/sec 16 procs Throughput 39.388 MB/sec 160 procs <== sustained train wreck
                        Throughput 934.042 MB/sec 16 procs Throughput 239.574 MB/sec 160 procs
2.6.23.17-smp           Throughput 1173.97 MB/sec 16 procs Throughput 100.996 MB/sec 160 procs
                        Throughput 1122.85 MB/sec 16 procs Throughput 80.3747 MB/sec 160 procs
                        Throughput 1113.60 MB/sec 16 procs Throughput 99.3723 MB/sec 160 procs
2.6.24.7-smp            Throughput 1030.34 MB/sec 16 procs Throughput 256.419 MB/sec 160 procs
                        Throughput 970.602 MB/sec 16 procs Throughput 257.008 MB/sec 160 procs
                        Throughput 1056.48 MB/sec 16 procs Throughput 248.841 MB/sec 160 procs
2.6.25.19-smp           Throughput 955.874 MB/sec 16 procs Throughput 40.5735 MB/sec 160 procs
                        Throughput 943.348 MB/sec 16 procs Throughput 62.3966 MB/sec 160 procs
                        Throughput 937.595 MB/sec 16 procs Throughput 17.4639 MB/sec 160 procs
2.6.26.7-smp            Throughput 904.564 MB/sec 16 procs Throughput 118.364 MB/sec 160 procs
                        Throughput 891.824 MB/sec 16 procs Throughput 34.2193 MB/sec 160 procs
                        Throughput 880.850 MB/sec 16 procs Throughput 22.4938 MB/sec 160 procs
2.6.27.4-smp            Throughput 856.660 MB/sec 16 procs Throughput 168.243 MB/sec 160 procs
                        Throughput 880.121 MB/sec 16 procs Throughput 120.132 MB/sec 160 procs
                        Throughput 880.121 MB/sec 16 procs Throughput 142.105 MB/sec 160 procs

Check out fugliness:

2.6.22.19-smp  Throughput 35.5075 MB/sec 160 procs  (start->end sustained train wreck)

Full output from above run:

dbench version 3.04 - Copyright Andrew Tridgell 1999-2004

Running for 60 seconds with load '/usr/share/dbench/client.txt' and minimum warmup 12 secs
160 clients started
 160        54   310.43 MB/sec  warmup   1 sec   
 160        54   155.18 MB/sec  warmup   2 sec   
 160        54   103.46 MB/sec  warmup   3 sec   
 160        54    77.59 MB/sec  warmup   4 sec   
 160        56    64.81 MB/sec  warmup   5 sec   
 160        57    54.01 MB/sec  warmup   6 sec   
 160        57    46.29 MB/sec  warmup   7 sec   
 160       812   129.07 MB/sec  warmup   8 sec   
 160      1739   205.08 MB/sec  warmup   9 sec   
 160      2634   262.22 MB/sec  warmup  10 sec   
 160      3437   305.41 MB/sec  warmup  11 sec   
 160      3815   307.35 MB/sec  warmup  12 sec   
 160      4241   311.07 MB/sec  warmup  13 sec   
 160      5142   344.02 MB/sec  warmup  14 sec   
 160      5991   369.46 MB/sec  warmup  15 sec   
 160      6346   369.09 MB/sec  warmup  16 sec   
 160      6347   347.97 MB/sec  warmup  17 sec   
 160      6347   328.66 MB/sec  warmup  18 sec   
 160      6348   311.50 MB/sec  warmup  19 sec   
 160      6348     0.00 MB/sec  execute   1 sec   
 160      6348     2.08 MB/sec  execute   2 sec   
 160      6349     2.75 MB/sec  execute   3 sec   
 160      6356    16.25 MB/sec  execute   4 sec   
 160      6360    17.21 MB/sec  execute   5 sec   
 160      6574    45.07 MB/sec  execute   6 sec   
 160      6882    76.17 MB/sec  execute   7 sec   
 160      7006    86.37 MB/sec  execute   8 sec   
 160      7006    76.77 MB/sec  execute   9 sec   
 160      7006    69.09 MB/sec  execute  10 sec   
 160      7039    68.67 MB/sec  execute  11 sec   
 160      7043    64.71 MB/sec  execute  12 sec   
 160      7044    60.29 MB/sec  execute  13 sec   
 160      7044    55.98 MB/sec  execute  14 sec   
 160      7057    56.13 MB/sec  execute  15 sec   
 160      7057    52.63 MB/sec  execute  16 sec   
 160      7059    50.21 MB/sec  execute  17 sec   
 160      7083    49.73 MB/sec  execute  18 sec   
 160      7086    48.05 MB/sec  execute  19 sec   
 160      7088    46.40 MB/sec  execute  20 sec   
 160      7088    44.19 MB/sec  execute  21 sec   
 160      7094    43.59 MB/sec  execute  22 sec   
 160      7094    41.69 MB/sec  execute  23 sec   
 160      7094    39.96 MB/sec  execute  24 sec   
 160      7094    38.36 MB/sec  execute  25 sec   
 160      7094    36.88 MB/sec  execute  26 sec   
 160      7094    35.52 MB/sec  execute  27 sec   
 160      7098    34.91 MB/sec  execute  28 sec   
 160      7124    36.72 MB/sec  execute  29 sec   
 160      7124    35.50 MB/sec  execute  30 sec   
 160      7124    34.35 MB/sec  execute  31 sec   
 160      7124    33.28 MB/sec  execute  32 sec   
 160      7124    32.27 MB/sec  execute  33 sec   
 160      7124    31.32 MB/sec  execute  34 sec   
 160      7283    34.80 MB/sec  execute  35 sec   
 160      7681    44.95 MB/sec  execute  36 sec   
 160      7681    43.79 MB/sec  execute  37 sec   
 160      7681    42.64 MB/sec  execute  38 sec   
 160      7689    42.23 MB/sec  execute  39 sec   
 160      7691    41.48 MB/sec  execute  40 sec   
 160      7693    40.76 MB/sec  execute  41 sec   
 160      7703    40.54 MB/sec  execute  42 sec   
 160      7704    39.81 MB/sec  execute  43 sec   
 160      7704    38.91 MB/sec  execute  44 sec   
 160      7704    38.04 MB/sec  execute  45 sec   
 160      7704    37.21 MB/sec  execute  46 sec   
 160      7704    36.42 MB/sec  execute  47 sec   
 160      7704    35.66 MB/sec  execute  48 sec   
 160      7747    36.58 MB/sec  execute  49 sec   
 160      7854    38.00 MB/sec  execute  50 sec   
 160      7857    37.65 MB/sec  execute  51 sec   
 160      7861    37.29 MB/sec  execute  52 sec   
 160      7862    36.67 MB/sec  execute  53 sec   
 160      7864    36.21 MB/sec  execute  54 sec   
 160      7877    35.85 MB/sec  execute  55 sec   
 160      7877    35.21 MB/sec  execute  56 sec   
 160      8015    37.11 MB/sec  execute  57 sec   
 160      8019    36.57 MB/sec  execute  58 sec   
 160      8019    35.95 MB/sec  execute  59 sec   
 160      8019    35.36 MB/sec  cleanup  60 sec   
 160      8019    34.78 MB/sec  cleanup  61 sec   
 160      8019    34.23 MB/sec  cleanup  63 sec   
 160      8019    33.69 MB/sec  cleanup  64 sec   
 160      8019    33.16 MB/sec  cleanup  65 sec   
 160      8019    32.65 MB/sec  cleanup  66 sec   
 160      8019    32.21 MB/sec  cleanup  67 sec   
 160      8019    31.73 MB/sec  cleanup  68 sec   
 160      8019    31.27 MB/sec  cleanup  69 sec   
 160      8019    30.84 MB/sec  cleanup  70 sec   
 160      8019    30.40 MB/sec  cleanup  71 sec   
 160      8019    29.98 MB/sec  cleanup  72 sec   
 160      8019    29.58 MB/sec  cleanup  73 sec   
 160      8019    29.18 MB/sec  cleanup  74 sec   
 160      8019    29.03 MB/sec  cleanup  74 sec   

Throughput 35.5075 MB/sec 160 procs

Throughput 180.934 MB/sec 160 procs (next run, non-sustained train wreck)

Full output of this run:

dbench version 3.04 - Copyright Andrew Tridgell 1999-2004

Running for 60 seconds with load '/usr/share/dbench/client.txt' and minimum warmup 12 secs
160 clients started
 160        67   321.43 MB/sec  warmup   1 sec   
 160        67   160.61 MB/sec  warmup   2 sec   
 160        67   107.04 MB/sec  warmup   3 sec   
 160        67    80.27 MB/sec  warmup   4 sec   
 160        67    64.21 MB/sec  warmup   5 sec   
 160       267    89.74 MB/sec  warmup   6 sec   
 160      1022   169.68 MB/sec  warmup   7 sec   
 160      1821   240.62 MB/sec  warmup   8 sec   
 160      2591   290.39 MB/sec  warmup   9 sec   
 160      3125   308.04 MB/sec  warmup  10 sec   
 160      3125   280.04 MB/sec  warmup  11 sec   
 160      3217   263.23 MB/sec  warmup  12 sec   
 160      3725   276.45 MB/sec  warmup  13 sec   
 160      4237   288.32 MB/sec  warmup  14 sec   
 160      4748   300.98 MB/sec  warmup  15 sec   
 160      4810   286.69 MB/sec  warmup  16 sec   
 160      4812   270.89 MB/sec  warmup  17 sec   
 160      4812   255.95 MB/sec  warmup  18 sec   
 160      4812   242.48 MB/sec  warmup  19 sec   
 160      4812   230.35 MB/sec  warmup  20 sec   
 160      4812   219.38 MB/sec  warmup  21 sec   
 160      4812   209.41 MB/sec  warmup  22 sec   
 160      4812   200.31 MB/sec  warmup  23 sec   
 160      4812   191.96 MB/sec  warmup  24 sec   
 160      4812   184.28 MB/sec  warmup  25 sec   
 160      4812   177.19 MB/sec  warmup  26 sec   
 160      4836   175.89 MB/sec  warmup  27 sec   
 160      4836   169.61 MB/sec  warmup  28 sec   
 160      4841   163.97 MB/sec  warmup  29 sec   
 160      5004   163.03 MB/sec  warmup  30 sec   
 160      5450   170.58 MB/sec  warmup  31 sec   
 160      5951   178.79 MB/sec  warmup  32 sec   
 160      6086   176.86 MB/sec  warmup  33 sec   
 160      6127   174.53 MB/sec  warmup  34 sec   
 160      6129   169.67 MB/sec  warmup  35 sec   
 160      6131   165.36 MB/sec  warmup  36 sec   
 160      6137   161.65 MB/sec  warmup  37 sec   
 160      6141   157.85 MB/sec  warmup  38 sec   
 160      6145   154.32 MB/sec  warmup  39 sec   
 160      6145   150.46 MB/sec  warmup  40 sec   
 160      6145   146.79 MB/sec  warmup  41 sec   
 160      6145   143.30 MB/sec  warmup  42 sec   
 160      6145   139.97 MB/sec  warmup  43 sec   
 160      6145   136.78 MB/sec  warmup  44 sec   
 160      6145   133.74 MB/sec  warmup  45 sec   
 160      6145   130.84 MB/sec  warmup  46 sec   
 160      6145   128.05 MB/sec  warmup  47 sec   
 160      6178   128.41 MB/sec  warmup  48 sec   
 160      6180   126.13 MB/sec  warmup  49 sec   
 160      6184   124.09 MB/sec  warmup  50 sec   
 160      6187   122.03 MB/sec  warmup  51 sec   
 160      6192   120.19 MB/sec  warmup  52 sec   
 160      6196   118.42 MB/sec  warmup  53 sec   
 160      6228   116.88 MB/sec  warmup  54 sec   
 160      6231   114.97 MB/sec  warmup  55 sec   
 160      6231   112.92 MB/sec  warmup  56 sec   
 160      6398   114.17 MB/sec  warmup  57 sec   
 160      6401   112.44 MB/sec  warmup  58 sec   
 160      6402   110.69 MB/sec  warmup  59 sec   
 160      6402   108.84 MB/sec  warmup  60 sec   
 160      6405   107.38 MB/sec  warmup  61 sec   
 160      6405   105.65 MB/sec  warmup  62 sec   
 160      6407   104.03 MB/sec  warmup  64 sec   
 160      6431   103.16 MB/sec  warmup  65 sec   
 160      6432   101.64 MB/sec  warmup  66 sec   
 160      6432   100.10 MB/sec  warmup  67 sec   
 160      6460    99.42 MB/sec  warmup  68 sec   
 160      6698   100.92 MB/sec  warmup  69 sec   
 160      7218   106.21 MB/sec  warmup  70 sec   
 160      7254    36.49 MB/sec  execute   1 sec   
 160      7254    18.24 MB/sec  execute   2 sec   
 160      7259    21.06 MB/sec  execute   3 sec   
 160      7359    37.80 MB/sec  execute   4 sec   
 160      7381    34.05 MB/sec  execute   5 sec   
 160      7381    28.37 MB/sec  execute   6 sec   
 160      7381    24.32 MB/sec  execute   7 sec   
 160      7381    21.28 MB/sec  execute   8 sec   
 160      7404    21.03 MB/sec  execute   9 sec   
 160      7647    43.24 MB/sec  execute  10 sec   
 160      7649    39.94 MB/sec  execute  11 sec   
 160      7672    38.48 MB/sec  execute  12 sec   
 160      7680    37.10 MB/sec  execute  13 sec   
 160      7856    46.09 MB/sec  execute  14 sec   
 160      7856    43.02 MB/sec  execute  15 sec   
 160      7856    40.33 MB/sec  execute  16 sec   
 160      7856    37.99 MB/sec  execute  17 sec   
 160      8561    71.30 MB/sec  execute  18 sec   
 160      9070    92.10 MB/sec  execute  19 sec   
 160      9080    88.86 MB/sec  execute  20 sec   
 160      9086    86.13 MB/sec  execute  21 sec   
 160      9089    82.70 MB/sec  execute  22 sec   
 160      9095    79.98 MB/sec  execute  23 sec   
 160      9098    77.32 MB/sec  execute  24 sec   
 160      9101    74.78 MB/sec  execute  25 sec   
 160      9105    72.70 MB/sec  execute  26 sec   
 160      9107    70.34 MB/sec  execute  27 sec   
 160      9110    68.40 MB/sec  execute  28 sec   
 160      9114    66.60 MB/sec  execute  29 sec   
 160      9114    64.38 MB/sec  execute  30 sec   
 160      9114    62.30 MB/sec  execute  31 sec   
 160      9146    61.31 MB/sec  execute  32 sec   
 160      9493    68.80 MB/sec  execute  33 sec   
 160     10040    80.50 MB/sec  execute  34 sec   
 160     10567    91.12 MB/sec  execute  35 sec   
 160     10908    96.72 MB/sec  execute  36 sec   
 160     11234   101.86 MB/sec  execute  37 sec   
 160     12062   118.23 MB/sec  execute  38 sec   
 160     12987   135.90 MB/sec  execute  39 sec   
 160     13883   152.07 MB/sec  execute  40 sec   
 160     14730   166.18 MB/sec  execute  41 sec   
 160     14829   165.26 MB/sec  execute  42 sec   
 160     14836   162.03 MB/sec  execute  43 sec   
 160     14851   158.64 MB/sec  execute  44 sec   
 160     14851   155.11 MB/sec  execute  45 sec   
 160     14851   151.74 MB/sec  execute  46 sec   
 160     15022   151.70 MB/sec  execute  47 sec   
 160     15292   153.38 MB/sec  execute  48 sec   
 160     15580   155.28 MB/sec  execute  49 sec   
 160     15846   156.73 MB/sec  execute  50 sec   
 160     16449   164.00 MB/sec  execute  51 sec   
 160     17097   171.56 MB/sec  execute  52 sec   
 160     17097   168.32 MB/sec  execute  53 sec   
 160     17310   168.62 MB/sec  execute  54 sec   
 160     18075   177.42 MB/sec  execute  55 sec   
 160     18828   186.31 MB/sec  execute  56 sec   
 160     18876   184.04 MB/sec  execute  57 sec   
 160     18876   180.87 MB/sec  execute  58 sec   
 160     18879   177.81 MB/sec  execute  59 sec   
 160     19294   180.80 MB/sec  cleanup  60 sec   
 160     19294   177.84 MB/sec  cleanup  61 sec   
 160     19294   174.97 MB/sec  cleanup  63 sec   
 160     19294   172.24 MB/sec  cleanup  64 sec   
 160     19294   169.55 MB/sec  cleanup  65 sec   
 160     19294   166.95 MB/sec  cleanup  66 sec   
 160     19294   164.42 MB/sec  cleanup  67 sec   
 160     19294   161.97 MB/sec  cleanup  68 sec   
 160     19294   159.59 MB/sec  cleanup  69 sec   
 160     19294   157.28 MB/sec  cleanup  70 sec   
 160     19294   155.03 MB/sec  cleanup  71 sec   
 160     19294   152.86 MB/sec  cleanup  72 sec   
 160     19294   150.76 MB/sec  cleanup  73 sec   
 160     19294   148.71 MB/sec  cleanup  74 sec   
 160     19294   146.70 MB/sec  cleanup  75 sec   
 160     19294   144.75 MB/sec  cleanup  76 sec   
 160     19294   142.85 MB/sec  cleanup  77 sec   
 160     19294   141.72 MB/sec  cleanup  77 sec   

Throughput 180.934 MB/sec 160 procs



^ permalink raw reply	[flat|nested] 94+ messages in thread

* Re: [tbench regression fixes]: digging out smelly deadmen.
  2008-10-26  8:46                           ` Mike Galbraith
@ 2008-10-26  9:00                             ` Peter Zijlstra
  2008-10-26  9:11                               ` Andrew Morton
  2008-10-26  9:15                               ` Mike Galbraith
  0 siblings, 2 replies; 94+ messages in thread
From: Peter Zijlstra @ 2008-10-26  9:00 UTC (permalink / raw)
  To: Mike Galbraith
  Cc: Jiri Kosina, David Miller, rjw, Ingo Molnar, s0mbre,
	linux-kernel, netdev, Andrew Morton

On Sun, 2008-10-26 at 09:46 +0100, Mike Galbraith wrote: 
> On Sun, 2008-10-26 at 01:10 +0200, Jiri Kosina wrote:
> > On Sat, 25 Oct 2008, David Miller wrote:
> > 
> > > But note that tbench performance improved a bit in 2.6.25.
> > > In my tests I noticed a similar effect, but from 2.6.23 to 2.6.24,
> > > weird.
> > > Just for the public record here are the numbers I got in my testing.
> > 
> > I have currently been looking at a very similar-looking issue. For the 
> > public record, here are the numbers we have been able to come up with so 
> > far (measured with dbench, so the absolute values are slightly different, 
> > but they still show a similar pattern)
> > 
> > 208.4 MB/sec  -- vanilla 2.6.16.60
> > 201.6 MB/sec  -- vanilla 2.6.20.1
> > 172.9 MB/sec  -- vanilla 2.6.22.19
> > 74.2 MB/sec   -- vanilla 2.6.23
> >  46.1 MB/sec  -- vanilla 2.6.24.2
> >  30.6 MB/sec  -- vanilla 2.6.26.1
> > 
> > I.e. huge drop for 2.6.23 (this was with default configs for each 
> > respective kernel).
> > 2.6.23-rc1 shows 80.5 MB/s, i.e. a few % better than final 2.6.23, but 
> > still pretty bad. 
> > 
> > I have gone through the commits that went into -rc1 and tried to figure 
> > out which one could be responsible. Here are the numbers:
> > 
> >  85.3 MB/s for 2ba2d00363 (just before on-demand readahead has been merged)
> >  82.7 MB/s for 45426812d6 (before cond_resched() has been added into page 
> >                            invalidation code)
> > 187.7 MB/s for c1e4fe711a4 (just before CFS scheduler has been merged)
> > 
> > So the current biggest suspect is CFS, but I don't have enough numbers yet 
> > to be able to point a finger at it with 100% certainty. Hopefully soon.

> I reproduced this on my Q6600 box.  However, I also reproduced it with
> 2.6.22.19.  What I think you're seeing is just dbench creating a
> massive train wreck. 

wasn't dbench one of those non-benchmarks that thrives on randomness and
unfairness?

Andrew said recently:
  "dbench is pretty chaotic and it could be that a good change causes
dbench to get worse.  That's happened plenty of times in the past."

So I'm not inclined to worry too much about dbench in any way shape or
form.




^ permalink raw reply	[flat|nested] 94+ messages in thread

* Re: [tbench regression fixes]: digging out smelly deadmen.
  2008-10-26  9:00                             ` Peter Zijlstra
@ 2008-10-26  9:11                               ` Andrew Morton
  2008-10-26  9:27                                 ` Evgeniy Polyakov
                                                   ` (2 more replies)
  2008-10-26  9:15                               ` Mike Galbraith
  1 sibling, 3 replies; 94+ messages in thread
From: Andrew Morton @ 2008-10-26  9:11 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Mike Galbraith, Jiri Kosina, David Miller, rjw, Ingo Molnar,
	s0mbre, linux-kernel, netdev

On Sun, 26 Oct 2008 10:00:48 +0100 Peter Zijlstra <a.p.zijlstra@chello.nl> wrote:

> On Sun, 2008-10-26 at 09:46 +0100, Mike Galbraith wrote: 
> > On Sun, 2008-10-26 at 01:10 +0200, Jiri Kosina wrote:
> > > On Sat, 25 Oct 2008, David Miller wrote:
> > > 
> > > > But note that tbench performance improved a bit in 2.6.25.
> > > > In my tests I noticed a similar effect, but from 2.6.23 to 2.6.24,
> > > > weird.
> > > > Just for the public record here are the numbers I got in my testing.
> > > 
> > > I have currently been looking at a very similar-looking issue. For the 
> > > public record, here are the numbers we have been able to come up with so 
> > > far (measured with dbench, so the absolute values are slightly different, 
> > > but they still show a similar pattern)
> > > 
> > > 208.4 MB/sec  -- vanilla 2.6.16.60
> > > 201.6 MB/sec  -- vanilla 2.6.20.1
> > > 172.9 MB/sec  -- vanilla 2.6.22.19
> > > 74.2 MB/sec   -- vanilla 2.6.23
> > >  46.1 MB/sec  -- vanilla 2.6.24.2
> > >  30.6 MB/sec  -- vanilla 2.6.26.1
> > > 
> > > I.e. huge drop for 2.6.23 (this was with default configs for each 
> > > respective kernel).

Was this when we decreased the default value of
/proc/sys/vm/dirty_ratio, perhaps?  dbench is sensitive to that.

> > > 2.6.23-rc1 shows 80.5 MB/s, i.e. a few % better than final 2.6.23, but 
> > > still pretty bad. 
> > > 
> > > I have gone through the commits that went into -rc1 and tried to figure 
> > > out which one could be responsible. Here are the numbers:
> > > 
> > >  85.3 MB/s for 2ba2d00363 (just before on-demand readahead has been merged)
> > >  82.7 MB/s for 45426812d6 (before cond_resched() has been added into page 
> > >                            invalidation code)
> > > 187.7 MB/s for c1e4fe711a4 (just before CFS scheduler has been merged)
> > > 
> > > So the current biggest suspect is CFS, but I don't have enough numbers yet 
> > > to be able to point a finger at it with 100% certainty. Hopefully soon.
> 
> > I reproduced this on my Q6600 box.  However, I also reproduced it with
> > 2.6.22.19.  What I think you're seeing is just dbench creating a
> > massive train wreck. 
> 
> wasn't dbench one of those non-benchmarks that thrives on randomness and
> unfairness?
> 
> Andrew said recently:
>   "dbench is pretty chaotic and it could be that a good change causes
> dbench to get worse.  That's happened plenty of times in the past."
> 
> So I'm not inclined to worry too much about dbench in any way shape or
> form.

Well.  If there is a consistent change in dbench throughput, it is
important that we at least understand the reasons for it.  But we
don't necessarily want to optimise for dbench throughput.


^ permalink raw reply	[flat|nested] 94+ messages in thread

* Re: [tbench regression fixes]: digging out smelly deadmen.
  2008-10-26  9:00                             ` Peter Zijlstra
  2008-10-26  9:11                               ` Andrew Morton
@ 2008-10-26  9:15                               ` Mike Galbraith
  1 sibling, 0 replies; 94+ messages in thread
From: Mike Galbraith @ 2008-10-26  9:15 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Jiri Kosina, David Miller, rjw, Ingo Molnar, s0mbre,
	linux-kernel, netdev, Andrew Morton

On Sun, 2008-10-26 at 10:00 +0100, Peter Zijlstra wrote:
> On Sun, 2008-10-26 at 09:46 +0100, Mike Galbraith wrote: 

> > I reproduced this on my Q6600 box.  However, I also reproduced it with
> > 2.6.22.19.  What I think you're seeing is just dbench creating a
> > massive train wreck. 
> 
> wasn't dbench one of those non-benchmarks that thrives on randomness and
> unfairness?
> 
> Andrew said recently:
>   "dbench is pretty chaotic and it could be that a good change causes
> dbench to get worse.  That's happened plenty of times in the past."
> 
> So I'm not inclined to worry too much about dbench in any way shape or
> form.

Yeah, I was just curious.  The switch rate of dbench isn't high enough
for math to be an issue, so I wondered how the heck CFS could be such a
huge problem for this load.  Looks to me like all the math in the
_world_ couldn't hurt.. or help.

	-Mike


^ permalink raw reply	[flat|nested] 94+ messages in thread

* Re: [tbench regression fixes]: digging out smelly deadmen.
  2008-10-26  9:11                               ` Andrew Morton
@ 2008-10-26  9:27                                 ` Evgeniy Polyakov
  2008-10-26  9:34                                   ` Andrew Morton
  2008-10-26 10:23                                 ` Mike Galbraith
  2008-10-26 19:03                                 ` Jiri Kosina
  2 siblings, 1 reply; 94+ messages in thread
From: Evgeniy Polyakov @ 2008-10-26  9:27 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Peter Zijlstra, Mike Galbraith, Jiri Kosina, David Miller, rjw,
	Ingo Molnar, s0mbre, linux-kernel, netdev

Hi.

On Sun, Oct 26, 2008 at 02:11:53AM -0700, Andrew Morton (akpm@linux-foundation.org) wrote:
> > Andrew said recently:
> >   "dbench is pretty chaotic and it could be that a good change causes
> > dbench to get worse.  That's happened plenty of times in the past."
> > 
> > So I'm not inclined to worry too much about dbench in any way shape or
> > form.
> 
> Well.  If there is a consistent change in dbench throughput, it is
> important that we at least understand the reasons for it.  But we
> don't necessarily want to optimise for dbench throughput.

Sorry, but such excuses do not deserve to be said. No matter how
ugly, wrong, unusual or whatever else you might say some test is,
it shows a problem, which has to be fixed. There is no 'dbench tune';
there is a fair number of problems, and dbench has already helped to
narrow down and precisely locate at least several of them. The same
regressions were also observed in other benchmarks, originally reported
before I started this thread.

-- 
	Evgeniy Polyakov

^ permalink raw reply	[flat|nested] 94+ messages in thread

* Re: [tbench regression fixes]: digging out smelly deadmen.
  2008-10-26  9:27                                 ` Evgeniy Polyakov
@ 2008-10-26  9:34                                   ` Andrew Morton
  2008-10-26 10:05                                     ` Evgeniy Polyakov
  0 siblings, 1 reply; 94+ messages in thread
From: Andrew Morton @ 2008-10-26  9:34 UTC (permalink / raw)
  To: Evgeniy Polyakov
  Cc: Peter Zijlstra, Mike Galbraith, Jiri Kosina, David Miller, rjw,
	Ingo Molnar, s0mbre, linux-kernel, netdev

On Sun, 26 Oct 2008 12:27:22 +0300 Evgeniy Polyakov <zbr@ioremap.net> wrote:

> Hi.
> 
> On Sun, Oct 26, 2008 at 02:11:53AM -0700, Andrew Morton (akpm@linux-foundation.org) wrote:
> > > Andrew said recently:
> > >   "dbench is pretty chaotic and it could be that a good change causes
> > > dbench to get worse.  That's happened plenty of times in the past."
> > > 
> > > So I'm not inclined to worry too much about dbench in any way shape or
> > > form.
> > 
> > Well.  If there is a consistent change in dbench throughput, it is
> > important that we at least understand the reasons for it.  But we
> > don't necessarily want to optimise for dbench throughput.
> 
> Sorry, but such excuses do not deserve to be said. No matter how
> ugly, wrong, unusual or whatever else you might say some test is,
> it shows a problem, which has to be fixed.

Not necessarily.  There are times when we have made changes which we
knew full well reduced dbench's throughput, because we believed them to
be of overall benefit.  I referred to one of them above.

> There is no 'dbench tune',
> there is fair number of problems, and at least several of them dbench
> already helped to narrow down and precisely locate. The same regressions
> were also observed in other benchmarks, originally reported before I
> started this thread.

You seem to be saying what I said.

^ permalink raw reply	[flat|nested] 94+ messages in thread

* Re: [tbench regression fixes]: digging out smelly deadmen.
  2008-10-26  9:34                                   ` Andrew Morton
@ 2008-10-26 10:05                                     ` Evgeniy Polyakov
  2008-10-27  2:34                                       ` David Miller
  0 siblings, 1 reply; 94+ messages in thread
From: Evgeniy Polyakov @ 2008-10-26 10:05 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Peter Zijlstra, Mike Galbraith, Jiri Kosina, David Miller, rjw,
	Ingo Molnar, s0mbre, linux-kernel, netdev

Hi Andrew.

On Sun, Oct 26, 2008 at 02:34:39AM -0700, Andrew Morton (akpm@linux-foundation.org) wrote:
> Not necessarily.  There are times when we have made changes which we
> knew full well reduced dbench's throughput, because we believed them to
> be of overall benefit.  I referred to one of them above.

I suppose there were words about dbench not being a real-life test, so
if it suddenly sucks, no one will care. Sigh, theorists...
I'm not surprised there were no changes when I reported hrtimers to be
the main guilty factor in my setup for dbench tests; only when David
showed that they also killed his Sparcs via wake_up() was something
done. Now this regression has even disappeared from the list.
Good direction, we should always follow this.

As a side note, is the hrtimer subsystem also used for the BH backend? I
have not yet analyzed the data about vanilla kernels only being able to
accept clients at 20-30k accepts per second, while some other magical
tree (not vanilla) around 2.6.18 was able to do that at 50k accepts per
second. There are lots of CPUs, RAM and bandwidth which are effectively
unused even behind a Linux load balancer...

-- 
	Evgeniy Polyakov

^ permalink raw reply	[flat|nested] 94+ messages in thread

* Re: [tbench regression fixes]: digging out smelly deadmen.
  2008-10-26  9:11                               ` Andrew Morton
  2008-10-26  9:27                                 ` Evgeniy Polyakov
@ 2008-10-26 10:23                                 ` Mike Galbraith
  2008-10-26 19:03                                 ` Jiri Kosina
  2 siblings, 0 replies; 94+ messages in thread
From: Mike Galbraith @ 2008-10-26 10:23 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Peter Zijlstra, Jiri Kosina, David Miller, rjw, Ingo Molnar,
	s0mbre, linux-kernel, netdev

On Sun, 2008-10-26 at 02:11 -0700, Andrew Morton wrote:

> Was this when we decreased the default value of
> /proc/sys/vm/dirty_ratio, perhaps?  dbench is sensitive to that.

Wow, indeed.  I fired up an ext2 disk to take kjournald out of the
picture (dunno, just a transient thought).  Stock settings produced
three perma-wrecks in a row.  With it bumped to 50, three very
considerably nicer results in a row appeared.

2.6.26.7-smp dirty_ratio = 10 (stock)
Throughput 36.3649 MB/sec 160 procs
Throughput 47.0787 MB/sec 160 procs
Throughput 88.2055 MB/sec 160 procs

2.6.26.7-smp dirty_ratio = 50
Throughput 1009.98 MB/sec 160 procs
Throughput 1101.57 MB/sec 160 procs
Throughput 943.205 MB/sec 160 procs
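
For reference, the knob being tweaked above boils down to something like
this (just a sketch; the 160-client, 60-second dbench invocation mirrors
the runs quoted elsewhere in this thread, and 10 is the stock default I
started from):

cat /proc/sys/vm/dirty_ratio            # stock 2.6.26 default: 10
echo 50 > /proc/sys/vm/dirty_ratio      # or: sysctl -w vm.dirty_ratio=50
dbench 160 -t 60                        # re-run the workload
echo 10 > /proc/sys/vm/dirty_ratio      # restore the stock default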

	-Mike


^ permalink raw reply	[flat|nested] 94+ messages in thread

* Re: [tbench regression fixes]: digging out smelly deadmen.
  2008-10-24 22:25             ` Rafael J. Wysocki
  2008-10-24 23:31               ` David Miller
  2008-10-25  3:37               ` Mike Galbraith
@ 2008-10-26 11:29               ` Evgeniy Polyakov
  2008-10-26 12:23                 ` Evgeniy Polyakov
  2 siblings, 1 reply; 94+ messages in thread
From: Evgeniy Polyakov @ 2008-10-26 11:29 UTC (permalink / raw)
  To: Rafael J. Wysocki
  Cc: Ingo Molnar, Evgeniy Polyakov, Peter Zijlstra, linux-kernel,
	netdev, David Miller, Mike Galbraith, Andrew Morton

On Sat, Oct 25, 2008 at 12:25:34AM +0200, Rafael J. Wysocki (rjw@sisk.pl) wrote:
> > > > > vanilla 27: 347.222
> > > > > no TSO/GSO: 357.331
> > > > > no hrticks: 382.983
> > > > > no balance: 389.802
> 
> Can anyone please tell me if there was any conclusion of this thread?

For reference, the just-pulled git tree (commit 4403b4): 361.184
and with dirty_ratio set to 50: 361.086
without scheduler domain tuning, things are essentially the same: 361.367

So things are getting worse with time, and the previous tunings do not
help anymore.

-- 
	Evgeniy Polyakov

^ permalink raw reply	[flat|nested] 94+ messages in thread

* Re: [tbench regression fixes]: digging out smelly deadmen.
  2008-10-26  3:55                   ` David Miller
@ 2008-10-26 11:33                     ` Rafael J. Wysocki
  0 siblings, 0 replies; 94+ messages in thread
From: Rafael J. Wysocki @ 2008-10-26 11:33 UTC (permalink / raw)
  To: David Miller; +Cc: mingo, s0mbre, a.p.zijlstra, linux-kernel, netdev, efault

On Sunday, 26 of October 2008, David Miller wrote:
> From: "Rafael J. Wysocki" <rjw@sisk.pl>
> Date: Sat, 25 Oct 2008 13:13:20 +0200
> 
> > Could you please give me a pointer to the commit disabling the hrtimer feature?
> 
> Here it is:

Thanks a lot!

> commit 0c4b83da58ec2e96ce9c44c211d6eac5f9dae478
> Author: Ingo Molnar <mingo@elte.hu>
> Date:   Mon Oct 20 14:27:43 2008 +0200
> 
>     sched: disable the hrtick for now
>     
>     David Miller reported that hrtick update overhead has tripled the
>     wakeup overhead on Sparc64.
>     
>     That is too much - disable the HRTICK feature for now by default,
>     until a faster implementation is found.
>     
>     Reported-by: David Miller <davem@davemloft.net>
>     Acked-by: Peter Zijlstra <peterz@infradead.org>
>     Signed-off-by: Ingo Molnar <mingo@elte.hu>
> 
> diff --git a/kernel/sched_features.h b/kernel/sched_features.h
> index 7c9e8f4..fda0162 100644
> --- a/kernel/sched_features.h
> +++ b/kernel/sched_features.h
> @@ -5,7 +5,7 @@ SCHED_FEAT(START_DEBIT, 1)
>  SCHED_FEAT(AFFINE_WAKEUPS, 1)
>  SCHED_FEAT(CACHE_HOT_BUDDY, 1)
>  SCHED_FEAT(SYNC_WAKEUPS, 1)
> -SCHED_FEAT(HRTICK, 1)
> +SCHED_FEAT(HRTICK, 0)
>  SCHED_FEAT(DOUBLE_TICK, 0)
>  SCHED_FEAT(ASYM_GRAN, 1)
>  SCHED_FEAT(LB_BIAS, 1)

Rafael

^ permalink raw reply	[flat|nested] 94+ messages in thread

* Re: [tbench regression fixes]: digging out smelly deadmen.
  2008-10-26 11:29               ` Evgeniy Polyakov
@ 2008-10-26 12:23                 ` Evgeniy Polyakov
  2008-10-30 18:15                   ` Stephen Hemminger
  0 siblings, 1 reply; 94+ messages in thread
From: Evgeniy Polyakov @ 2008-10-26 12:23 UTC (permalink / raw)
  To: Rafael J. Wysocki
  Cc: Ingo Molnar, Evgeniy Polyakov, Peter Zijlstra, linux-kernel,
	netdev, David Miller, Mike Galbraith, Andrew Morton

[-- Attachment #1: Type: text/plain, Size: 827 bytes --]

Hi.

> > > > > > vanilla 27: 347.222
> > > > > > no TSO/GSO: 357.331
> > > > > > no hrticks: 382.983
> > > > > > no balance: 389.802
> > 
> > Can anyone please tell me if there was any conclusion of this thread?
> 
> For reference, the just-pulled git tree (commit 4403b4): 361.184
> and with dirty_ratio set to 50: 361.086
> without scheduler domain tuning, things are essentially the same: 361.367
> 
> So things are getting worse with time, and the previous tunings do not
> help anymore.

That's the picture of how we do on my hardware:
4 (2 physical, 2 hyper-threaded logical) 32-bit Xeons with 8 GB of RAM.
We could probably do a little bit better for the -rc1 kernel, though,
if I enabled only 4 GB via the config.

Better to see once than to read about a thousand times. One can scare
children with our graphs... Picture attached.

-- 
	Evgeniy Polyakov

[-- Attachment #2: tbench-regression-2.6.27-rc1.png --]
[-- Type: image/png, Size: 4996 bytes --]

^ permalink raw reply	[flat|nested] 94+ messages in thread

* Re: [tbench regression fixes]: digging out smelly deadmen.
  2008-10-26  9:11                               ` Andrew Morton
  2008-10-26  9:27                                 ` Evgeniy Polyakov
  2008-10-26 10:23                                 ` Mike Galbraith
@ 2008-10-26 19:03                                 ` Jiri Kosina
  2008-10-27  9:29                                   ` Mike Galbraith
  2008-10-27 10:42                                   ` Jiri Kosina
  2 siblings, 2 replies; 94+ messages in thread
From: Jiri Kosina @ 2008-10-26 19:03 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Peter Zijlstra, Mike Galbraith, David Miller, rjw, Ingo Molnar,
	s0mbre, linux-kernel, netdev

On Sun, 26 Oct 2008, Andrew Morton wrote:

> > > > 208.4 MB/sec  -- vanilla 2.6.16.60
> > > > 201.6 MB/sec  -- vanilla 2.6.20.1
> > > > 172.9 MB/sec  -- vanilla 2.6.22.19
> > > > 74.2 MB/sec   -- vanilla 2.6.23
> > > >  46.1 MB/sec  -- vanilla 2.6.24.2
> > > >  30.6 MB/sec  -- vanilla 2.6.26.1
> > > > I.e. huge drop for 2.6.23 (this was with default configs for each 
> > > > respective kernel).
> Was this when we decreased the default value of
> /proc/sys/vm/dirty_ratio, perhaps?  dbench is sensitive to that.

2.6.28 gives 41.8 MB/s with /proc/sys/vm/dirty_ratio == 50. So a small 
improvement, but still far, far away from the throughput of pre-2.6.23 
kernels.

-- 
Jiri Kosina
SUSE Labs

^ permalink raw reply	[flat|nested] 94+ messages in thread

* Re: [tbench regression fixes]: digging out smelly deadmen.
  2008-10-26 10:05                                     ` Evgeniy Polyakov
@ 2008-10-27  2:34                                       ` David Miller
  2008-10-27  9:30                                         ` Ingo Molnar
  0 siblings, 1 reply; 94+ messages in thread
From: David Miller @ 2008-10-27  2:34 UTC (permalink / raw)
  To: zbr
  Cc: akpm, a.p.zijlstra, efault, jkosina, rjw, mingo, s0mbre,
	linux-kernel, netdev

From: Evgeniy Polyakov <zbr@ioremap.net>
Date: Sun, 26 Oct 2008 13:05:55 +0300

> I'm not surprised there were no changes when I reported hrtimers to be
> the main guilty factor in my setup for dbench tests; only when David
> showed that they also killed his Sparcs via wake_up() was something
> done. Now this regression has even disappeared from the list.
> Good direction, we should always follow this.

Yes, this situation was in my opinion a complete fucking joke.  Someone
like me shouldn't have to do all of the hard work for the scheduler
folks in order for a bug like this to get seriously looked at.

Evgeniy's difficult work was effectively ignored except by other
testers who could also see and reproduce the problem.

No scheduler developer looked seriously into these reports other than
to say "please try to reproduce with tip" (?!?!?!)  I guess showing
the developer the exact changeset(s) which add the regression isn't
enough these days :-/

Did any scheduler developer try to run tbench ONCE and do even a tiny
bit of analysis, like the kind I did?  Answer honestly...  Linus even
asked you guys in the private thread to "please look into it".  So, if
none of you did, you should all be deeply ashamed of yourselves.

People like me shouldn't have to do all of that work for you just to
get something to happen.

Not until I went privately to Ingo and Linus with cycle counts and a
full diagnosis (of every single release since 2.6.22, a whole 2 days
of work for me) of the precise code eating up too many cycles and
causing problems DID ANYTHING HAPPEN.

This is extremely and excruciatingly DISAPPOINTING and WRONG.

We completely and absolutely suck if this is how we will handle any
performance regression report.

And although this case is specific to the scheduler, a lot of
other areas handle well prepared bug reports similarly.  So I'm not
really picking on the scheduler folks, they just happen to be the
current example :-)


^ permalink raw reply	[flat|nested] 94+ messages in thread

* Re: [tbench regression fixes]: digging out smelly deadmen.
  2008-10-26 19:03                                 ` Jiri Kosina
@ 2008-10-27  9:29                                   ` Mike Galbraith
  2008-10-27 10:42                                   ` Jiri Kosina
  1 sibling, 0 replies; 94+ messages in thread
From: Mike Galbraith @ 2008-10-27  9:29 UTC (permalink / raw)
  To: Jiri Kosina
  Cc: Andrew Morton, Peter Zijlstra, David Miller, rjw, Ingo Molnar,
	s0mbre, linux-kernel, netdev

On Sun, 2008-10-26 at 20:03 +0100, Jiri Kosina wrote:
> On Sun, 26 Oct 2008, Andrew Morton wrote:
> 
> > > > > 208.4 MB/sec  -- vanilla 2.6.16.60
> > > > > 201.6 MB/sec  -- vanilla 2.6.20.1
> > > > > 172.9 MB/sec  -- vanilla 2.6.22.19
> > > > > 74.2 MB/sec   -- vanilla 2.6.23
> > > > >  46.1 MB/sec  -- vanilla 2.6.24.2
> > > > >  30.6 MB/sec  -- vanilla 2.6.26.1
> > > > > I.e. huge drop for 2.6.23 (this was with default configs for each 
> > > > > respective kernel).
> > Was this when we decreased the default value of
> > /proc/sys/vm/dirty_ratio, perhaps?  dbench is sensitive to that.
> 
> 2.6.28 gives 41.8 MB/s with /proc/sys/vm/dirty_ratio == 50. So a small 
> improvement, but still far, far away from the throughput of pre-2.6.23 
> kernels.

How many clients?

dbench 160 -t 60

2.6.28-smp (git.today)
Throughput 331.718 MB/sec 160 procs (no logjam)
Throughput 309.85 MB/sec 160 procs (contains logjam)
Throughput 392.746 MB/sec 160 procs (contains logjam)

	-Mike


^ permalink raw reply	[flat|nested] 94+ messages in thread

* Re: [tbench regression fixes]: digging out smelly deadmen.
  2008-10-27  2:34                                       ` David Miller
@ 2008-10-27  9:30                                         ` Ingo Molnar
  2008-10-27  9:57                                           ` David Miller
  0 siblings, 1 reply; 94+ messages in thread
From: Ingo Molnar @ 2008-10-27  9:30 UTC (permalink / raw)
  To: David Miller
  Cc: zbr, akpm, a.p.zijlstra, efault, jkosina, rjw, s0mbre,
	linux-kernel, netdev


* David Miller <davem@davemloft.net> wrote:

> From: Evgeniy Polyakov <zbr@ioremap.net>
> Date: Sun, 26 Oct 2008 13:05:55 +0300
> 
> > I'm not surprised there were no changes when I reported hrtimers to be
> > the main guilty factor in my setup for dbench tests; only when David
> > showed that they also killed his Sparcs via wake_up() was something
> > done. Now this regression has even disappeared from the list.
> > Good direction, we should always follow this.
> 
> Yes, this situation was in my opinion a complete fucking joke.  
> Someone like me shouldn't have to do all of the hard work for the 
> scheduler folks in order for a bug like this to get seriously looked 
> at.

yeah, that overhead was bad, and once it became clear that you had 
high-resolution timers enabled for your benchmarking runs (which is 
default-off and which is still rare for benchmarking runs - despite 
being a popular end-user feature) we immediately disabled the hrtick via 
this upstream commit:

  0c4b83d: sched: disable the hrtick for now

that commit is included in v2.6.28-rc1 so this particular issue should 
be resolved.

high-resolution timers are still default-disabled in the upstream 
kernel, so this never affected usual configs that folks keep 
benchmarking - it only affected those who decided they want higher 
resolution timers and more precise scheduling.

Anyway, the sched-hrtick is off now, and we won't turn it back on without 
making sure that it's really low cost in the hotpath.
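
For anyone wanting to compare both sides without a rebuild, the feature
bit can usually be flipped at run time as well - a minimal sketch,
assuming CONFIG_SCHED_DEBUG is enabled and debugfs is mounted at
/sys/kernel/debug:

mount -t debugfs none /sys/kernel/debug              # if not already mounted
cat /sys/kernel/debug/sched_features                 # lists HRTICK or NO_HRTICK
echo NO_HRTICK > /sys/kernel/debug/sched_features    # force it off
echo HRTICK > /sys/kernel/debug/sched_features       # turn it back on for an A/B run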

Regarding tbench, a workload that context-switches in excess of 100,000 
per second is inevitably going to show scheduler overhead - so you'll 
get the best numbers if you eliminate all/most scheduler code from the 
hotpath. We are working on various patches to mitigate the cost some 
more - and your patches and feedback are welcome as well.

But it's a difficult call with no silver bullets. On one hand we have 
folks putting more and more stuff into the context-switching hotpath on 
the (mostly valid) point that the scheduler is a slowpath compared to 
most other things. On the other hand we've got folks doing 
high-context-switch ratio benchmarks and complaining about the overhead 
whenever something goes in that improves the quality of scheduling of a 
workload that does not context-switch as massively as tbench. It's a 
difficult balance and we cannot satisfy both camps.

Nevertheless, this is not a valid argument in favor of the hrtick 
overhead: that was clearly excessive overhead and we zapped it.

	Ingo

^ permalink raw reply	[flat|nested] 94+ messages in thread

* Re: [tbench regression fixes]: digging out smelly deadmen.
  2008-10-27  9:30                                         ` Ingo Molnar
@ 2008-10-27  9:57                                           ` David Miller
  0 siblings, 0 replies; 94+ messages in thread
From: David Miller @ 2008-10-27  9:57 UTC (permalink / raw)
  To: mingo
  Cc: zbr, akpm, a.p.zijlstra, efault, jkosina, rjw, s0mbre,
	linux-kernel, netdev

From: Ingo Molnar <mingo@elte.hu>
Date: Mon, 27 Oct 2008 10:30:35 +0100

> But it's a difficult call with no silver bullets. On one hand we have 
> folks putting more and more stuff into the context-switching hotpath on 
> the (mostly valid) point that the scheduler is a slowpath compared to 
> most other things.

This I heavily disagree with.  The scheduler should be so cheap
that you cannot possibly notice that it is even there for a benchmark
like tbench.

If we now think it's ok that picking which task to run is more
expensive than writing 64 bytes over a TCP socket and then blocking on
a read, I'd like to stop using Linux. :-) That's "real work" and if
the scheduler is more expensive than "real work" we lose.

I do want to remind you of a thread you participated in, in April,
where you complained about loopback TCP performance:

	http://marc.info/?l=linux-netdev&m=120696343707674&w=2

It might be fruitful for you to rerun your tests with CFS reverted
(start with 2.6.22 and progressively run your benchmark on every
release), you know, just for fun :-)

> On the other hand we've got folks doing high-context-switch ratio
> benchmarks and complaining about the overhead whenever something
> goes in that improves the quality of scheduling of a workload that
> does not context-switch as massively as tbench. It's a difficult
> balance and we cannot satisfy both camps.

We've always been proud of our scheduling overhead being extremely
low, and you have to face the simple fact that starting in 2.6.23 it's
been getting progressively more and more expensive.

Consistently so.

People even noticed it.

^ permalink raw reply	[flat|nested] 94+ messages in thread

* Re: [tbench regression fixes]: digging out smelly deadmen.
  2008-10-26 19:03                                 ` Jiri Kosina
  2008-10-27  9:29                                   ` Mike Galbraith
@ 2008-10-27 10:42                                   ` Jiri Kosina
  2008-10-27 11:27                                     ` Ingo Molnar
  1 sibling, 1 reply; 94+ messages in thread
From: Jiri Kosina @ 2008-10-27 10:42 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Peter Zijlstra, Mike Galbraith, David Miller, rjw, Ingo Molnar,
	s0mbre, linux-kernel, netdev

On Sun, 26 Oct 2008, Jiri Kosina wrote:

> > > > > 208.4 MB/sec  -- vanilla 2.6.16.60
> > > > > 201.6 MB/sec  -- vanilla 2.6.20.1
> > > > > 172.9 MB/sec  -- vanilla 2.6.22.19
> > > > > 74.2 MB/sec   -- vanilla 2.6.23
> > > > >  46.1 MB/sec  -- vanilla 2.6.24.2
> > > > >  30.6 MB/sec  -- vanilla 2.6.26.1
> > > > > I.e. huge drop for 2.6.23 (this was with default configs for each 
> > > > > respective kernel).
> > Was this when we decreased the default value of
> > /proc/sys/vm/dirty_ratio, perhaps?  dbench is sensitive to that.
> 2.6.28 gives 41.8 MB/s with /proc/sys/vm/dirty_ratio == 50. So a small 
> improvement, but still far, far away from the throughput of pre-2.6.23 
> kernels.

Ok, so another important datapoint:

with c1e4fe711a4 (just before CFS has been merged for 2.6.23), the dbench 
throughput measures

	187.7 MB/s

in our testing conditions (default config).

With c31f2e8a42c4 (just after CFS has been merged for 2.6.23), the 
throughput measured by dbench is

	82.3 MB/s

This is the huge drop we have been looking for. After this, the 
performance kept going down gradually, to the ~45 MB/s we are measuring 
for 2.6.27. But the biggest drop (more than 50%) points directly to the 
CFS merge.

-- 
Jiri Kosina
SUSE Labs

^ permalink raw reply	[flat|nested] 94+ messages in thread

* Re: [tbench regression fixes]: digging out smelly deadmen.
  2008-10-27 10:42                                   ` Jiri Kosina
@ 2008-10-27 11:27                                     ` Ingo Molnar
  2008-10-27 11:33                                       ` Alan Cox
  0 siblings, 1 reply; 94+ messages in thread
From: Ingo Molnar @ 2008-10-27 11:27 UTC (permalink / raw)
  To: Jiri Kosina
  Cc: Andrew Morton, Peter Zijlstra, Mike Galbraith, David Miller, rjw,
	s0mbre, linux-kernel, netdev


* Jiri Kosina <jkosina@suse.cz> wrote:

> Ok, so another important datapoint:
> 
> with c1e4fe711a4 (just before CFS has been merged for 2.6.23), the dbench 
> throughput measures
> 
> 	187.7 MB/s
> 
> in our testing conditions (default config).
> 
> With c31f2e8a42c4 (just after CFS has been merged for 2.6.23), the 
> throughput measured by dbench is
> 
> 	82.3 MB/s
> 
> This is the huge drop we have been looking for. After this, the 
> performance kept going down gradually, to the ~45 MB/s we are 
> measuring for 2.6.27. But the biggest drop (more than 50%) points 
> directly to the CFS merge.

that is a well-known property of dbench: it rewards unfairness in IO, 
memory management and scheduling.

To get the best possible dbench numbers in CPU-bound dbench runs, you 
have to throw away the scheduler completely, and do this instead:

 - first execute all requests of client 1
 - then execute all requests of client 2
 ....
 - execute all requests of client N

the moment the clients are allowed to overlap, the moment their requests 
are executed more fairly, the dbench numbers drop.

	Ingo

^ permalink raw reply	[flat|nested] 94+ messages in thread

* Re: [tbench regression fixes]: digging out smelly deadmen.
  2008-10-27 11:27                                     ` Ingo Molnar
@ 2008-10-27 11:33                                       ` Alan Cox
  2008-10-27 12:06                                         ` Mike Galbraith
  2008-10-27 18:33                                         ` Ingo Molnar
  0 siblings, 2 replies; 94+ messages in thread
From: Alan Cox @ 2008-10-27 11:33 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Jiri Kosina, Andrew Morton, Peter Zijlstra, Mike Galbraith,
	David Miller, rjw, s0mbre, linux-kernel, netdev

> To get the best possible dbench numbers in CPU-bound dbench runs, you 
> have to throw away the scheduler completely, and do this instead:
> 
>  - first execute all requests of client 1
>  - then execute all requests of client 2
>  ....
>  - execute all requests of client N

Rubbish. If you do that you'll not get enough I/O in parallel to schedule
the disk well (not that most of our I/O schedulers are doing the job
well, and the vm writeback threads then mess it up and the lack of Arjan's
ioprio fixes then totally screw you) </rant>

> the moment the clients are allowed to overlap, the moment their requests 
> are executed more fairly, the dbench numbers drop.

Fairness isn't everything. Dbench is a fairly good tool for studying some
real world workloads. If your fairness hurts throughput that much maybe
your scheduler algorithm is just plain *wrong* as it isn't adapting to
workload at all well.

Alan

^ permalink raw reply	[flat|nested] 94+ messages in thread

* Re: [tbench regression fixes]: digging out smelly deadmen.
  2008-10-27 11:33                                       ` Alan Cox
@ 2008-10-27 12:06                                         ` Mike Galbraith
  2008-10-27 13:42                                           ` Jiri Kosina
  2008-10-27 18:33                                         ` Ingo Molnar
  1 sibling, 1 reply; 94+ messages in thread
From: Mike Galbraith @ 2008-10-27 12:06 UTC (permalink / raw)
  To: Alan Cox
  Cc: Ingo Molnar, Jiri Kosina, Andrew Morton, Peter Zijlstra,
	David Miller, rjw, s0mbre, linux-kernel, netdev

On Mon, 2008-10-27 at 11:33 +0000, Alan Cox wrote:
> > To get the best possible dbench numbers in CPU-bound dbench runs, you 
> > have to throw away the scheduler completely, and do this instead:
> > 
> >  - first execute all requests of client 1
> >  - then execute all requests of client 2
> >  ....
> >  - execute all requests of client N
> 
> Rubbish. If you do that you'll not get enough I/O in parallel to schedule
> the disk well (not that most of our I/O schedulers are doing the job
> well, and the vm writeback threads then mess it up and the lack of Arjan's
> ioprio fixes then totally screw you) </rant>
> 
> > the moment the clients are allowed to overlap, the moment their requests 
> > are executed more fairly, the dbench numbers drop.
> 
> Fairness isn't everything. Dbench is a fairly good tool for studying some
> real world workloads. If your fairness hurts throughput that much maybe
> your scheduler algorithm is just plain *wrong* as it isn't adapting to
> workload at all well.

Doesn't seem to be scheduler/fairness.  2.6.22.19 is O(1), and falls
apart too, I posted the numbers and full dbench output yesterday.

	-Mike


^ permalink raw reply	[flat|nested] 94+ messages in thread

* Re: [tbench regression fixes]: digging out smelly deadmen.
  2008-10-27 12:06                                         ` Mike Galbraith
@ 2008-10-27 13:42                                           ` Jiri Kosina
  2008-10-27 14:17                                             ` Mike Galbraith
  0 siblings, 1 reply; 94+ messages in thread
From: Jiri Kosina @ 2008-10-27 13:42 UTC (permalink / raw)
  To: Mike Galbraith
  Cc: Alan Cox, Ingo Molnar, Andrew Morton, Peter Zijlstra,
	David Miller, rjw, s0mbre, linux-kernel, netdev

On Mon, 27 Oct 2008, Mike Galbraith wrote:

> > real world workloads. If your fairness hurts throughput that much maybe
> > your scheduler algorithm is just plain *wrong* as it isn't adapting to
> > workload at all well.
> Doesn't seem to be scheduler/fairness.  2.6.22.19 is O(1), and falls
> apart too, I posted the numbers and full dbench output yesterday.

We'll need to look into this a little bit more, I think. I have sent out 
some numbers too, and these indicate very clearly that there is a performance 
drop of more than 50% (measured by dbench) right after the merge of CFS 
in the 2.6.23-rc1 merge window.

-- 
Jiri Kosina
SUSE Labs

^ permalink raw reply	[flat|nested] 94+ messages in thread

* Re: [tbench regression fixes]: digging out smelly deadmen.
  2008-10-27 13:42                                           ` Jiri Kosina
@ 2008-10-27 14:17                                             ` Mike Galbraith
  0 siblings, 0 replies; 94+ messages in thread
From: Mike Galbraith @ 2008-10-27 14:17 UTC (permalink / raw)
  To: Jiri Kosina
  Cc: Alan Cox, Ingo Molnar, Andrew Morton, Peter Zijlstra,
	David Miller, rjw, s0mbre, linux-kernel, netdev

On Mon, 2008-10-27 at 14:42 +0100, Jiri Kosina wrote:
> On Mon, 27 Oct 2008, Mike Galbraith wrote:
> 
> > > real world workloads. If your fairness hurts throughput that much maybe
> > > your scheduler algorithm is just plain *wrong* as it isn't adapting to
> > > workload at all well.
> > Doesn't seem to be scheduler/fairness.  2.6.22.19 is O(1), and falls
> > apart too, I posted the numbers and full dbench output yesterday.
> 
> We'll need to look into this a little bit more, I think. I have sent out 
> some numbers too, and these indicate very clearly that there is a performance 
> drop of more than 50% (measured by dbench) right after the merge of CFS 
> in the 2.6.23-rc1 merge window.

Sure.  Watching the per-second output, every kernel I have sucks at high
client count dbench; it's just a matter of how badly, and for how long.

BTW, the nice pretty 160 client numbers I posted yesterday for ext2
turned out to be because somebody adds the _netdev mount option when I
mount -a in order to mount my freshly hotplugged external drive (why?
that ain't in my fstab).  Without that switch, ext2 output is roughly as
raggedy as ext3, and nowhere near the up to 1.4 GB/sec I can get with
dirty_ratio=50 + ext2 + the (buy none, get one free) _netdev option.
That free-for-the-not-asking option does nada for ext3.

	-Mike


^ permalink raw reply	[flat|nested] 94+ messages in thread

* Re: [tbench regression fixes]: digging out smelly deadmen.
  2008-10-25  7:33                       ` Mike Galbraith
@ 2008-10-27 17:26                         ` Rick Jones
  2008-10-27 19:11                           ` Mike Galbraith
  0 siblings, 1 reply; 94+ messages in thread
From: Rick Jones @ 2008-10-27 17:26 UTC (permalink / raw)
  To: Mike Galbraith
  Cc: David Miller, rjw, mingo, s0mbre, a.p.zijlstra, linux-kernel, netdev

Mike Galbraith wrote:
> That's exactly what I've been trying to look into, but combined with
> netperf.  The thing is an incredibly twisted maze of _this_ affects
> _that_... sometimes involving magic and/or mythical creatures.

I cannot guarantee it will help, but the global -T option to pin netperf 
or netserver to a specific CPU might help cut down the variables.
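
Something along these lines, for example - a sketch only, assuming a
netserver is already listening on localhost; -T takes local,remote CPU
ids, and the 1-byte TCP_RR test roughly matches the request/response
pattern discussed in this thread:

netserver                                        # start the daemon once
netperf -H 127.0.0.1 -t TCP_RR -l 60 -T 0,0 -- -r 1,1
netperf -H 127.0.0.1 -t TCP_STREAM -l 60 -T 0,1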

FWIW netperf top of trunk omni tests can now also determine and report 
the state of SELinux.  They also have code to accept or generate their 
own RFC4122-esque UUID.  Define some canonical tests and then ever closer 
to just needing some database-fu and automagic testing I suppose... 
things I do not presently possess but am curious enough to follow some 
pointers.

happy benchmarking,

rick jones

^ permalink raw reply	[flat|nested] 94+ messages in thread

* Re: [tbench regression fixes]: digging out smelly deadmen.
  2008-10-27 11:33                                       ` Alan Cox
  2008-10-27 12:06                                         ` Mike Galbraith
@ 2008-10-27 18:33                                         ` Ingo Molnar
  2008-10-27 19:39                                           ` Evgeniy Polyakov
  2008-10-29  9:59                                           ` Nick Piggin
  1 sibling, 2 replies; 94+ messages in thread
From: Ingo Molnar @ 2008-10-27 18:33 UTC (permalink / raw)
  To: Alan Cox
  Cc: Jiri Kosina, Andrew Morton, Peter Zijlstra, Mike Galbraith,
	David Miller, rjw, s0mbre, linux-kernel, netdev


* Alan Cox <alan@lxorguk.ukuu.org.uk> wrote:

> > To get the best possible dbench numbers in CPU-bound dbench runs, you 
> > have to throw away the scheduler completely, and do this instead:
> > 
> >  - first execute all requests of client 1
> >  - then execute all requests of client 2
> >  ....
> >  - execute all requests of client N
> 
> Rubbish. [...]

i've actually implemented that about a decade ago: i've tracked down 
what makes dbench tick, i've implemented the kernel heuristics for it 
to make dbench scale linearly with the number of clients - just to be 
shot down by Linus about my utter rubbish approach ;-)

> [...] If you do that you'll not get enough I/O in parallel to 
> schedule the disk well (not that most of our I/O schedulers are 
> doing the job well, and the vm writeback threads then mess it up and 
> the lack of Arjan's ioprio fixes then totally screw you) </rant>

the best dbench results come from systems that have enough RAM to 
cache the full working set, and a filesystem intelligent enough to not 
insert bogus IO serialization cycles (ext3 is not such a filesystem).

The moment there's real IO it becomes harder to analyze but the same 
basic behavior remains: the more unfair the IO scheduler, the "better" 
dbench results we get.

	Ingo

^ permalink raw reply	[flat|nested] 94+ messages in thread

* Re: [tbench regression fixes]: digging out smelly deadmen.
  2008-10-27 17:26                         ` Rick Jones
@ 2008-10-27 19:11                           ` Mike Galbraith
  2008-10-27 19:18                             ` Rick Jones
  0 siblings, 1 reply; 94+ messages in thread
From: Mike Galbraith @ 2008-10-27 19:11 UTC (permalink / raw)
  To: Rick Jones
  Cc: David Miller, rjw, mingo, s0mbre, a.p.zijlstra, linux-kernel, netdev

On Mon, 2008-10-27 at 10:26 -0700, Rick Jones wrote:
> Mike Galbraith wrote:
> > That's exactly what I've been trying to look into, but combined with
> > netperf.  The thing is an incredibly twisted maze of _this_ affects
> > _that_... sometimes involving magic and/or mythical creatures.
> 
> I cannot guarantee it will help, but the global -T option to pin netperf 
> or netserver to a specific CPU might help cut down the variables.

Yup, and how.  Early on, the other variables drove me bat-shit frigging
_nuts_.  I eventually selected a UP config to test _because_ those other
variables combined with SMP overhead and config options drove me crazy ;-) 

> FWIW netperf top of trunk omni tests can now also determine and report 
> the state of SELinux.  They also have code to accept or generate their 
> own RFC4122-esque UUID.  Define some canonical tests and then ever closer 
> to just needing some database-fu and automagic testing I suppose... 
> things I do not presently possess but am curious enough to follow some 
> pointers.

Hrm. I'm going to have to save that, and parse a few times. (usual)

> happy benchmarking,

Not really, but I can't seem to give up ;-)

	-Mike


^ permalink raw reply	[flat|nested] 94+ messages in thread

* Re: [tbench regression fixes]: digging out smelly deadmen.
  2008-10-27 19:11                           ` Mike Galbraith
@ 2008-10-27 19:18                             ` Rick Jones
  2008-10-27 19:44                               ` Mike Galbraith
  0 siblings, 1 reply; 94+ messages in thread
From: Rick Jones @ 2008-10-27 19:18 UTC (permalink / raw)
  To: Mike Galbraith
  Cc: David Miller, rjw, mingo, s0mbre, a.p.zijlstra, linux-kernel, netdev

>>I cannot guarantee it will help, but the global -T option to pin netperf 
>>or netserver to a specific CPU might help cut down the variables.
> 
> 
> Yup, and how.  Early on, the other variables drove me bat-shit frigging
> _nuts_.  I eventually selected a UP config to test _because_ those other
> variables combined with SMP overhead and config options drove me crazy ;-) 
> 
> 
>>FWIW netperf top of trunk omni tests can now also determine and report 
>>the state of SELinux. 

http://www.netperf.org/svn/netperf2/trunk/src/netsec_linux.c

Pointers to programmatic detection of AppArmor and a couple of salient 
details about the firewall (enabled, perhaps the number of rules) from any 
quarter would be welcome.

>> They also have code to accept or generate their 
>>own RFC4122-esque UUID.  Define some canonical tests and then ever closer 
>>to just needing some database-fu and automagic testing I suppose... 
>>things I do not presently possess but am curious enough to follow some 
>>pointers.
> 
> 
> Hrm. I'm going to have to save that, and parse a few times. (usual)

The plot thickens; it seems that autotest knows about some version of 
netperf2 already...  I'll be trying to see if there is some benefit to 
autotest in netperf2's top of trunk having the keyval output format, and 
whether autotest groks paired systems to more easily do over-the-network testing.

>>happy benchmarking,
> 
> 
> Not really, but I can't seem to give up ;-)

then I guess I'll close with

successful benchmarking,

if not necessarily happy :)

rick jones

^ permalink raw reply	[flat|nested] 94+ messages in thread

* Re: [tbench regression fixes]: digging out smelly deadmen.
  2008-10-27 18:33                                         ` Ingo Molnar
@ 2008-10-27 19:39                                           ` Evgeniy Polyakov
  2008-10-27 19:48                                             ` David Miller
  2008-10-29  9:59                                           ` Nick Piggin
  1 sibling, 1 reply; 94+ messages in thread
From: Evgeniy Polyakov @ 2008-10-27 19:39 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Alan Cox, Jiri Kosina, Andrew Morton, Peter Zijlstra,
	Mike Galbraith, David Miller, rjw, s0mbre, linux-kernel, netdev

On Mon, Oct 27, 2008 at 07:33:12PM +0100, Ingo Molnar (mingo@elte.hu) wrote:
> the best dbench results come from systems that have enough RAM to 
> cache the full working set, and a filesystem intelligent enough to not 
> insert bogus IO serialization cycles (ext3 is not such a filesystem).

My test system has 8 GB for 8 clients, and its performance dropped by 30%.
There is no IO load, since tbench exercises only the network part while
dbench itself uses only disk IO. What we see right now is that a usual
network server, which handles a mixed set of essentially small reads and
writes from the socket from multiple (8) clients, suddenly lost one third
of its performance.

> The moment there's real IO it becomes harder to analyze but the same 
> basic behavior remains: the more unfair the IO scheduler, the "better" 
> dbench results we get.

Right now there is no disk IO at all. Only quite usual network and
process load.

-- 
	Evgeniy Polyakov

^ permalink raw reply	[flat|nested] 94+ messages in thread

* Re: [tbench regression fixes]: digging out smelly deadmen.
  2008-10-27 19:18                             ` Rick Jones
@ 2008-10-27 19:44                               ` Mike Galbraith
  0 siblings, 0 replies; 94+ messages in thread
From: Mike Galbraith @ 2008-10-27 19:44 UTC (permalink / raw)
  To: Rick Jones
  Cc: David Miller, rjw, mingo, s0mbre, a.p.zijlstra, linux-kernel, netdev

On Mon, 2008-10-27 at 12:18 -0700, Rick Jones wrote:
> >
> > Not really, but I can't seem to give up ;-)
> 
> then I guess I'll close with
> 
> successful benchmarking,
> 
> if not necessarily happy :)

There ya go, happy benchmarking is when they tell you what you want to
hear.  Successful is when you learn something.

	-Mike  (not happy, but learning)


^ permalink raw reply	[flat|nested] 94+ messages in thread

* Re: [tbench regression fixes]: digging out smelly deadmen.
  2008-10-27 19:39                                           ` Evgeniy Polyakov
@ 2008-10-27 19:48                                             ` David Miller
  2008-10-28 10:24                                               ` Mike Galbraith
  0 siblings, 1 reply; 94+ messages in thread
From: David Miller @ 2008-10-27 19:48 UTC (permalink / raw)
  To: zbr
  Cc: mingo, alan, jkosina, akpm, a.p.zijlstra, efault, rjw, s0mbre,
	linux-kernel, netdev

From: Evgeniy Polyakov <zbr@ioremap.net>
Date: Mon, 27 Oct 2008 22:39:34 +0300

> On Mon, Oct 27, 2008 at 07:33:12PM +0100, Ingo Molnar (mingo@elte.hu) wrote:
> > The moment there's real IO it becomes harder to analyze but the same 
> > basic behavior remains: the more unfair the IO scheduler, the "better" 
> > dbench results we get.
> 
> Right now there is no disk IO at all. Only quite usual network and
> process load.

I think the hope is that by saying there isn't a problem enough times,
it will become truth. :-)

More seriously, Ingo, what in the world do we need to do in order to get
you to start doing tbench runs and optimizing things (read as: fixing
the regression you added)?

I'm personally working on a test fibonacci heap implementation for
the fair sched code, and I already did all of the cost analysis all
the way back to the 2.6.22 pre-CFS days.

But I'm NOT a scheduler developer, so it isn't my responsibility to do
this crap for you.  You added this regression, why do I have to get my
hands dirty in order for there to be some hope that these regressions
start to get fixed?

^ permalink raw reply	[flat|nested] 94+ messages in thread

* Re: [tbench regression fixes]: digging out smelly deadmen.
  2008-10-27 19:48                                             ` David Miller
@ 2008-10-28 10:24                                               ` Mike Galbraith
  2008-10-28 10:37                                                 ` Ingo Molnar
  0 siblings, 1 reply; 94+ messages in thread
From: Mike Galbraith @ 2008-10-28 10:24 UTC (permalink / raw)
  To: David Miller
  Cc: zbr, mingo, alan, jkosina, akpm, a.p.zijlstra, rjw, s0mbre,
	linux-kernel, netdev

On Mon, 2008-10-27 at 12:48 -0700, David Miller wrote: 
> From: Evgeniy Polyakov <zbr@ioremap.net>
> Date: Mon, 27 Oct 2008 22:39:34 +0300
> 
> > On Mon, Oct 27, 2008 at 07:33:12PM +0100, Ingo Molnar (mingo@elte.hu) wrote:
> > > The moment there's real IO it becomes harder to analyze but the same 
> > > basic behavior remains: the more unfair the IO scheduler, the "better" 
> > > dbench results we get.
> > 
> > Right now there is no disk IO at all. Only quite usual network and
> > process load.
> 
> I think the hope is that by saying there isn't a problem enough times,
> it will become truth. :-)
> 
> More seriously, Ingo, what in the world do we need to do in order to get
> you to start doing tbench runs and optimizing things (read as: fixing
> the regression you added)?
> 
> I'm personally working on a test fibonacci heap implementation for
> the fair sched code, and I already did all of the cost analysis all
> the way back to the 2.6.22 pre-CFS days.
> 
> But I'm NOT a scheduler developer, so it isn't my responsibility to do
> this crap for you.  You added this regression, why do I have to get my
> hands dirty in order for there to be some hope that these regressions
> start to get fixed?

I don't want to ruffle any feathers, but my box has a comment or two..

Has anyone looked at the numbers the box emitted?  Some of what I believe
to be very interesting data points may have been overlooked.

Here's a piece thereof again, for better or worse.  One last post won't
burn the last electron.  If they don't agree with anyone else's numbers,
that's OK; their numbers have meaning too, and they speak for themselves.

Retest hrtick pain:

2.6.26.7-up virgin no highres timers enabled
ring-test   - 1.155 us/cycle  = 865 KHz                                 1.000
netperf     - 130470.93 130771.00 129872.41 rr/s    avg 130371.44 rr/s  1.000 (within jitter of previous tests)
tbench      - 355.153 357.163 356.836 MB/sec        avg 356.384 MB/sec  1.000

2.6.26.7-up virgin highres timers enabled, hrtick enabled
ring-test   - 1.368 us/cycle  = 730 KHz                                  .843
netperf     - 118959.08 118853.16 117761.42 rr/s    avg 118524.55 rr/s   .909
tbench      - 340.999 338.655 340.005 MB/sec        avg 339.886 MB/sec   .953

OK, there's the hrtick regression in all its gory detail.  Ouch, that hurt.  

Remember those numbers, box muttered them again in 27 testing.  These
previously tested kernels don't even have highres timers enabled, so
obviously hrtick is a non-issue for them.

2.6.26.6-up + clock + buddy + weight
ring-test   - 1.234 us/cycle  = 810 KHz                                  .947 [cmp1]
netperf     - 128026.62 128118.48 127973.54 rr/s    avg 128039.54 rr/s   .977
tbench      - 342.011 345.307 343.535 MB/sec        avg 343.617 MB/sec   .964

2.6.26.6-up + clock + buddy + weight + revert_to_per_rq_vruntime + buddy_overhead
ring-test   - 1.174 us/cycle  = 851 KHz                                  .995 [cmp2]
netperf     - 133928.03 134265.41 134297.06 rr/s    avg 134163.50 rr/s  1.024
tbench      - 358.049 359.529 358.342 MB/sec        avg 358.640 MB/sec  1.006

Note that I added all .27 additional scheduler overhead to .26, and then
removed every last bit of it, theoretically leaving nothing but improved
clock accuracy in the wake.  The ring-test number indicates that our max
context switch rate was thereby indeed fully recovered.  We even got a
modest throughput improvement for our trouble.

However.. 
                                                       versus .26 counterpart
2.6.27-up virgin
ring-test   - 1.193 us/cycle  = 838 KHz                                 1.034 [vs cmp1]
netperf     - 121293.48 121700.96 120716.98 rr/s    avg 121237.14 rr/s   .946
tbench      - 340.362 339.780 341.353 MB/sec        avg 340.498 MB/sec   .990

2.6.27-up + revert_to_per_rq_vruntime + buddy_overhead
ring-test   - 1.122 us/cycle  = 891 KHz                                 1.047 [vs cmp2]
netperf     - 119353.27 118600.98 119719.12 rr/s    avg 119224.45 rr/s   .900
tbench      - 338.701 338.508 338.562 MB/sec        avg 338.590 MB/sec   .951

..removing the overhead from .27 does not produce the anticipated result
despite a max context switch rate markedly above that of 2.6.26.

There lies an as yet unaddressed regression IMBHO.  The hrtick has been
addressed.  It sucked at high frequency, and it's gone.  The added math
overhead in .27 hurt some too, and is now history as well.

These two regressions are nearly identical in magnitude per box.

I don't know who owns that regression, neither does box or git.  I'm not
pointing fingers in any direction.  I've walked the regression hunting
path, and know first-hand how rocky that path is.

There are other things along the regression path that are worth noting:

Three of the releases I tested were tested with identical schedulers,
cfs-v24.1, yet they produced markedly different output, output which
regresses.  Again, I'm not pointing fingers, I'm merely illustrating how
rocky this regression hunting path is.  In 25, the sum of all kernel
changes dropped our max switch rate markedly, yet both tbench and
netperf _improved_ markedly.  More rocks in the road.  etc etc etc.

To really illustrate rockiness, cutting network config down from distro
lard-ball to something leaner and meaner took SMP throughput from this
(was only testing netperf at that time) on 19 Aug..

2.6.22.19 pinned
16384  87380  1        1       300.00   59866.40   
16384  87380  1        1       300.01   59852.78   
16384  87380  1        1       300.01   59618.48   
16384  87380  1        1       300.01   59655.35   

..to this on 13 Sept..

2.6.22.19 (also pinned)
Throughput 1136.02 MB/sec 4 procs

16384  87380  1        1       60.01    94179.12
16384  87380  1        1       60.01    88780.61
16384  87380  1        1       60.01    91057.72
16384  87380  1        1       60.01    94242.16

..and to this on 15 Sept.

2.6.22.19 (also pinned)
Throughput 1250.73 MB/sec 4 procs                  1.00

16384  87380  1        1       60.01    111272.55  1.00
16384  87380  1        1       60.00    104689.58
16384  87380  1        1       60.00    110733.05
16384  87380  1        1       60.00    110748.88

2.6.22.19-cfs-v24.1

Throughput 1204.14 MB/sec 4 procs                  .962

16384  87380  1        1       60.01    101799.85  .929
16384  87380  1        1       60.01    101659.41
16384  87380  1        1       60.01    101628.78
16384  87380  1        1       60.01    101700.53

wakeup granularity = 0 (make scheduler as preempt happy as 2.6.22 is)

Throughput 1213.21 MB/sec 4 procs                  .970

16384  87380  1        1       60.01    108569.27  .992
16384  87380  1        1       60.01    108541.04
16384  87380  1        1       60.00    108579.63
16384  87380  1        1       60.01    108519.09

Is that a rock in my "let's double, triple, quintuple examine scheduler
performance along the regression path" or what?  Same box, same
benchmarks and same schedulers I've been examining the whole time.

.992 and .970.

The list goes on and on and on, including SCHED_RR testing where I saw
regression despite no CFS math.  My point here is that every little
change of anything changes the picture up to and including radically.
These configuration changes, if viewed in regression terms, are HUGE.
Build a fully enabled netfilter into the kernel vs modular, and it
becomes even more so. 

The picture with UP config is different, but as far as box is concerned,
while scheduler involvement is certainly interesting, there are even
more interesting places.  Somewhere.

Hopefully this post won't be viewed in the rather cynical light of your
first quoted stanza.  Box is incapable of such, and I have no incentive
to do such ;-)  I just run the benchmarks, collect whatever numbers box
feels like emitting, and run around trying to find the missing bits.

	-Mike


^ permalink raw reply	[flat|nested] 94+ messages in thread

* Re: [tbench regression fixes]: digging out smelly deadmen.
  2008-10-28 10:24                                               ` Mike Galbraith
@ 2008-10-28 10:37                                                 ` Ingo Molnar
  2008-10-28 10:57                                                   ` Mike Galbraith
  2008-10-29  9:14                                                   ` Evgeniy Polyakov
  0 siblings, 2 replies; 94+ messages in thread
From: Ingo Molnar @ 2008-10-28 10:37 UTC (permalink / raw)
  To: Mike Galbraith
  Cc: David Miller, zbr, alan, jkosina, akpm, a.p.zijlstra, rjw,
	s0mbre, linux-kernel, netdev


* Mike Galbraith <efault@gmx.de> wrote:

> ..removing the overhead from .27 does not produce the anticipated 
> result despite a max context switch rate markedly above that of 
> 2.6.26.
> 
> There lies an as yet unaddressed regression IMBHO.  The hrtick has 
> been addressed.  It sucked at high frequency, and it's gone.  The 
> added math overhead in .27 hurt some too, and is now history as 
> well.

thanks Mike for the _extensive_ testing and bug hunting session you've 
done in the past couple of weeks! All the relevant fixlets you found 
are now queued up properly in sched/urgent, correct?

What's your gut feeling, is that remaining small regression scheduler 
or networking related?

i'm cutting the ball in half and i'm passing over one half of it to 
the networking folks, because your numbers show _huge_ sensitivity in 
this workload, depending on networking settings:

> To really illustrate rockiness, cutting network config down from distro
> lard-ball to something leaner and meaner took SMP throughput from this
> (was only testing netperf at that time) on 19 Aug..
> 
> 2.6.22.19 pinned
> 16384  87380  1        1       300.00   59866.40   

> 2.6.22.19 (also pinned)
> 16384  87380  1        1       60.01    94179.12

> 2.6.22.19 (also pinned)
> 16384  87380  1        1       60.01    111272.55  1.00

any scheduler micro-overhead detail is going to be a drop in the 
ocean, compared to such huge variations. We could change the scheduler 
to the old O(N) design of the 2.2 kernel and the impact of that would 
be a blip on the radar, compared to the overhead shown above.

	Ingo

^ permalink raw reply	[flat|nested] 94+ messages in thread

* Re: [tbench regression fixes]: digging out smelly deadmen.
  2008-10-28 10:37                                                 ` Ingo Molnar
@ 2008-10-28 10:57                                                   ` Mike Galbraith
  2008-10-28 11:02                                                     ` Ingo Molnar
  2008-10-28 14:00                                                     ` Mike Galbraith
  2008-10-29  9:14                                                   ` Evgeniy Polyakov
  1 sibling, 2 replies; 94+ messages in thread
From: Mike Galbraith @ 2008-10-28 10:57 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: David Miller, zbr, alan, jkosina, akpm, a.p.zijlstra, rjw,
	s0mbre, linux-kernel, netdev

On Tue, 2008-10-28 at 11:37 +0100, Ingo Molnar wrote:
> * Mike Galbraith <efault@gmx.de> wrote:
> 
> > ..removing the overhead from .27 does not produce the anticipated 
> > result despite a max context switch rate markedly above that of 
> > 2.6.26.
> > 
> > There lies an as yet unaddressed regression IMBHO.  The hrtick has 
> > been addressed.  It sucked at high frequency, and it's gone.  The 
> > added math overhead in .27 hurt some too, and is now history as 
> > well.
> 
> thanks Mike for the _extensive_ testing and bug hunting session you've 
> done in the past couple of weeks! All the relevant fixlets you found 
> are now queued up properly in sched/urgent, correct?

Yeah.

> What's your gut feeling, is that remaining small regression scheduler 
> or networking related?

I don't know where it lives.  I'm still looking, and the numbers are
still playing games with my head.

> i'm cutting the ball in half and i'm passing over one half of it to 
> the networking folks, because your numbers show _huge_ sensitivity in 
> this workload, depending on networking settings:

I strongly _suspect_ that the network folks have some things they could
investigate, but given my utter failure at finding the smoking gun, I
can't say one way or the other.  IMHO, sharing with the network folks
would likely turn out to be a fair thing to do.

Am I waffling?  Me?  You bet your a$$! My clock is already squeaky clean
thank you very much :-)

What I can say is that my box is quite certain that there are influences
outside the scheduler which have more influence on benchmark results than
the scheduler does throughout the life of testing.

	-Mike


^ permalink raw reply	[flat|nested] 94+ messages in thread

* Re: [tbench regression fixes]: digging out smelly deadmen.
  2008-10-28 10:57                                                   ` Mike Galbraith
@ 2008-10-28 11:02                                                     ` Ingo Molnar
  2008-10-28 14:00                                                     ` Mike Galbraith
  1 sibling, 0 replies; 94+ messages in thread
From: Ingo Molnar @ 2008-10-28 11:02 UTC (permalink / raw)
  To: Mike Galbraith
  Cc: David Miller, zbr, alan, jkosina, akpm, a.p.zijlstra, rjw,
	s0mbre, linux-kernel, netdev


* Mike Galbraith <efault@gmx.de> wrote:

> What I can say is that my box is quite certain that there are 
> influences outside the scheduler which have more influence on benchmark 
> results than the scheduler does throughout the life of testing.

okay, that's an important observation.

	Ingo

^ permalink raw reply	[flat|nested] 94+ messages in thread

* Re: [tbench regression fixes]: digging out smelly deadmen.
  2008-10-28 10:57                                                   ` Mike Galbraith
  2008-10-28 11:02                                                     ` Ingo Molnar
@ 2008-10-28 14:00                                                     ` Mike Galbraith
  2008-10-28 15:22                                                       ` Mike Galbraith
  1 sibling, 1 reply; 94+ messages in thread
From: Mike Galbraith @ 2008-10-28 14:00 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: David Miller, zbr, alan, jkosina, akpm, a.p.zijlstra, rjw,
	s0mbre, linux-kernel, netdev

On Tue, 2008-10-28 at 11:57 +0100, Mike Galbraith wrote:

> I don't know where it lives.  I'm still looking, and the numbers are
> still playing games with my head.

Hm.  _Maybe_ someone needs to take a look at c7aceab.  I took it to a 26
test tree yesterday, and it lowered my throughput, though I didn't repeat
it much; I was too busy.  I just backed it out of one of my 27 test
trees, and the netperf number is 1.030, tbench is 1.040.  I'll test this
in virgin source later, but thought I should drop a note, so perhaps
someone interested in this thread can confirm/deny the loss.

	-Mike


^ permalink raw reply	[flat|nested] 94+ messages in thread

* Re: [tbench regression fixes]: digging out smelly deadmen.
  2008-10-28 14:00                                                     ` Mike Galbraith
@ 2008-10-28 15:22                                                       ` Mike Galbraith
  0 siblings, 0 replies; 94+ messages in thread
From: Mike Galbraith @ 2008-10-28 15:22 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: David Miller, zbr, alan, jkosina, akpm, a.p.zijlstra, rjw,
	s0mbre, linux-kernel, netdev

On Tue, 2008-10-28 at 15:00 +0100, Mike Galbraith wrote:
> On Tue, 2008-10-28 at 11:57 +0100, Mike Galbraith wrote:
> 
> > I don't know where it lives.  I'm still looking, and the numbers are
> > still playing games with my head.
> 
> Hm.  _Maybe_ someone needs to take a look at c7aceab.

Bah, too much testing, must have done something stupid.

	-Mike


^ permalink raw reply	[flat|nested] 94+ messages in thread

* Re: [tbench regression fixes]: digging out smelly deadmen.
  2008-10-28 10:37                                                 ` Ingo Molnar
  2008-10-28 10:57                                                   ` Mike Galbraith
@ 2008-10-29  9:14                                                   ` Evgeniy Polyakov
  2008-10-29  9:50                                                     ` Evgeniy Polyakov
  1 sibling, 1 reply; 94+ messages in thread
From: Evgeniy Polyakov @ 2008-10-29  9:14 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Mike Galbraith, David Miller, alan, jkosina, akpm, a.p.zijlstra,
	rjw, s0mbre, linux-kernel, netdev

Hi.

Cooled down? Let's start over the sched vs network fight.

On Tue, Oct 28, 2008 at 11:37:41AM +0100, Ingo Molnar (mingo@elte.hu) wrote:
> What's your gut feeling, is that remaining small regression scheduler 
> or networking related?
> 
> i'm cutting the ball in half and i'm passing over one half of it to 
> the networking folks, because your numbers show _huge_ sensitivity in 
> this workload, depending on networking settings:

Sorry for interrupting your conversation, Ingo, but before throwing a
stone one should be in the clear himself, shouldn't he? When you asked me
to test the -tip tree and it was shown that it regressed by about 20 MB/s
in my test against the last tweaks you suggested, -tip was still merged
and no work was done on this issue. Now the previous tweaks (nohrticks
and nobalance; and although hrticks are now disabled, performance did not
return to that of vanilla .27 with the tweaks) do not help anymore, so
apparently there is an additional problem.

So, for reference (tbench throughput in MB/s):
vanilla 27	: 347.222
no TSO/GSO	: 357.331
no hrticks	: 382.983
no balance	: 389.802
4403b4 commit	: 361.184
dirty_ratio-50	: 361.086
no-sched-tweaks	: 361.367

So the scheduler _does_ regress even now, while this thread is being
discussed.

Now let's return to the network. Ilpo Järvinen showed a nasty modulo
operation in the fast path, which David is thinking about how to resolve,
but it turned out that this change was introduced back in 2005, and
although a naive change allows performance to increase up to 370 MB/s,
i.e. it gained us 2.5%, this was never accounted for in the previous
changes.

So, probably, if we revert the -tip merge to vanilla .27, add the
nohrtick patch and the nobalance tweak _only_, and apply the naive TSO
patch, we could bring the system to 400 MB/s. Note that .22 gets 479.82
and .23 gets 454.36 MB/s.

-- 
	Evgeniy Polyakov

^ permalink raw reply	[flat|nested] 94+ messages in thread

* Re: [tbench regression fixes]: digging out smelly deadmen.
  2008-10-29  9:14                                                   ` Evgeniy Polyakov
@ 2008-10-29  9:50                                                     ` Evgeniy Polyakov
  2008-11-01 12:51                                                       ` Paolo Ciarrocchi
  0 siblings, 1 reply; 94+ messages in thread
From: Evgeniy Polyakov @ 2008-10-29  9:50 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Mike Galbraith, David Miller, alan, jkosina, akpm, a.p.zijlstra,
	rjw, s0mbre, linux-kernel, netdev

On Wed, Oct 29, 2008 at 12:14:05PM +0300, Evgeniy Polyakov (zbr@ioremap.net) wrote:
> vanilla 27	: 347.222
> no TSO/GSO	: 357.331
> no hrticks	: 382.983
> no balance	: 389.802
> 4403b4 commit	: 361.184
> dirty_ratio-50	: 361.086
> no-sched-tweaks	: 361.367
> 
> So, probably, if we revert -tip merge to vanilla .27, add nohrtick patch
> and nobalance tweak _only_, and apply naive TSO patch we could bring
> system to 400 MB/s. Note, that .22 has 479.82 and .23 454.36 MB/s.

And now I have to admit that the very last -tip merge did noticeably
improve the situation, up to 391.331 MB/s (189 in domains, with TSO/GSO
off and the naive tcp_tso_should_defer() change).

So we are now essentially at the level of 24-25 trees in my tests.

-- 
	Evgeniy Polyakov

^ permalink raw reply	[flat|nested] 94+ messages in thread

* Re: [tbench regression fixes]: digging out smelly deadmen.
  2008-10-27 18:33                                         ` Ingo Molnar
  2008-10-27 19:39                                           ` Evgeniy Polyakov
@ 2008-10-29  9:59                                           ` Nick Piggin
  1 sibling, 0 replies; 94+ messages in thread
From: Nick Piggin @ 2008-10-29  9:59 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Alan Cox, Jiri Kosina, Andrew Morton, Peter Zijlstra,
	Mike Galbraith, David Miller, rjw, s0mbre, linux-kernel, netdev

On Tuesday 28 October 2008 05:33, Ingo Molnar wrote:
> * Alan Cox <alan@lxorguk.ukuu.org.uk> wrote:
> > > The way to get the best possible dbench numbers in CPU-bound dbench
> > > runs, you have to throw away the scheduler completely, and do this
> > > instead:
> > >
> > >  - first execute all requests of client 1
> > >  - then execute all requests of client 2
> > >  ....
> > >  - execute all requests of client N
> >
> > Rubbish. [...]
>
> i've actually implemented that about a decade ago: i've tracked down
> what makes dbench tick, i've implemented the kernel heuristics for it
> to make dbench scale linearly with the number of clients - just to be
> shot down by Linus about my utter rubbish approach ;-)
>
> > [...] If you do that you'll not get enough I/O in parallel to
> > schedule the disk well (not that most of our I/O schedulers are
> > doing the job well, and the vm writeback threads then mess it up and
> > the lack of Arjans ioprio fixes then totally screw you) </rant>
>
> the best dbench results come from systems that have enough RAM to
> cache the full working set, and a filesystem intelligent enough to not
> insert bogus IO serialization cycles (ext3 is not such a filesystem).

You can get good dbench results from dbench on tmpfs, which exercises
the VM, VFS, scheduler, etc. without IO or filesystems.

^ permalink raw reply	[flat|nested] 94+ messages in thread

* Re: [tbench regression fixes]: digging out smelly deadmen.
  2008-10-26 12:23                 ` Evgeniy Polyakov
@ 2008-10-30 18:15                   ` Stephen Hemminger
  2008-10-30 18:40                     ` Evgeniy Polyakov
                                       ` (2 more replies)
  0 siblings, 3 replies; 94+ messages in thread
From: Stephen Hemminger @ 2008-10-30 18:15 UTC (permalink / raw)
  To: Evgeniy Polyakov
  Cc: Rafael J. Wysocki, Ingo Molnar, Evgeniy Polyakov, Peter Zijlstra,
	linux-kernel, netdev, David Miller, Mike Galbraith,
	Andrew Morton

Has anyone looked into the impact of port randomization on this benchmark.
If it is generating lots of sockets quickly there could be an impact:
  * port randomization causes available port space to get filled non-uniformly
    and what was once a linear scan may have to walk over existing ports.
    (This could be improved by a hint bitmap)

  * port randomization adds at least one modulus operation per socket
    creation. This could be optimized by using a loop instead.
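
To make that second point concrete, here is a toy userspace sketch of the
two strategies.  This is not the kernel's actual port-selection code; the
port range, the port_in_use() stand-in and both helper names are made up
purely for illustration:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define PORT_LOW   32768
#define PORT_HIGH  61000
#define RANGE      (PORT_HIGH - PORT_LOW)

/* toy stand-in for "is this local port already taken?" */
static bool port_in_use(uint16_t port)
{
	return (port & 0x7) == 0;
}

/* randomized start, one modulus per probe */
static int pick_port_mod(uint32_t rnd)
{
	for (int i = 0; i < RANGE; i++) {
		uint16_t port = PORT_LOW + (rnd + i) % RANGE;

		if (!port_in_use(port))
			return port;
	}
	return -1;		/* range exhausted */
}

/* randomized start computed once, then wrap with a compare instead of '%' */
static int pick_port_wrap(uint32_t rnd)
{
	uint16_t port = PORT_LOW + rnd % RANGE;	/* single modulus up front */

	for (int i = 0; i < RANGE; i++) {
		if (!port_in_use(port))
			return port;
		if (++port >= PORT_HIGH)
			port = PORT_LOW;
	}
	return -1;
}

int main(void)
{
	uint32_t rnd = (uint32_t)random();

	printf("mod: %d  wrap: %d\n", pick_port_mod(rnd), pick_port_wrap(rnd));
	return 0;
}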

^ permalink raw reply	[flat|nested] 94+ messages in thread

* Re: [tbench regression fixes]: digging out smelly deadmen.
  2008-10-30 18:15                   ` Stephen Hemminger
@ 2008-10-30 18:40                     ` Evgeniy Polyakov
  2008-10-30 18:43                     ` Eric Dumazet
  2008-10-30 19:01                     ` Ilpo Järvinen
  2 siblings, 0 replies; 94+ messages in thread
From: Evgeniy Polyakov @ 2008-10-30 18:40 UTC (permalink / raw)
  To: Stephen Hemminger
  Cc: Rafael J. Wysocki, Ingo Molnar, Evgeniy Polyakov, Peter Zijlstra,
	linux-kernel, netdev, David Miller, Mike Galbraith,
	Andrew Morton

On Thu, Oct 30, 2008 at 11:15:26AM -0700, Stephen Hemminger (shemminger@vyatta.com) wrote:
> Has anyone looked into the impact of port randomization on this benchmark.
> If it is generating lots of sockets quickly there could be an impact:
>   * port randomization causes available port space to get filled non-uniformly
>     and what was once a linear scan may have to walk over existing ports.
>     (This could be improved by a hint bitmap)
> 
>   * port randomization adds at least one modulus operation per socket
>     creation. This could be optimized by using a loop instead.

In this benchmark only two sockets are created per client for the whole
run, so this should not have any impact on performance.

-- 
	Evgeniy Polyakov

^ permalink raw reply	[flat|nested] 94+ messages in thread

* Re: [tbench regression fixes]: digging out smelly deadmen.
  2008-10-30 18:15                   ` Stephen Hemminger
  2008-10-30 18:40                     ` Evgeniy Polyakov
@ 2008-10-30 18:43                     ` Eric Dumazet
  2008-10-30 18:56                       ` Eric Dumazet
  2008-10-30 19:01                     ` Ilpo Järvinen
  2 siblings, 1 reply; 94+ messages in thread
From: Eric Dumazet @ 2008-10-30 18:43 UTC (permalink / raw)
  To: Stephen Hemminger
  Cc: Evgeniy Polyakov, Rafael J. Wysocki, Ingo Molnar,
	Evgeniy Polyakov, Peter Zijlstra, linux-kernel, netdev,
	David Miller, Mike Galbraith, Andrew Morton

Stephen Hemminger a écrit :
> Has anyone looked into the impact of port randomization on this benchmark.
> If it is generating lots of sockets quickly there could be an impact:
>   * port randomization causes available port space to get filled non-uniformly
>     and what was once a linear scan may have to walk over existing ports.
>     (This could be improved by a hint bitmap)
> 
>   * port randomization adds at least one modulus operation per socket
>     creation. This could be optimized by using a loop instead.



tbench sets up one socket per client, then sends/receives lots of messages on this socket.

Connection setup time can be ignored for the tbench regression analysis.



^ permalink raw reply	[flat|nested] 94+ messages in thread

* Re: [tbench regression fixes]: digging out smelly deadmen.
  2008-10-30 18:43                     ` Eric Dumazet
@ 2008-10-30 18:56                       ` Eric Dumazet
  0 siblings, 0 replies; 94+ messages in thread
From: Eric Dumazet @ 2008-10-30 18:56 UTC (permalink / raw)
  To: Stephen Hemminger
  Cc: Evgeniy Polyakov, Rafael J. Wysocki, Ingo Molnar,
	Evgeniy Polyakov, Peter Zijlstra, linux-kernel, netdev,
	David Miller, Mike Galbraith, Andrew Morton

Eric Dumazet a écrit :
> Stephen Hemminger a écrit :
>> Has anyone looked into the impact of port randomization on this 
>> benchmark.
>> If it is generating lots of sockets quickly there could be an impact:
>>   * port randomization causes available port space to get filled 
>> non-uniformly
>>     and what was once a linear scan may have to walk over existing ports.
>>     (This could be improved by a hint bitmap)
>>
>>   * port randomization adds at least one modulus operation per socket
>>     creation. This could be optimized by using a loop instead.
> 
> 
> 
> tbench setups one socket per client, then send/receive lot of messages 
> on this socket.
> 
> Connection setup time can be ignored for the tbench regression analysis
> 

Hum, re-reading your question, I feel you might have a valid point after all :)

Not because of connection setup time, but because of the rwlocks used on the TCP hash table.

The TCP sessions used in this tbench test might now be on the same cache lines,
because of port randomization or so.

CPUs might do cache-line ping-pongs on those rwlocks.

# netstat -tn|grep 7003
tcp        0     59 127.0.0.1:37248         127.0.0.1:7003          ESTABLISHED
tcp        0     71 127.0.0.1:7003          127.0.0.1:37252         ESTABLISHED
tcp        0      0 127.0.0.1:37251         127.0.0.1:7003          ESTABLISHED
tcp        0   4155 127.0.0.1:7003          127.0.0.1:37249         ESTABLISHED
tcp        0     55 127.0.0.1:7003          127.0.0.1:37248         ESTABLISHED
tcp        0      0 127.0.0.1:37252         127.0.0.1:7003          ESTABLISHED
tcp        0      0 127.0.0.1:37249         127.0.0.1:7003          ESTABLISHED
tcp        0     59 127.0.0.1:37246         127.0.0.1:7003          ESTABLISHED
tcp        0      0 127.0.0.1:37250         127.0.0.1:7003          ESTABLISHED
tcp       71      0 127.0.0.1:37245         127.0.0.1:7003          ESTABLISHED
tcp        0      0 127.0.0.1:37244         127.0.0.1:7003          ESTABLISHED
tcp        0     87 127.0.0.1:7003          127.0.0.1:37250         ESTABLISHED
tcp        0   4155 127.0.0.1:7003          127.0.0.1:37251         ESTABLISHED
tcp        0   4155 127.0.0.1:7003          127.0.0.1:37246         ESTABLISHED
tcp        0     71 127.0.0.1:7003          127.0.0.1:37245         ESTABLISHED
tcp        0   4155 127.0.0.1:7003          127.0.0.1:37244         ESTABLISHED

We use a jhash, so normally we could expect a really random split of hash values
for all these sessions, but it would be worth checking :)
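
A quick userspace sketch of how one could do that check, feeding the
4-tuples from the netstat output above into a hash and printing the
bucket each session would land in.  The mix function here is a
Murmur3-style finalizer standing in for the kernel's jhash, and the table
size is an assumption, so this only illustrates the method:

#include <stdint.h>
#include <stdio.h>

#define EHASH_BUCKETS 65536		/* assumed table size, power of two */

/* Murmur3-style 32-bit finalizer, used here only as a jhash stand-in */
static uint32_t mix32(uint32_t h)
{
	h ^= h >> 16;  h *= 0x85ebca6b;
	h ^= h >> 13;  h *= 0xc2b2ae35;
	h ^= h >> 16;
	return h;
}

static unsigned int bucket(uint32_t saddr, uint32_t daddr,
			   uint16_t sport, uint16_t dport)
{
	uint32_t h = mix32(saddr ^ daddr ^ ((uint32_t)sport << 16 | dport));

	return h & (EHASH_BUCKETS - 1);
}

int main(void)
{
	/* local ports taken from the netstat dump above, all talking to :7003 */
	static const uint16_t ports[] = {
		37244, 37245, 37246, 37248, 37249, 37250, 37251, 37252,
	};
	uint32_t lo = 0x7f000001;	/* 127.0.0.1 */

	for (unsigned int i = 0; i < sizeof(ports) / sizeof(ports[0]); i++)
		printf("port %u -> bucket %u\n", (unsigned int)ports[i],
		       bucket(lo, lo, ports[i], 7003));
	return 0;
}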

You now understand why we want to avoid those rwlocks, Stephen, and switch to RCU...


^ permalink raw reply	[flat|nested] 94+ messages in thread

* Re: [tbench regression fixes]: digging out smelly deadmen.
  2008-10-30 18:15                   ` Stephen Hemminger
  2008-10-30 18:40                     ` Evgeniy Polyakov
  2008-10-30 18:43                     ` Eric Dumazet
@ 2008-10-30 19:01                     ` Ilpo Järvinen
  2008-10-31  7:52                       ` David Miller
  2 siblings, 1 reply; 94+ messages in thread
From: Ilpo Järvinen @ 2008-10-30 19:01 UTC (permalink / raw)
  To: Stephen Hemminger
  Cc: Evgeniy Polyakov, Rafael J. Wysocki, Ingo Molnar,
	Evgeniy Polyakov, Peter Zijlstra, LKML, Netdev, David Miller,
	Mike Galbraith, Andrew Morton

On Thu, 30 Oct 2008, Stephen Hemminger wrote:

> Has anyone looked into the impact of port randomization on this benchmark.
> If it is generating lots of sockets quickly there could be an impact:
>   * port randomization causes available port space to get filled non-uniformly
>     and what was once a linear scan may have to walk over existing ports.
>     (This could be improved by a hint bitmap)
> 
>   * port randomization adds at least one modulus operation per socket
>     creation. This could be optimized by using a loop instead.

I did something with AIM9's tcp_test recently (1-2 days ago, depending on
how one calculates that, so I didn't yet have time to summarize the details
in the AIM9 thread) by deterministically binding in userspace, and got much
more sensible numbers than with randomized ports (2-4%/5-7% vs 25%
variation; some difference in variation between different kernel versions
even with deterministic binding). Also, I have yet to actually oprofile and
bisect the remaining ~4% regression (around 20% was reported by Christoph).
For oprofiling I might have to change aim9 to do a predefined number of
loops instead of a deadline, to get a more consistent view of changes in
per-function runtime.

AIM9 is one process only, so the scheduler has a bit less to do in that
benchmark anyway.

It would probably be nice to test just the port randomizer separately to
see if there's some regression in that, but I don't expect that to happen
any time soon unless I quickly come up with something in the bisection.


-- 
 i.

^ permalink raw reply	[flat|nested] 94+ messages in thread

* Re: [tbench regression fixes]: digging out smelly deadmen.
  2008-10-30 19:01                     ` Ilpo Järvinen
@ 2008-10-31  7:52                       ` David Miller
  2008-10-31  9:40                         ` Ilpo Järvinen
  0 siblings, 1 reply; 94+ messages in thread
From: David Miller @ 2008-10-31  7:52 UTC (permalink / raw)
  To: ilpo.jarvinen
  Cc: shemminger, zbr, rjw, mingo, s0mbre, a.p.zijlstra, linux-kernel,
	netdev, efault, akpm

From: "Ilpo Järvinen" <ilpo.jarvinen@helsinki.fi>
Date: Thu, 30 Oct 2008 21:01:19 +0200 (EET)

> On Thu, 30 Oct 2008, Stephen Hemminger wrote:
> 
> > Has anyone looked into the impact of port randomization on this benchmark.
> > If it is generating lots of sockets quickly there could be an impact:
> >   * port randomization causes available port space to get filled non-uniformly
> >     and what was once a linear scan may have to walk over existing ports.
> >     (This could be improved by a hint bitmap)
> > 
> >   * port randomization adds at least one modulus operation per socket
> >     creation. This could be optimized by using a loop instead.
> 
> I did something with AIM9's tcp_test recently (1-2 days ago depending on 
> how one calculates that so didn't yet have time summarize the details in 
> the AIM9 thread) by deterministicly binding in userspace and got much more 
> sensible numbers than with randomized ports (2-4%/5-7% vs 25% variation 
> some difference in variation in different kernel versions even with 
> deterministic binding). Also, I'm still to actually oprofile and bisect 
> the remaining ~4% regression (around 20% was reported by Christoph). For 
> oprofiling I might have to change aim9 to do predefined number of loops 
> instead of a deadline to get more consistent view on changes in per func 
> runtime.

Yes, it looks like port selection cache and locking effects are
a very real issue.

Good find.

^ permalink raw reply	[flat|nested] 94+ messages in thread

* Re: [tbench regression fixes]: digging out smelly deadmen.
  2008-10-31  7:52                       ` David Miller
@ 2008-10-31  9:40                         ` Ilpo Järvinen
  2008-10-31  9:51                           ` David Miller
  0 siblings, 1 reply; 94+ messages in thread
From: Ilpo Järvinen @ 2008-10-31  9:40 UTC (permalink / raw)
  To: David Miller
  Cc: shemminger, zbr, rjw, mingo, s0mbre, a.p.zijlstra, LKML, Netdev,
	efault, Andrew Morton

[-- Attachment #1: Type: TEXT/PLAIN, Size: 2152 bytes --]

On Fri, 31 Oct 2008, David Miller wrote:

> From: "Ilpo Järvinen" <ilpo.jarvinen@helsinki.fi>
> Date: Thu, 30 Oct 2008 21:01:19 +0200 (EET)
> 
> > On Thu, 30 Oct 2008, Stephen Hemminger wrote:
> > 
> > > Has anyone looked into the impact of port randomization on this benchmark.
> > > If it is generating lots of sockets quickly there could be an impact:
> > >   * port randomization causes available port space to get filled non-uniformly
> > >     and what was once a linear scan may have to walk over existing ports.
> > >     (This could be improved by a hint bitmap)
> > > 
> > >   * port randomization adds at least one modulus operation per socket
> > >     creation. This could be optimized by using a loop instead.
> > 
> > I did something with AIM9's tcp_test recently (1-2 days ago depending on 
> > how one calculates that so didn't yet have time summarize the details in 
> > the AIM9 thread) by deterministicly binding in userspace and got much more 
> > sensible numbers than with randomized ports (2-4%/5-7% vs 25% variation 
> > some difference in variation in different kernel versions even with 
> > deterministic binding). Also, I'm still to actually oprofile and bisect 
> > the remaining ~4% regression (around 20% was reported by Christoph). For 
> > oprofiling I might have to change aim9 to do predefined number of loops 
> > instead of a deadline to get more consistent view on changes in per func 
> > runtime.
> 
> Yes, it looks like port selection cache and locking effects are
> a very real issue.
> 
> Good find.

Let me remind that it is just a single process, so no ping-pong & other 
lock related cache effects should play any significant role here, no? (I'm 
no expert though :-)).

One thing I didn't mention earlier is that I also turned on
tcp_tw_recycle to get the binding to work without getting
-ESOMETHING very early (I also did some possibly meaningless
things, like drop_caches before each test run; these might
matter only because the test harness causes minor
variations). I intend to try without binding the client end,
but I guess I might again get more variation between different
test runs.

-- 
 i.

^ permalink raw reply	[flat|nested] 94+ messages in thread

* Re: [tbench regression fixes]: digging out smelly deadmen.
  2008-10-31  9:40                         ` Ilpo Järvinen
@ 2008-10-31  9:51                           ` David Miller
  2008-10-31 10:42                             ` Ilpo Järvinen
  2008-10-31 10:45                             ` Eric Dumazet
  0 siblings, 2 replies; 94+ messages in thread
From: David Miller @ 2008-10-31  9:51 UTC (permalink / raw)
  To: ilpo.jarvinen
  Cc: shemminger, zbr, rjw, mingo, s0mbre, a.p.zijlstra, linux-kernel,
	netdev, efault, akpm

From: "Ilpo Järvinen" <ilpo.jarvinen@helsinki.fi>
Date: Fri, 31 Oct 2008 11:40:16 +0200 (EET)

> Let me remind that it is just a single process, so no ping-pong & other 
> lock related cache effects should play any significant role here, no? (I'm 
> no expert though :-)).

Not locks or ping-pongs perhaps, I guess.  So it just sends and
receives over a socket, implementing both ends of the communication
in the same process?

If hash chain conflicts do happen for those 2 sockets, just traversing
the chain 2 entries deep could show up.

^ permalink raw reply	[flat|nested] 94+ messages in thread

* Re: [tbench regression fixes]: digging out smelly deadmen.
  2008-10-31  9:51                           ` David Miller
@ 2008-10-31 10:42                             ` Ilpo Järvinen
  2008-10-31 10:45                             ` Eric Dumazet
  1 sibling, 0 replies; 94+ messages in thread
From: Ilpo Järvinen @ 2008-10-31 10:42 UTC (permalink / raw)
  To: David Miller
  Cc: shemminger, zbr, rjw, mingo, s0mbre, a.p.zijlstra, LKML, Netdev,
	efault, Andrew Morton

[-- Attachment #1: Type: TEXT/PLAIN, Size: 1471 bytes --]

On Fri, 31 Oct 2008, David Miller wrote:

> From: "Ilpo Järvinen" <ilpo.jarvinen@helsinki.fi>
> Date: Fri, 31 Oct 2008 11:40:16 +0200 (EET)
> 
> > Let me remind that it is just a single process, so no ping-pong & other 
> > lock related cache effects should play any significant role here, no? (I'm 
> > no expert though :-)).
> 
> Not locks or ping-pongs perhaps, I guess.  So it just sends and
> receives over a socket, implementing both ends of the communication
> in the same process?

Effectively it's this:

signal(SIGALRM, alarm_handler);
...
while (flag) { /* flagged by alarm_handler */
	loops = 90
	open & setup sockets & connection
	while (--loops > 0) {
		write(wr_fd, buf, size);
		read(rd_fd, buf, size);
	}
	close sockets
}

where size comes from this array (advancing in the inner loop one by one):

static int sizes[] = {
        1, 3, 5, 7, 16, 32, 64, 512, 1024, 2048,        /* misc. sizes */
        1, 3, 5, 7, 16, 32, 64, 512, 1024, 2048,
        32, 32, 32, 32, 32, 32,         /* x windows mostly... */
        512, 512, 512, 512, 512,        /* DBMS's mostly */
};

buf sits on the stack and is not initialized (besides reading into it).

...I think the rest is just bogus complexity :-) ...maybe I should just
take the above as the basis for a reduced AIM9 benchmark; it nearly
compiles already.
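
For the record, a rough compilable sketch of such a reduced benchmark
could look like the following.  The 90-iteration count, the sizes[] table
and the uninitialized on-stack buf come from the description above; the
loopback TCP setup, the 10-second alarm and the (mostly omitted) error
handling are assumptions added only to keep the sketch self-contained:

#include <arpa/inet.h>
#include <netinet/in.h>
#include <signal.h>
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

#define NSIZES (sizeof(sizes) / sizeof(sizes[0]))

static const int sizes[] = {
	1, 3, 5, 7, 16, 32, 64, 512, 1024, 2048,	/* misc. sizes */
	1, 3, 5, 7, 16, 32, 64, 512, 1024, 2048,
	32, 32, 32, 32, 32, 32,				/* x windows mostly... */
	512, 512, 512, 512, 512,			/* DBMS's mostly */
};

static volatile sig_atomic_t flag = 1;

static void alarm_handler(int sig)
{
	(void)sig;
	flag = 0;
}

int main(void)
{
	char buf[4096];		/* on the stack, never initialized, as above */
	long conns = 0;

	signal(SIGALRM, alarm_handler);
	alarm(10);		/* run against a deadline, as AIM9 does */

	while (flag) {
		struct sockaddr_in addr;
		socklen_t alen = sizeof(addr);
		int lfd, wr_fd, rd_fd, loops = 90;
		unsigned int i = 0;

		/* "open & setup sockets & connection": both ends of a
		 * loopback TCP connection owned by this single process */
		lfd = socket(AF_INET, SOCK_STREAM, 0);
		memset(&addr, 0, sizeof(addr));
		addr.sin_family = AF_INET;
		addr.sin_addr.s_addr = htonl(INADDR_LOOPBACK);
		addr.sin_port = 0;	/* kernel picks the listening port */
		bind(lfd, (struct sockaddr *)&addr, sizeof(addr));
		listen(lfd, 1);
		getsockname(lfd, (struct sockaddr *)&addr, &alen);

		wr_fd = socket(AF_INET, SOCK_STREAM, 0);
		connect(wr_fd, (struct sockaddr *)&addr, sizeof(addr));
		rd_fd = accept(lfd, NULL, NULL);

		while (--loops > 0) {
			int size = sizes[i++ % NSIZES], got = 0;

			if (write(wr_fd, buf, size) != size)
				break;
			while (got < size) {	/* cope with short reads */
				ssize_t n = read(rd_fd, buf + got, size - got);

				if (n <= 0)
					break;
				got += (int)n;
			}
		}
		close(rd_fd);
		close(wr_fd);
		close(lfd);
		conns++;
	}
	printf("%ld connection setups completed\n", conns);
	return 0;
}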

> If hash chain conflicts do happen for those 2 sockets, just traversing
> the chain 2 entries deep could show up.

No idea on this one.


-- 
 i.

^ permalink raw reply	[flat|nested] 94+ messages in thread

* Re: [tbench regression fixes]: digging out smelly deadmen.
  2008-10-31  9:51                           ` David Miller
  2008-10-31 10:42                             ` Ilpo Järvinen
@ 2008-10-31 10:45                             ` Eric Dumazet
  2008-10-31 11:01                               ` Ilpo Järvinen
  2008-10-31 19:57                               ` Stephen Hemminger
  1 sibling, 2 replies; 94+ messages in thread
From: Eric Dumazet @ 2008-10-31 10:45 UTC (permalink / raw)
  To: David Miller
  Cc: ilpo.jarvinen, shemminger, zbr, rjw, mingo, s0mbre, a.p.zijlstra,
	linux-kernel, netdev, efault, akpm

[-- Attachment #1: Type: text/plain, Size: 1359 bytes --]

David Miller a écrit :
> From: "Ilpo Järvinen" <ilpo.jarvinen@helsinki.fi>
> Date: Fri, 31 Oct 2008 11:40:16 +0200 (EET)
> 
>> Let me remind that it is just a single process, so no ping-pong & other 
>> lock related cache effects should play any significant role here, no? (I'm 
>> no expert though :-)).
> 
> Not locks or ping-pongs perhaps, I guess.  So it just sends and
> receives over a socket, implementing both ends of the communication
> in the same process?
> 
> If hash chain conflicts do happen for those 2 sockets, just traversing
> the chain 2 entries deep could show up.

tbench is very sensitive to cache line ping-pongs (on SMP machines, of course).

Just to prove my point, I coded the following patch and tried it
on an HP BL460c G1. This machine has two quad-core CPUs
(Intel(R) Xeon(R) CPU E5450 @ 3.00GHz).

tbench 8 went from 2240 MB/s to 2310 MB/s with this patch applied.

[PATCH] net: Introduce netif_set_last_rx() helper

On SMP machines, the loopback device (and possibly other net devices)
should try to avoid dirtying the memory cache line containing the
"last_rx" field. Got a 3% increase in tbench on an 8-cpu machine.

Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>
---
 drivers/net/loopback.c    |    2 +-
 include/linux/netdevice.h |   16 ++++++++++++++++
 2 files changed, 17 insertions(+), 1 deletion(-)


[-- Attachment #2: netif_set_last_rx.patch --]
[-- Type: text/plain, Size: 1232 bytes --]

diff --git a/drivers/net/loopback.c b/drivers/net/loopback.c
index 3b43bfd..cf17238 100644
--- a/drivers/net/loopback.c
+++ b/drivers/net/loopback.c
@@ -85,7 +85,7 @@ static int loopback_xmit(struct sk_buff *skb, struct net_device *dev)
 		return 0;
 	}
 #endif
-	dev->last_rx = jiffies;
+	netif_set_last_rx(dev);
 
 	/* it's OK to use per_cpu_ptr() because BHs are off */
 	pcpu_lstats = dev->ml_priv;
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index c8bcb59..6729865 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -849,6 +849,22 @@ static inline void *netdev_priv(const struct net_device *dev)
 #define SET_NETDEV_DEV(net, pdev)	((net)->dev.parent = (pdev))
 
 /**
+ *	netif_set_last_rx - Set last_rx field of a device
+ *	@dev:  network device
+ *
+ * Instead of setting net->last_rx to jiffies, drivers should call this helper
+ * to avoid dirtying a cache line if last_rx already has the current jiffies
+ */
+static inline void netif_set_last_rx(struct net_device *dev)
+{
+#ifdef CONFIG_SMP
+	if (dev->last_rx == jiffies)
+		return;
+#endif
+	dev->last_rx = jiffies;
+}
+
+/**
  *	netif_napi_add - initialize a napi context
  *	@dev:  network device
  *	@napi: napi context

^ permalink raw reply related	[flat|nested] 94+ messages in thread

* Re: [tbench regression fixes]: digging out smelly deadmen.
  2008-10-31 10:45                             ` Eric Dumazet
@ 2008-10-31 11:01                               ` Ilpo Järvinen
  2008-10-31 11:10                                 ` Eric Dumazet
  2008-10-31 19:57                               ` Stephen Hemminger
  1 sibling, 1 reply; 94+ messages in thread
From: Ilpo Järvinen @ 2008-10-31 11:01 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: David Miller, shemminger, zbr, rjw, mingo, s0mbre, a.p.zijlstra,
	LKML, Netdev, efault, Andrew Morton

[-- Attachment #1: Type: TEXT/PLAIN, Size: 837 bytes --]

On Fri, 31 Oct 2008, Eric Dumazet wrote:

> David Miller a écrit :
> > From: "Ilpo Järvinen" <ilpo.jarvinen@helsinki.fi>
> > Date: Fri, 31 Oct 2008 11:40:16 +0200 (EET)
> > 
> > > Let me remind that it is just a single process, so no ping-pong & other
> > > lock related cache effects should play any significant role here, no? (I'm
> > > no expert though :-)).
> > 
> > Not locks or ping-pongs perhaps, I guess.  So it just sends and
> > receives over a socket, implementing both ends of the communication
> > in the same process?
> > 
> > If hash chain conflicts do happen for those 2 sockets, just traversing
> > the chain 2 entries deep could show up.
> 
> tbench is very sensible to cache line ping-pongs (on SMP machines of course)

...Sorry to disappoint you, but we were discussing my AIM9 
tcp_test results there :-).

-- 
 i.

^ permalink raw reply	[flat|nested] 94+ messages in thread

* Re: [tbench regression fixes]: digging out smelly deadmen.
  2008-10-31 11:01                               ` Ilpo Järvinen
@ 2008-10-31 11:10                                 ` Eric Dumazet
  2008-10-31 11:15                                   ` Ilpo Järvinen
  0 siblings, 1 reply; 94+ messages in thread
From: Eric Dumazet @ 2008-10-31 11:10 UTC (permalink / raw)
  To: Ilpo Järvinen
  Cc: David Miller, shemminger, zbr, rjw, mingo, s0mbre, a.p.zijlstra,
	LKML, Netdev, efault, Andrew Morton

Ilpo Järvinen a écrit :
> On Fri, 31 Oct 2008, Eric Dumazet wrote:
> 
>> David Miller a écrit :
>> > From: "Ilpo Järvinen" <ilpo.jarvinen@helsinki.fi>
>> > Date: Fri, 31 Oct 2008 11:40:16 +0200 (EET)
>> >
>> > > Let me remind that it is just a single process, so no ping-pong & other
>> > > lock related cache effects should play any significant role here, no? (I'm
>> > > no expert though :-)).
>> >
>> > Not locks or ping-pongs perhaps, I guess.  So it just sends and
>> > receives over a socket, implementing both ends of the communication
>> > in the same process?
>> >
>> > If hash chain conflicts do happen for those 2 sockets, just traversing
>> > the chain 2 entries deep could show up.
>>
>> tbench is very sensible to cache line ping-pongs (on SMP machines of 
>> course)
> 
> ...Sorry to disappoint you but we were discussion there on my AIM9 
> tcp_test results :-).
> 

Well, before you added AIM9 to this topic, we were focusing on tbench :)

Sorry to disappoint you :)


^ permalink raw reply	[flat|nested] 94+ messages in thread

* Re: [tbench regression fixes]: digging out smelly deadmen.
  2008-10-31 11:10                                 ` Eric Dumazet
@ 2008-10-31 11:15                                   ` Ilpo Järvinen
  0 siblings, 0 replies; 94+ messages in thread
From: Ilpo Järvinen @ 2008-10-31 11:15 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: David Miller, shemminger, zbr, rjw, mingo, s0mbre, a.p.zijlstra,
	LKML, Netdev, efault, Andrew Morton

[-- Attachment #1: Type: TEXT/PLAIN, Size: 1174 bytes --]

On Fri, 31 Oct 2008, Eric Dumazet wrote:

> Ilpo Järvinen a écrit :
> > On Fri, 31 Oct 2008, Eric Dumazet wrote:
> > 
> > > David Miller a écrit :
> > > > From: "Ilpo Järvinen" <ilpo.jarvinen@helsinki.fi>
> > > > Date: Fri, 31 Oct 2008 11:40:16 +0200 (EET)
> > > > 
> > > > > Let me remind that it is just a single process, so no ping-pong & other
> > > > > lock related cache effects should play any significant role here, no?
> > > > > (I'm no expert though :-)).
> > > > 
> > > > Not locks or ping-pongs perhaps, I guess.  So it just sends and
> > > > receives over a socket, implementing both ends of the communication
> > > > in the same process?
> > > > 
> > > > If hash chain conflicts do happen for those 2 sockets, just traversing
> > > > the chain 2 entries deep could show up.
> > >
> > > tbench is very sensible to cache line ping-pongs (on SMP machines of
> > > course)
> > 
> > ...Sorry to disappoint you but we were discussion there on my AIM9 tcp_test
> > results :-).
> > 
> 
> Well, before you added AIM9 on this topic, we were focusing on tbench :)
>
> Sorry to disappoint you :)

It's all Stephen's fault, he added port randomization first... ;-)

-- 
 i.

^ permalink raw reply	[flat|nested] 94+ messages in thread

* Re: [tbench regression fixes]: digging out smelly deadmen.
  2008-10-31 10:45                             ` Eric Dumazet
  2008-10-31 11:01                               ` Ilpo Järvinen
@ 2008-10-31 19:57                               ` Stephen Hemminger
  2008-10-31 20:10                                 ` Evgeniy Polyakov
  1 sibling, 1 reply; 94+ messages in thread
From: Stephen Hemminger @ 2008-10-31 19:57 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: David Miller, ilpo.jarvinen, zbr, rjw, mingo, s0mbre,
	a.p.zijlstra, linux-kernel, netdev, efault, akpm

On Fri, 31 Oct 2008 11:45:33 +0100
Eric Dumazet <dada1@cosmosbay.com> wrote:

> David Miller a écrit :
> > From: "Ilpo Järvinen" <ilpo.jarvinen@helsinki.fi>
> > Date: Fri, 31 Oct 2008 11:40:16 +0200 (EET)
> > 
> >> Let me remind that it is just a single process, so no ping-pong & other 
> >> lock related cache effects should play any significant role here, no? (I'm 
> >> no expert though :-)).
> > 
> > Not locks or ping-pongs perhaps, I guess.  So it just sends and
> > receives over a socket, implementing both ends of the communication
> > in the same process?
> > 
> > If hash chain conflicts do happen for those 2 sockets, just traversing
> > the chain 2 entries deep could show up.
> 
> tbench is very sensible to cache line ping-pongs (on SMP machines of course)
> 
> Just to prove my point, I coded the following patch and tried it
> on a HP BL460c G1. This machine has 2 quad cores cpu 
> (Intel(R) Xeon(R) CPU E5450  @3.00GHz)
> 
> tbench 8 went from 2240 MB/s to 2310 MB/s after this patch applied
> 
> [PATCH] net: Introduce netif_set_last_rx() helper
> 
> On SMP machine, loopback device (and possibly others net device)
> should try to avoid dirty the memory cache line containing "last_rx"
> field. Got 3% increase on tbench on a 8 cpus machine.
> 
> Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>
> ---
>  drivers/net/loopback.c    |    2 +-
>  include/linux/netdevice.h |   16 ++++++++++++++++
>  2 files changed, 17 insertions(+), 1 deletion(-)
> 
> 

Why bother with last_rx at all on loopback?  I have been thinking
we should figure out a way to get rid of last_rx altogether. It only
seems to be used by bonding, and the bonding driver could do the
calculation in its receive handling.

^ permalink raw reply	[flat|nested] 94+ messages in thread

* Re: [tbench regression fixes]: digging out smelly deadmen.
  2008-10-31 19:57                               ` Stephen Hemminger
@ 2008-10-31 20:10                                 ` Evgeniy Polyakov
  2008-10-31 21:03                                   ` Eric Dumazet
  0 siblings, 1 reply; 94+ messages in thread
From: Evgeniy Polyakov @ 2008-10-31 20:10 UTC (permalink / raw)
  To: Stephen Hemminger
  Cc: Eric Dumazet, David Miller, ilpo.jarvinen, rjw, mingo, s0mbre,
	a.p.zijlstra, linux-kernel, netdev, efault, akpm

On Fri, Oct 31, 2008 at 12:57:13PM -0700, Stephen Hemminger (shemminger@vyatta.com) wrote:
> Why bother with last_rx at all on loopback.  I have been thinking
> we should figure out a way to get rid of last_rx all together. It only
> seems to be used by bonding, and the bonding driver could do the calculation
> in its receive handling.

Not related to the regression: the bug will just be papered over by these
changes. Having bonding on loopback is a somewhat strange idea, but still,
this kind of change is an attempt to play well in a bad game: this
loopback-only optimization does not fix the problem.

-- 
	Evgeniy Polyakov

^ permalink raw reply	[flat|nested] 94+ messages in thread

* Re: [tbench regression fixes]: digging out smelly deadmen.
  2008-10-31 20:10                                 ` Evgeniy Polyakov
@ 2008-10-31 21:03                                   ` Eric Dumazet
  2008-10-31 21:18                                     ` Evgeniy Polyakov
  2008-10-31 23:51                                     ` David Miller
  0 siblings, 2 replies; 94+ messages in thread
From: Eric Dumazet @ 2008-10-31 21:03 UTC (permalink / raw)
  To: Evgeniy Polyakov
  Cc: Stephen Hemminger, David Miller, ilpo.jarvinen, rjw, mingo,
	s0mbre, a.p.zijlstra, linux-kernel, netdev, efault, akpm

Evgeniy Polyakov a écrit :
> On Fri, Oct 31, 2008 at 12:57:13PM -0700, Stephen Hemminger (shemminger@vyatta.com) wrote:
>> Why bother with last_rx at all on loopback.  I have been thinking
>> we should figure out a way to get rid of last_rx all together. It only
>> seems to be used by bonding, and the bonding driver could do the calculation
>> in its receive handling.
> 
> Not related to the regression: bug will be just papered out by this
> changes. Having bonding on loopback is somewhat strange idea, but still
> this kind of changes is an attempt to make a good play in the bad game:
> this loopback-only optimization does not fix the problem.
> 

Just to be clear, this change was not meant to be committed.
It was already rejected by David some years ago (in 2005 and 2006):

http://www.mail-archive.com/netdev@vger.kernel.org/msg07382.html

If you read my mail, I was *only* saying that tbench results can be sensitive to
cache line ping-pongs. tbench is a crazy benchmark, and is only a crazy benchmark.

Optimizing Linux for tbench's sake would be.... crazy?


^ permalink raw reply	[flat|nested] 94+ messages in thread

* Re: [tbench regression fixes]: digging out smelly deadmen.
  2008-10-31 21:03                                   ` Eric Dumazet
@ 2008-10-31 21:18                                     ` Evgeniy Polyakov
  2008-10-31 23:51                                     ` David Miller
  1 sibling, 0 replies; 94+ messages in thread
From: Evgeniy Polyakov @ 2008-10-31 21:18 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Stephen Hemminger, David Miller, ilpo.jarvinen, rjw, mingo,
	s0mbre, a.p.zijlstra, linux-kernel, netdev, efault, akpm

Hi Eric.

On Fri, Oct 31, 2008 at 10:03:00PM +0100, Eric Dumazet (dada1@cosmosbay.com) wrote:
> Just to be clear, this change was not meant to be committed.
> It already was rejected by David some years ago (2005, and 2006)
> 
> http://www.mail-archive.com/netdev@vger.kernel.org/msg07382.html
> 
> If you read my mail, I was *only* saying that tbench results can be 
> sensible to
> cache line ping pongs. tbench is a crazy benchmark, and only is a crazy 
> benchmark.

No problem, Eric. I just pointed out that this particular case is rather
fluffy and really does not fix anything. It improves the case, but the
way it does it is not the right one, imho.
We would definitely want to eliminate assignment of global, constantly
updated variables in the paths where it is not required, but in a way
which improves the design and implementation rather than hiding some
other problem.

Tbench is, well, what it is: quite a usual network server :)
The dbench side is rather non-optimized, but it is still quite a common
pattern of small-sized IO. Anyway, optimizing for one kind of workload
tends to force the other side to become slower, so I agree of course
that any narrow-viewed optimizations are bad, and instead we should
search for the error pattern more broadly.

-- 
	Evgeniy Polyakov

^ permalink raw reply	[flat|nested] 94+ messages in thread

* Re: [tbench regression fixes]: digging out smelly deadmen.
  2008-10-31 21:03                                   ` Eric Dumazet
  2008-10-31 21:18                                     ` Evgeniy Polyakov
@ 2008-10-31 23:51                                     ` David Miller
  2008-10-31 23:56                                       ` Stephen Hemminger
  1 sibling, 1 reply; 94+ messages in thread
From: David Miller @ 2008-10-31 23:51 UTC (permalink / raw)
  To: dada1
  Cc: zbr, shemminger, ilpo.jarvinen, rjw, mingo, s0mbre, a.p.zijlstra,
	linux-kernel, netdev, efault, akpm

From: Eric Dumazet <dada1@cosmosbay.com>
Date: Fri, 31 Oct 2008 22:03:00 +0100

> Evgeniy Polyakov a écrit :
> > On Fri, Oct 31, 2008 at 12:57:13PM -0700, Stephen Hemminger (shemminger@vyatta.com) wrote:
> >> Why bother with last_rx at all on loopback.  I have been thinking
> >> we should figure out a way to get rid of last_rx all together. It only
> >> seems to be used by bonding, and the bonding driver could do the calculation
> >> in its receive handling.
> > Not related to the regression: bug will be just papered out by this
> > changes. Having bonding on loopback is somewhat strange idea, but still
> > this kind of changes is an attempt to make a good play in the bad game:
> > this loopback-only optimization does not fix the problem.
> 
> Just to be clear, this change was not meant to be committed.
> It already was rejected by David some years ago (2005, and 2006)
> 
> http://www.mail-archive.com/netdev@vger.kernel.org/msg07382.html

However, I do like Stephen's suggestion that maybe we can get rid of
this ->last_rx thing by encapsulating the logic completely in the
bonding driver.

> If you read my mail, I was *only* saying that tbench results can be sensible to
> cache line ping pongs. tbench is a crazy benchmark, and only is a crazy benchmark.
> 
> Optimizing linux for tbench sake would be .... crazy ?

Unlike dbench, I think tbench is worth cranking up as much as possible.

It doesn't have a huge memory working set; it just writes mostly small
messages over a TCP socket back and forth, and does a lot of blocking.

And I think we'd like all of those operations to run as fast as possible.

When Tridge first wrote tbench I would see the expected things at the
top of the profiles.  Things like tcp_ack(), copy to/from user, and
perhaps SLAB.

Things have changed considerably.


^ permalink raw reply	[flat|nested] 94+ messages in thread

* Re: [tbench regression fixes]: digging out smelly deadmen.
  2008-10-31 23:51                                     ` David Miller
@ 2008-10-31 23:56                                       ` Stephen Hemminger
  2008-11-01  0:16                                         ` Jay Vosburgh
  0 siblings, 1 reply; 94+ messages in thread
From: Stephen Hemminger @ 2008-10-31 23:56 UTC (permalink / raw)
  To: David Miller
  Cc: dada1, zbr, ilpo.jarvinen, rjw, mingo, s0mbre, a.p.zijlstra,
	linux-kernel, netdev, efault, akpm

On Fri, 31 Oct 2008 16:51:44 -0700 (PDT)
David Miller <davem@davemloft.net> wrote:

> From: Eric Dumazet <dada1@cosmosbay.com>
> Date: Fri, 31 Oct 2008 22:03:00 +0100
> 
> > Evgeniy Polyakov a écrit :
> > > On Fri, Oct 31, 2008 at 12:57:13PM -0700, Stephen Hemminger (shemminger@vyatta.com) wrote:
> > >> Why bother with last_rx at all on loopback.  I have been thinking
> > >> we should figure out a way to get rid of last_rx all together. It only
> > >> seems to be used by bonding, and the bonding driver could do the calculation
> > >> in its receive handling.
> > > Not related to the regression: bug will be just papered out by this
> > > changes. Having bonding on loopback is somewhat strange idea, but still
> > > this kind of changes is an attempt to make a good play in the bad game:
> > > this loopback-only optimization does not fix the problem.
> > 
> > Just to be clear, this change was not meant to be committed.
> > It already was rejected by David some years ago (2005, and 2006)
> > 
> > http://www.mail-archive.com/netdev@vger.kernel.org/msg07382.html
> 
> However, I do like Stephen's suggestion that maybe we can get rid of
> this ->last_rx thing by encapsulating the logic completely in the
> bonding driver.

Since the bonding driver doesn't actually see the rx packets, that isn't
really possible.  But it would be possible to change last_rx from a
variable to a function pointer, so that devices could apply other logic
to derive the last value.  One example would be to keep it per cpu and
then take the maximum.
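
A rough sketch of that per-cpu idea (not against any real tree; the
struct and function names are invented for illustration, only the percpu
accessors, jiffies and time_after() are existing kernel interfaces):

#include <linux/cpumask.h>
#include <linux/jiffies.h>
#include <linux/percpu.h>
#include <linux/smp.h>

struct pcpu_last_rx {
	unsigned long __percpu *stamp;	/* from alloc_percpu(unsigned long) */
};

/* rx path: assumes preemption is off (e.g. called from softirq context),
 * so smp_processor_id() is stable; each cpu dirties only its own stamp */
static inline void pcpu_last_rx_note(struct pcpu_last_rx *lr)
{
	unsigned long *p = per_cpu_ptr(lr->stamp, smp_processor_id());

	if (*p != jiffies)		/* don't dirty the line needlessly */
		*p = jiffies;
}

/* reader side (e.g. bonding's ARP monitor): fold every cpu's stamp into
 * the most recent one, paying the reduction cost only when asked */
static inline unsigned long pcpu_last_rx_get(struct pcpu_last_rx *lr)
{
	unsigned long last = 0;
	int cpu;

	for_each_possible_cpu(cpu) {
		unsigned long v = *per_cpu_ptr(lr->stamp, cpu);

		/* time_after() copes with jiffies wraparound */
		if (v && (!last || time_after(v, last)))
			last = v;
	}
	return last;
}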

^ permalink raw reply	[flat|nested] 94+ messages in thread

* Re: [tbench regression fixes]: digging out smelly deadmen.
  2008-10-31 23:56                                       ` Stephen Hemminger
@ 2008-11-01  0:16                                         ` Jay Vosburgh
  2008-11-02  4:40                                           ` David Miller
  0 siblings, 1 reply; 94+ messages in thread
From: Jay Vosburgh @ 2008-11-01  0:16 UTC (permalink / raw)
  To: Stephen Hemminger
  Cc: David Miller, dada1, zbr, ilpo.jarvinen, rjw, mingo, s0mbre,
	a.p.zijlstra, linux-kernel, netdev, efault, akpm

Stephen Hemminger <shemminger@vyatta.com> wrote:

>On Fri, 31 Oct 2008 16:51:44 -0700 (PDT)
>David Miller <davem@davemloft.net> wrote:
[...]
>> However, I do like Stephen's suggestion that maybe we can get rid of
>> this ->last_rx thing by encapsulating the logic completely in the
>> bonding driver.
>
>Since the bonding driver doesn't actually see the rx packets, that isn't
>really possible.  But it would be possible to change last_rx from a variable
>to a function pointer, so that devices could apply other logic to derive
>the last value.  One example would be to keep it per CPU and then take the
>maximum.

	I suspect it could also be tucked away in skb_bond_should_drop,
which is called both by the standard input path and the VLAN accelerated
path to see if the packet should be tossed (e.g., it arrived on an
inactive bonding slave).

	Since last_rx is part of struct net_device, I don't think any
additional bonding internals knowledge would be needed.  It could be
arranged to only update last_rx for devices that are actually bonding
slaves.

	Just off the top of my head (haven't tested this), something
like this:

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index c8bcb59..ed1e58f 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1743,22 +1743,24 @@ static inline int skb_bond_should_drop(struct sk_buff *skb)
 	struct net_device *dev = skb->dev;
 	struct net_device *master = dev->master;
 
-	if (master &&
-	    (dev->priv_flags & IFF_SLAVE_INACTIVE)) {
-		if ((dev->priv_flags & IFF_SLAVE_NEEDARP) &&
-		    skb->protocol == __constant_htons(ETH_P_ARP))
-			return 0;
-
-		if (master->priv_flags & IFF_MASTER_ALB) {
-			if (skb->pkt_type != PACKET_BROADCAST &&
-			    skb->pkt_type != PACKET_MULTICAST)
+	if (master) {
+		dev->last_rx = jiffies;
+		if (dev->priv_flags & IFF_SLAVE_INACTIVE) {
+			if ((dev->priv_flags & IFF_SLAVE_NEEDARP) &&
+			    skb->protocol == __constant_htons(ETH_P_ARP))
 				return 0;
-		}
-		if (master->priv_flags & IFF_MASTER_8023AD &&
-		    skb->protocol == __constant_htons(ETH_P_SLOW))
-			return 0;
 
-		return 1;
+			if (master->priv_flags & IFF_MASTER_ALB) {
+				if (skb->pkt_type != PACKET_BROADCAST &&
+				    skb->pkt_type != PACKET_MULTICAST)
+					return 0;
+			}
+			if (master->priv_flags & IFF_MASTER_8023AD &&
+			    skb->protocol == __constant_htons(ETH_P_SLOW))
+				return 0;
+
+			return 1;
+		}
 	}
 	return 0;
 }


	That doesn't move the storage out of struct net_device, but it
does stop the updates for devices that aren't bonding slaves.  It could
probably be refined further to only update when the ARP monitor is
running (the gizmo that uses last_rx).

	-J

---
	-Jay Vosburgh, IBM Linux Technology Center, fubar@us.ibm.com

^ permalink raw reply related	[flat|nested] 94+ messages in thread

* Re: [tbench regression fixes]: digging out smelly deadmen.
  2008-10-29  9:50                                                     ` Evgeniy Polyakov
@ 2008-11-01 12:51                                                       ` Paolo Ciarrocchi
  0 siblings, 0 replies; 94+ messages in thread
From: Paolo Ciarrocchi @ 2008-11-01 12:51 UTC (permalink / raw)
  To: Evgeniy Polyakov, Ingo Molnar, Peter Zijlstra
  Cc: Mike Galbraith, David Miller, alan, jkosina, akpm, rjw, s0mbre,
	linux-kernel, netdev

On Wed, Oct 29, 2008 at 10:50 AM, Evgeniy Polyakov <zbr@ioremap.net> wrote:
> On Wed, Oct 29, 2008 at 12:14:05PM +0300, Evgeniy Polyakov (zbr@ioremap.net) wrote:
>> vanilla 27    : 347.222
>> no TSO/GSO    : 357.331
>> no hrticks    : 382.983
>> no balance    : 389.802
>> 4403b4 commit : 361.184
>> dirty_ratio-50        : 361.086
>> no-sched-tweaks       : 361.367
>>
>> So, probably, if we revert the -tip merge to vanilla .27, add the nohrtick
>> patch and the nobalance tweak _only_, and apply the naive TSO patch, we
>> could bring the system to 400 MB/s. Note that .22 has 479.82 and .23 has
>> 454.36 MB/s.
>
> And now I have to admit that the very last -tip merge did noticeably
> improve the situation, up to 391.331 MB/s (189 in domains, with tso/gso
> off and the naive tcp_tso_should_defer() change).
>
> So we are now essentially at the level of 24-25 trees in my tests.

That's good, and it makes me wonder whether it would be a good idea to add
some performance numbers to each pull request that affects the core part of
the kernel.

Ciao,
-- 
Paolo
http://paolo.ciarrocchi.googlepages.com/

^ permalink raw reply	[flat|nested] 94+ messages in thread

* Re: [tbench regression fixes]: digging out smelly deadmen.
  2008-11-01  0:16                                         ` Jay Vosburgh
@ 2008-11-02  4:40                                           ` David Miller
  2008-11-04  2:13                                             ` [PATCH net-next-2.6] bonding, net: Move last_rx update into bonding recv logic Jay Vosburgh
  0 siblings, 1 reply; 94+ messages in thread
From: David Miller @ 2008-11-02  4:40 UTC (permalink / raw)
  To: fubar
  Cc: shemminger, dada1, zbr, ilpo.jarvinen, rjw, mingo, s0mbre,
	a.p.zijlstra, linux-kernel, netdev, efault, akpm

From: Jay Vosburgh <fubar@us.ibm.com>
Date: Fri, 31 Oct 2008 17:16:33 -0700

> 	I suspect it could also be tucked away in skb_bond_should_drop,
> which is called both by the standard input path and the VLAN accelerated
> path to see if the packet should be tossed (e.g., it arrived on an
> inactive bonding slave).
> 
> 	Since last_rx is part of struct net_device, I don't think any
> additional bonding internals knowledge would be needed.  It could be
> arranged to only update last_rx for devices that are actually bonding
> slaves.
> 
> 	Just off the top of my head (haven't tested this), something
> like this:
 ...
> 
> 	That doesn't move the storage out of struct net_device, but it
> does stop the updates for devices that aren't bonding slaves.  It could
> probably be refined further to only update when the ARP monitor is
> running (the gizmo that uses last_rx).

I like this very much.

Jay, can you give this a quick test by just trying this patch
and removing the ->last_rx setting in the driver you use for
your test?

Once you do that, I'll apply this to net-next-2.6 and do the
leg work to zap all of the ->last_rx updates from the entire tree.

Thanks!

^ permalink raw reply	[flat|nested] 94+ messages in thread

* [PATCH net-next-2.6] bonding, net: Move last_rx update into bonding recv logic
  2008-11-02  4:40                                           ` David Miller
@ 2008-11-04  2:13                                             ` Jay Vosburgh
  2008-11-04  2:17                                               ` David Miller
  0 siblings, 1 reply; 94+ messages in thread
From: Jay Vosburgh @ 2008-11-04  2:13 UTC (permalink / raw)
  To: David Miller
  Cc: shemminger, dada1, zbr, ilpo.jarvinen, rjw, mingo, s0mbre,
	a.p.zijlstra, linux-kernel, netdev, efault, akpm


	The only user of the net_device->last_rx field is bonding.  This
patch adds a conditional update of last_rx to the bonding special logic
in skb_bond_should_drop, causing last_rx to only be updated when the ARP
monitor is running.

	This frees network device drivers from the necessity of updating
last_rx, which can have cache line thrash issues.

Signed-off-by: Jay Vosburgh <fubar@us.ibm.com>

diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index 56c823c..39575d7 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -4564,6 +4564,8 @@ static int bond_init(struct net_device *bond_dev, struct bond_params *params)
 	bond_dev->tx_queue_len = 0;
 	bond_dev->flags |= IFF_MASTER|IFF_MULTICAST;
 	bond_dev->priv_flags |= IFF_BONDING;
+	if (bond->params.arp_interval)
+		bond_dev->priv_flags |= IFF_MASTER_ARPMON;
 
 	/* At first, we block adding VLANs. That's the only way to
 	 * prevent problems that occur when adding VLANs over an
diff --git a/drivers/net/bonding/bond_sysfs.c b/drivers/net/bonding/bond_sysfs.c
index 296a865..e400d7d 100644
--- a/drivers/net/bonding/bond_sysfs.c
+++ b/drivers/net/bonding/bond_sysfs.c
@@ -620,6 +620,8 @@ static ssize_t bonding_store_arp_interval(struct device *d,
 	       ": %s: Setting ARP monitoring interval to %d.\n",
 	       bond->dev->name, new_value);
 	bond->params.arp_interval = new_value;
+	if (bond->params.arp_interval)
+		bond->dev->priv_flags |= IFF_MASTER_ARPMON;
 	if (bond->params.miimon) {
 		printk(KERN_INFO DRV_NAME
 		       ": %s: ARP monitoring cannot be used with MII monitoring. "
@@ -1039,6 +1041,7 @@ static ssize_t bonding_store_miimon(struct device *d,
 			       "ARP monitoring. Disabling ARP monitoring...\n",
 			       bond->dev->name);
 			bond->params.arp_interval = 0;
+			bond->dev->priv_flags &= ~IFF_MASTER_ARPMON;
 			if (bond->params.arp_validate) {
 				bond_unregister_arp(bond);
 				bond->params.arp_validate =
diff --git a/include/linux/if.h b/include/linux/if.h
index 6524684..2a6e296 100644
--- a/include/linux/if.h
+++ b/include/linux/if.h
@@ -65,6 +65,7 @@
 #define IFF_BONDING	0x20		/* bonding master or slave	*/
 #define IFF_SLAVE_NEEDARP 0x40		/* need ARPs for validation	*/
 #define IFF_ISATAP	0x80		/* ISATAP interface (RFC4214)	*/
+#define IFF_MASTER_ARPMON 0x100		/* bonding master, ARP mon in use */
 
 #define IF_GET_IFACE	0x0001		/* for querying only */
 #define IF_GET_PROTO	0x0002
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 9d77b1d..f1b0dbe 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1742,22 +1742,26 @@ static inline int skb_bond_should_drop(struct sk_buff *skb)
 	struct net_device *dev = skb->dev;
 	struct net_device *master = dev->master;
 
-	if (master &&
-	    (dev->priv_flags & IFF_SLAVE_INACTIVE)) {
-		if ((dev->priv_flags & IFF_SLAVE_NEEDARP) &&
-		    skb->protocol == __constant_htons(ETH_P_ARP))
-			return 0;
-
-		if (master->priv_flags & IFF_MASTER_ALB) {
-			if (skb->pkt_type != PACKET_BROADCAST &&
-			    skb->pkt_type != PACKET_MULTICAST)
+	if (master) {
+		if (master->priv_flags & IFF_MASTER_ARPMON)
+			dev->last_rx = jiffies;
+
+		if (dev->priv_flags & IFF_SLAVE_INACTIVE) {
+			if ((dev->priv_flags & IFF_SLAVE_NEEDARP) &&
+			    skb->protocol == __constant_htons(ETH_P_ARP))
 				return 0;
-		}
-		if (master->priv_flags & IFF_MASTER_8023AD &&
-		    skb->protocol == __constant_htons(ETH_P_SLOW))
-			return 0;
 
-		return 1;
+			if (master->priv_flags & IFF_MASTER_ALB) {
+				if (skb->pkt_type != PACKET_BROADCAST &&
+				    skb->pkt_type != PACKET_MULTICAST)
+					return 0;
+			}
+			if (master->priv_flags & IFF_MASTER_8023AD &&
+			    skb->protocol == __constant_htons(ETH_P_SLOW))
+				return 0;
+
+			return 1;
+		}
 	}
 	return 0;
 }
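
The "cache line thrash" mentioned in the changelog can be illustrated with
a small userspace sketch: several threads doing independent work,
optionally also writing one shared word (standing in for
net_device->last_rx) on every iteration.  The thread and iteration counts
are arbitrary, and shared_last_rx/rx_worker are made-up names; with the
shared write enabled, the cache line holding that word bounces between
CPUs, which is the per-packet cost the patch removes for devices that are
not bonding slaves.

/*
 * Compile with: cc -O2 -pthread thrash.c
 * Compare the two timings printed at the end.
 */
#include <pthread.h>
#include <stdio.h>
#include <time.h>

#define NTHREADS   4
#define ITERATIONS 20000000UL

static volatile unsigned long shared_last_rx;	/* one word, shared by all */
static volatile int update_shared;		/* 0: skip the shared write */

static void *rx_worker(void *arg)
{
	unsigned long local_work = 0;

	for (unsigned long i = 0; i < ITERATIONS; i++) {
		local_work += i;		/* stands in for real rx work */
		if (update_shared)
			shared_last_rx = i;	/* the cache-line ping-pong */
	}
	return (void *)local_work;
}

static double run(int with_shared_write)
{
	pthread_t tid[NTHREADS];
	struct timespec t0, t1;

	update_shared = with_shared_write;
	clock_gettime(CLOCK_MONOTONIC, &t0);
	for (int i = 0; i < NTHREADS; i++)
		pthread_create(&tid[i], NULL, rx_worker, NULL);
	for (int i = 0; i < NTHREADS; i++)
		pthread_join(tid[i], NULL);
	clock_gettime(CLOCK_MONOTONIC, &t1);

	return (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) / 1e9;
}

int main(void)
{
	printf("without shared last_rx write: %.2fs\n", run(0));
	printf("with shared last_rx write:    %.2fs\n", run(1));
	return 0;
}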

^ permalink raw reply related	[flat|nested] 94+ messages in thread

* Re: [PATCH net-next-2.6] bonding, net: Move last_rx update into bonding recv logic
  2008-11-04  2:13                                             ` [PATCH net-next-2.6] bonding, net: Move last_rx update into bonding recv logic Jay Vosburgh
@ 2008-11-04  2:17                                               ` David Miller
  0 siblings, 0 replies; 94+ messages in thread
From: David Miller @ 2008-11-04  2:17 UTC (permalink / raw)
  To: fubar
  Cc: shemminger, dada1, zbr, ilpo.jarvinen, rjw, mingo, s0mbre,
	a.p.zijlstra, linux-kernel, netdev, efault, akpm

From: Jay Vosburgh <fubar@us.ibm.com>
Date: Mon, 03 Nov 2008 18:13:09 -0800

> 
> 	The only user of the net_device->last_rx field is bonding.  This
> patch adds a conditional update of last_rx to the bonding special logic
> in skb_bond_should_drop, causing last_rx to only be updated when the ARP
> monitor is running.
> 
> 	This frees network device drivers from the necessity of updating
> last_rx, which can have cache line thrash issues.
> 
> Signed-off-by: Jay Vosburgh <fubar@us.ibm.com>

Applied, thanks a lot Jay.

^ permalink raw reply	[flat|nested] 94+ messages in thread

end of thread, other threads:[~2008-11-04  2:17 UTC | newest]

Thread overview: 94+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2008-10-09 23:17 [tbench regression fixes]: digging out smelly deadmen Evgeniy Polyakov
2008-10-10  5:40 ` Peter Zijlstra
2008-10-10  8:09 ` Evgeniy Polyakov
2008-10-10  9:15   ` Ingo Molnar
2008-10-10 11:31     ` Evgeniy Polyakov
2008-10-10 11:40       ` Ingo Molnar
2008-10-10 13:25         ` Evgeniy Polyakov
2008-10-10 11:42       ` Ingo Molnar
2008-10-10 11:55         ` Evgeniy Polyakov
2008-10-10 11:57           ` Ingo Molnar
2008-10-24 22:25             ` Rafael J. Wysocki
2008-10-24 23:31               ` David Miller
2008-10-25  4:05                 ` Mike Galbraith
2008-10-25  5:15                   ` David Miller
2008-10-25  5:53                     ` Mike Galbraith
2008-10-25 11:13                 ` Rafael J. Wysocki
2008-10-26  3:55                   ` David Miller
2008-10-26 11:33                     ` Rafael J. Wysocki
2008-10-25  3:37               ` Mike Galbraith
2008-10-25  5:16                 ` David Miller
2008-10-25  5:58                   ` Mike Galbraith
2008-10-25  6:53                     ` Mike Galbraith
2008-10-25  7:24                       ` David Miller
2008-10-25  7:52                         ` Mike Galbraith
2008-10-25 23:10                         ` Jiri Kosina
2008-10-26  8:46                           ` Mike Galbraith
2008-10-26  9:00                             ` Peter Zijlstra
2008-10-26  9:11                               ` Andrew Morton
2008-10-26  9:27                                 ` Evgeniy Polyakov
2008-10-26  9:34                                   ` Andrew Morton
2008-10-26 10:05                                     ` Evgeniy Polyakov
2008-10-27  2:34                                       ` David Miller
2008-10-27  9:30                                         ` Ingo Molnar
2008-10-27  9:57                                           ` David Miller
2008-10-26 10:23                                 ` Mike Galbraith
2008-10-26 19:03                                 ` Jiri Kosina
2008-10-27  9:29                                   ` Mike Galbraith
2008-10-27 10:42                                   ` Jiri Kosina
2008-10-27 11:27                                     ` Ingo Molnar
2008-10-27 11:33                                       ` Alan Cox
2008-10-27 12:06                                         ` Mike Galbraith
2008-10-27 13:42                                           ` Jiri Kosina
2008-10-27 14:17                                             ` Mike Galbraith
2008-10-27 18:33                                         ` Ingo Molnar
2008-10-27 19:39                                           ` Evgeniy Polyakov
2008-10-27 19:48                                             ` David Miller
2008-10-28 10:24                                               ` Mike Galbraith
2008-10-28 10:37                                                 ` Ingo Molnar
2008-10-28 10:57                                                   ` Mike Galbraith
2008-10-28 11:02                                                     ` Ingo Molnar
2008-10-28 14:00                                                     ` Mike Galbraith
2008-10-28 15:22                                                       ` Mike Galbraith
2008-10-29  9:14                                                   ` Evgeniy Polyakov
2008-10-29  9:50                                                     ` Evgeniy Polyakov
2008-11-01 12:51                                                       ` Paolo Ciarrocchi
2008-10-29  9:59                                           ` Nick Piggin
2008-10-26  9:15                               ` Mike Galbraith
2008-10-25  7:19                     ` David Miller
2008-10-25  7:33                       ` Mike Galbraith
2008-10-27 17:26                         ` Rick Jones
2008-10-27 19:11                           ` Mike Galbraith
2008-10-27 19:18                             ` Rick Jones
2008-10-27 19:44                               ` Mike Galbraith
2008-10-26 11:29               ` Evgeniy Polyakov
2008-10-26 12:23                 ` Evgeniy Polyakov
2008-10-30 18:15                   ` Stephen Hemminger
2008-10-30 18:40                     ` Evgeniy Polyakov
2008-10-30 18:43                     ` Eric Dumazet
2008-10-30 18:56                       ` Eric Dumazet
2008-10-30 19:01                     ` Ilpo Järvinen
2008-10-31  7:52                       ` David Miller
2008-10-31  9:40                         ` Ilpo Järvinen
2008-10-31  9:51                           ` David Miller
2008-10-31 10:42                             ` Ilpo Järvinen
2008-10-31 10:45                             ` Eric Dumazet
2008-10-31 11:01                               ` Ilpo Järvinen
2008-10-31 11:10                                 ` Eric Dumazet
2008-10-31 11:15                                   ` Ilpo Järvinen
2008-10-31 19:57                               ` Stephen Hemminger
2008-10-31 20:10                                 ` Evgeniy Polyakov
2008-10-31 21:03                                   ` Eric Dumazet
2008-10-31 21:18                                     ` Evgeniy Polyakov
2008-10-31 23:51                                     ` David Miller
2008-10-31 23:56                                       ` Stephen Hemminger
2008-11-01  0:16                                         ` Jay Vosburgh
2008-11-02  4:40                                           ` David Miller
2008-11-04  2:13                                             ` [PATCH net-next-2.6] bonding, net: Move last_rx update into bonding recv logic Jay Vosburgh
2008-11-04  2:17                                               ` David Miller
2008-10-10 10:13 ` [tbench regression fixes]: digging out smelly deadmen Mike Galbraith
2008-10-11 13:13   ` Evgeniy Polyakov
2008-10-11 14:39     ` Peter Zijlstra
2008-10-11 18:13       ` Mike Galbraith
2008-10-12  6:02         ` Mike Galbraith
2008-10-12  6:33           ` Mike Galbraith

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).