linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* SCHED_FIFO and SCHED_RR broken by cfs
@ 2008-08-16  9:55 Stefani Seibold
  2008-08-16 14:53 ` Peter Zijlstra
  0 siblings, 1 reply; 17+ messages in thread
From: Stefani Seibold @ 2008-08-16  9:55 UTC (permalink / raw)
  To: linux-kernel, mingo

Hi kernel hackers,

it seems that the new completely fair scheduler breaks the SCHED_RR and
SCHED_FIFO realtime scheduler.

In my opinion a high priority real time user process with SCHED_FIFO
should be only interrupted by the kernel or a process with a higher
priority. So a user process running under SCHED_FIFO and priority 99
should never be interrupted by any other process.  This was true under
kernel 2.6.20. 

On my pentium/celeron III/400 MHz system with kernel 2.6.20, a busy loop
using the "time stamp counter" of the x86 cpu for delaying was very
accurate. The max. jitter of the delay was about 5 microseconds.

With the new kernel 2.6.26 the jitter is about 51177 microseconds, or in
other words 51 milliseconds, more than 10000 times greater than with
kernel 2.6.20. This huge latency is far away from realtime.

Below are the results of the attached test program. Maybe somebody else
can confirm these results. All measurements were done with no other
processes running; only the busybox 1.11.1 shell and the init process
were there.

kernel 2.6.20
-------------

reported cpufreq: 398816000 Hz

time chrt -f 99 /tmp/a.out      time chrt -o 0 /tmp/a.out
 average: 0                      average: 0
 min. jitter: 0 usec             min. jitter: 0 usec
 max. jitter: 5 usec             max. jitter: 113 usec
real    0m 5.02s                real    0m 5.02s
user    0m 5.00s                user    0m 5.01s
sys     0m 0.01s                sys     0m 0.01s

kernel 2.6.26
-------------

reported cpufreq: 400000000 Hz

time chrt -f 99 /tmp/a.out      time chrt -o 0 /tmp/a.out
 average: 189                    average: 1
 min. jitter: 0 usec             min. jitter: 0 usec
 max. jitter: 51177 usec         max. jitter: 368 usec
real    0m 5.21s                real    0m 5.03s
user    0m 4.99s                user    0m 5.00s
sys     0m 0.01s                sys     0m 0.02s

I tried the test also on a pentium-m 2,267 GHz notebook with the kernel
2.6.26 and 2.6.27-rc3-git2 and the behavior is the same. The process
started with SCHED_OTHER has less max. jitter than the process started
with SCHED_FIFO.

Below the attached test program and an extract of my kernel 2.6.26
configuration.

<---test program starts here

#include <errno.h>
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <sys/time.h>

#define	USECS	1000000ULL

typedef unsigned long long	u64;

static u64	cpufreq;

/*
 * Determine the CPU clock frequency and store it, in Hz, in the global
 * cpufreq.
 *
 * The value is taken from the first line produced by a shell pipeline that
 * extracts the "cpu MHz" field from /proc/cpuinfo.  The resulting frequency
 * is also printed to stdout.  Failure to start the pipeline, to read a line
 * from it, or to parse a positive number from that line aborts the program,
 * because every later delay calculation depends on cpufreq.
 */
static inline void getCPUfreq(void)
{
	FILE	*file;
	char	buffer[32];
	char	*end;
	double	mhz;

	file=popen("cat /proc/cpuinfo|grep 'cpu MHz'|cut -f2 -d':'","r");

	if (file==NULL) {
		fprintf(stderr,"get cpuinfo failed\n");
		abort();
	}
	if (fgets(buffer,sizeof(buffer),file)==NULL) {
		fprintf(stderr,"read cpuinfo data failed\n");
		abort();
	}
	/*
	 * A stream obtained from popen() must be released with pclose(), not
	 * fclose().  The exit status is not checked: the line we need has
	 * already been read successfully.
	 */
	(void)pclose(file);

	/* strtod() instead of atof() so that unparsable input is detected. */
	errno=0;
	mhz=strtod(buffer,&end);
	if (end==buffer || errno==ERANGE || mhz<=0.0) {
		fprintf(stderr,"parse cpuinfo data failed\n");
		abort();
	}

	/* MHz -> Hz */
	cpufreq=mhz*USECS;

	printf("cpufreq: %llu Hz\n",cpufreq);
}

static inline u64 readtsc(void)
{
	u64 val;
	
	__asm__ __volatile__ ("rdtsc" : "=&A" (val));
	
	return val;
}

/*
 * Busy-wait delay: spin on the time stamp counter until more than the
 * number of ticks corresponding to usec microseconds (derived from the
 * global cpufreq) have passed since entry.  Always returns 0.
 */
static inline int usleep(long usec)
{
	/* Take the reference stamp first, then work out the tick budget. */
	const u64	begin=readtsc();
	const u64	ticks=(cpufreq*usec)/USECS;

	while ((readtsc()-begin)<=ticks)
		;	/* spin */

	return 0;
}

/*
 * Return the difference t1 - t2 as a struct timeval.
 *
 * Both fields are subtracted independently; if the microsecond part of t1
 * is smaller than that of t2, one second is borrowed and USECS is added to
 * the microsecond difference.
 */
static inline struct timeval subtimeval(const struct timeval *t1,const struct timeval *t2)
{
	struct timeval	diff;

	diff.tv_sec =t1->tv_sec -t2->tv_sec;
	diff.tv_usec=t1->tv_usec-t2->tv_usec;

	if (t2->tv_usec>t1->tv_usec) {
		/* borrow one second */
		diff.tv_sec -=1;
		diff.tv_usec+=USECS;
	}
	return diff;
}

/*
 * Measure how accurately the TSC based busy-wait usleep() keeps the
 * requested delay.
 *
 * Steps:
 *  1. read the CPU frequency (getCPUfreq),
 *  2. estimate the cost of the gettimeofday() calls that bracket each
 *     measurement,
 *  3. for delays of 10, 20, ... usec (1000 samples) time usleep() with
 *     gettimeofday() and record by how much the elapsed time exceeds the
 *     requested delay (the jitter),
 *  4. print average, minimum and maximum jitter.
 *
 * Aborts if a delay appears to have returned early by more than the
 * estimated measurement overhead.  Returns 0 on success.
 */
int main(void)
{
	const unsigned long	samples=1000;
	long		elapsed,usec,jitter;
	long		min_jitter=LONG_MAX,max_jitter=0;
	struct timeval	overhead,tv1,tv2,delta;
	unsigned long	cnt=0,sum=0;

	getCPUfreq();

	/* calculate gettimeofday overhead */
	gettimeofday(&tv1,NULL);
	for(cnt=0;cnt<9998;cnt++)
		gettimeofday(&overhead,NULL);
	gettimeofday(&tv2,NULL);

	overhead=subtimeval(&tv2,&tv1);
	/*
	 * cnt is 9998 here, so the divisor is 5000: the result is roughly
	 * the cost of two gettimeofday() calls, i.e. of the pair of calls
	 * placed around each usleep() below.
	 */
	overhead.tv_usec/=cnt/2+1;
	overhead.tv_sec=0;
	printf("gettimeofday() call overhead:%ld\n",(long)overhead.tv_usec);

	/* measure busywait usleep() function */
	for(cnt=1;cnt<=samples;cnt++) {
		usec=cnt*10;

		gettimeofday(&tv1,NULL);
		usleep(usec);
		gettimeofday(&tv2,NULL);

		delta=subtimeval(&tv2,&tv1);
		/* subtract gettimeofday call overhead */
		delta=subtimeval(&delta,&overhead);

		elapsed=delta.tv_sec*USECS+delta.tv_usec;

		if (elapsed<usec) {
			/* usleep early returned? */
			if (elapsed+overhead.tv_usec<usec) {
				/* string literal rejoined: it was split across lines */
				fprintf(stderr,
					"invalid timing - usec: %ld elapsed: %ld\n",
					usec,elapsed);
				abort();
			}
			/* fix if overhead measurement was not accurate enough */
			elapsed=usec;
		}
		jitter=elapsed-usec;

		if (jitter>max_jitter)
			max_jitter=jitter;
		if (jitter<min_jitter)
			min_jitter=jitter;

		sum+=jitter;
	}
	/*
	 * Average over the number of samples actually taken; cnt itself is
	 * samples+1 once the loop has finished.
	 */
	printf("result:\n average:%lu usec\n min. jitter:%ld usec\n"
	       " max. jitter:%ld usec\n",
	       sum/samples,min_jitter,max_jitter);
	return 0;
}

sample source ends here--->

<---extract of my kernel 2.6.26 config
#
# Automatically generated make config: don't edit
# Linux kernel version: 2.6.26
# Fri Jul 25 11:22:39 2008
#
# CONFIG_64BIT is not set
CONFIG_X86_32=y
# CONFIG_X86_64 is not set
CONFIG_X86=y
CONFIG_ARCH_DEFCONFIG="arch/x86/configs/i386_defconfig"
# CONFIG_GENERIC_LOCKBREAK is not set
CONFIG_GENERIC_TIME=y
CONFIG_GENERIC_CMOS_UPDATE=y
CONFIG_CLOCKSOURCE_WATCHDOG=y
CONFIG_GENERIC_CLOCKEVENTS=y
CONFIG_LOCKDEP_SUPPORT=y
CONFIG_STACKTRACE_SUPPORT=y
CONFIG_HAVE_LATENCYTOP_SUPPORT=y
CONFIG_FAST_CMPXCHG_LOCAL=y
CONFIG_MMU=y
CONFIG_ZONE_DMA=y
CONFIG_GENERIC_ISA_DMA=y
CONFIG_GENERIC_IOMAP=y
CONFIG_GENERIC_HWEIGHT=y
# CONFIG_GENERIC_GPIO is not set
CONFIG_ARCH_MAY_HAVE_PC_FDC=y
# CONFIG_RWSEM_GENERIC_SPINLOCK is not set
CONFIG_RWSEM_XCHGADD_ALGORITHM=y
# CONFIG_ARCH_HAS_ILOG2_U32 is not set
# CONFIG_ARCH_HAS_ILOG2_U64 is not set
CONFIG_ARCH_HAS_CPU_IDLE_WAIT=y
CONFIG_GENERIC_CALIBRATE_DELAY=y
# CONFIG_GENERIC_TIME_VSYSCALL is not set
CONFIG_ARCH_HAS_CPU_RELAX=y
CONFIG_ARCH_HAS_CACHE_LINE_SIZE=y
# CONFIG_HAVE_SETUP_PER_CPU_AREA is not set
# CONFIG_HAVE_CPUMASK_OF_CPU_MAP is not set
CONFIG_ARCH_HIBERNATION_POSSIBLE=y
CONFIG_ARCH_SUSPEND_POSSIBLE=y
# CONFIG_ZONE_DMA32 is not set
CONFIG_ARCH_POPULATES_NODE_MAP=y
# CONFIG_AUDIT_ARCH is not set
CONFIG_ARCH_SUPPORTS_AOUT=y
CONFIG_ARCH_SUPPORTS_OPTIMIZED_INLINING=y
CONFIG_GENERIC_HARDIRQS=y
CONFIG_GENERIC_IRQ_PROBE=y
CONFIG_X86_BIOS_REBOOT=y
CONFIG_KTIME_SCALAR=y
CONFIG_DEFCONFIG_LIST="/lib/modules/$UNAME_RELEASE/.config"

#
# General setup
#
CONFIG_EXPERIMENTAL=y
CONFIG_BROKEN_ON_SMP=y
CONFIG_LOCK_KERNEL=y
CONFIG_INIT_ENV_ARG_LIMIT=32
CONFIG_LOCALVERSION=""
# CONFIG_LOCALVERSION_AUTO is not set
# CONFIG_SWAP is not set
CONFIG_SYSVIPC=y
CONFIG_SYSVIPC_SYSCTL=y
CONFIG_POSIX_MQUEUE=y
# CONFIG_BSD_PROCESS_ACCT is not set
# CONFIG_TASKSTATS is not set
# CONFIG_AUDIT is not set
# CONFIG_IKCONFIG is not set
CONFIG_LOG_BUF_SHIFT=16
# CONFIG_CGROUPS is not set
CONFIG_HAVE_UNSTABLE_SCHED_CLOCK=y
# CONFIG_GROUP_SCHED is not set
CONFIG_SYSFS_DEPRECATED=y
CONFIG_SYSFS_DEPRECATED_V2=y
# CONFIG_RELAY is not set
# CONFIG_NAMESPACES is not set
CONFIG_BLK_DEV_INITRD=y
CONFIG_INITRAMFS_SOURCE=""
CONFIG_CC_OPTIMIZE_FOR_SIZE=y
CONFIG_SYSCTL=y
CONFIG_EMBEDDED=y
CONFIG_UID16=y
CONFIG_SYSCTL_SYSCALL=y
CONFIG_SYSCTL_SYSCALL_CHECK=y
# CONFIG_KALLSYMS is not set
CONFIG_HOTPLUG=y
CONFIG_PRINTK=y
# CONFIG_BUG is not set
CONFIG_ELF_CORE=y
# CONFIG_PCSPKR_PLATFORM is not set
CONFIG_COMPAT_BRK=y
CONFIG_BASE_FULL=y
CONFIG_FUTEX=y
CONFIG_ANON_INODES=y
# CONFIG_EPOLL is not set
CONFIG_SIGNALFD=y
CONFIG_TIMERFD=y
CONFIG_EVENTFD=y
# CONFIG_SHMEM is not set
# CONFIG_VM_EVENT_COUNTERS is not set
# CONFIG_SLAB is not set
# CONFIG_SLUB is not set
CONFIG_SLOB=y
# CONFIG_PROFILING is not set
# CONFIG_MARKERS is not set
CONFIG_HAVE_OPROFILE=y
CONFIG_HAVE_KPROBES=y
CONFIG_HAVE_KRETPROBES=y
# CONFIG_HAVE_DMA_ATTRS is not set
# CONFIG_PROC_PAGE_MONITOR is not set
CONFIG_RT_MUTEXES=y
CONFIG_TINY_SHMEM=y
CONFIG_BASE_SMALL=0
CONFIG_MODULES=y
# CONFIG_MODULE_FORCE_LOAD is not set
CONFIG_MODULE_UNLOAD=y
CONFIG_MODULE_FORCE_UNLOAD=y
# CONFIG_MODVERSIONS is not set
# CONFIG_MODULE_SRCVERSION_ALL is not set
# CONFIG_KMOD is not set
CONFIG_BLOCK=y
# CONFIG_LBD is not set
# CONFIG_BLK_DEV_IO_TRACE is not set
# CONFIG_LSF is not set
# CONFIG_BLK_DEV_BSG is not set

#
# IO Schedulers
#
CONFIG_IOSCHED_NOOP=y
# CONFIG_IOSCHED_AS is not set
# CONFIG_IOSCHED_DEADLINE is not set
# CONFIG_IOSCHED_CFQ is not set
# CONFIG_DEFAULT_AS is not set
# CONFIG_DEFAULT_DEADLINE is not set
# CONFIG_DEFAULT_CFQ is not set
CONFIG_DEFAULT_NOOP=y
CONFIG_DEFAULT_IOSCHED="noop"
CONFIG_CLASSIC_RCU=y

#
# Processor type and features
#
CONFIG_TICK_ONESHOT=y
# CONFIG_NO_HZ is not set
CONFIG_HIGH_RES_TIMERS=y
CONFIG_GENERIC_CLOCKEVENTS_BUILD=y
# CONFIG_SMP is not set
CONFIG_X86_PC=y
# CONFIG_X86_ELAN is not set
# CONFIG_X86_VOYAGER is not set
# CONFIG_X86_NUMAQ is not set
# CONFIG_X86_SUMMIT is not set
# CONFIG_X86_BIGSMP is not set
# CONFIG_X86_VISWS is not set
# CONFIG_X86_GENERICARCH is not set
# CONFIG_X86_ES7000 is not set
# CONFIG_X86_RDC321X is not set
# CONFIG_X86_VSMP is not set
CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER=y
# CONFIG_PARAVIRT_GUEST is not set
# CONFIG_M386 is not set
# CONFIG_M486 is not set
# CONFIG_M586 is not set
# CONFIG_M586TSC is not set
# CONFIG_M586MMX is not set
# CONFIG_M686 is not set
# CONFIG_MPENTIUMII is not set
CONFIG_MPENTIUMIII=y
# CONFIG_MPENTIUMM is not set
# CONFIG_MPENTIUM4 is not set
# CONFIG_MK6 is not set
# CONFIG_MK7 is not set
# CONFIG_MK8 is not set
# CONFIG_MCRUSOE is not set
# CONFIG_MEFFICEON is not set
# CONFIG_MWINCHIPC6 is not set
# CONFIG_MWINCHIP2 is not set
# CONFIG_MWINCHIP3D is not set
# CONFIG_MGEODEGX1 is not set
# CONFIG_MGEODE_LX is not set
# CONFIG_MCYRIXIII is not set
# CONFIG_MVIAC3_2 is not set
# CONFIG_MVIAC7 is not set
# CONFIG_MPSC is not set
# CONFIG_MCORE2 is not set
# CONFIG_GENERIC_CPU is not set
# CONFIG_X86_GENERIC is not set
CONFIG_X86_CPU=y
CONFIG_X86_CMPXCHG=y
CONFIG_X86_L1_CACHE_SHIFT=5
CONFIG_X86_XADD=y
CONFIG_X86_WP_WORKS_OK=y
CONFIG_X86_INVLPG=y
CONFIG_X86_BSWAP=y
CONFIG_X86_POPAD_OK=y
CONFIG_X86_GOOD_APIC=y
CONFIG_X86_INTEL_USERCOPY=y
CONFIG_X86_USE_PPRO_CHECKSUM=y
CONFIG_X86_P6_NOP=y
CONFIG_X86_TSC=y
CONFIG_X86_CMOV=y
CONFIG_X86_MINIMUM_CPU_FAMILY=6
CONFIG_X86_DEBUGCTLMSR=y
CONFIG_HPET_TIMER=y
CONFIG_HPET_EMULATE_RTC=y
CONFIG_DMI=y
# CONFIG_IOMMU_HELPER is not set
# CONFIG_PREEMPT_NONE is not set
# CONFIG_PREEMPT_VOLUNTARY is not set
CONFIG_PREEMPT=y
# CONFIG_PREEMPT_RCU is not set
# CONFIG_X86_UP_APIC is not set
# CONFIG_X86_MCE is not set
# CONFIG_VM86 is not set
# CONFIG_TOSHIBA is not set
# CONFIG_I8K is not set
# CONFIG_X86_REBOOTFIXUPS is not set
# CONFIG_MICROCODE is not set
# CONFIG_X86_MSR is not set
# CONFIG_X86_CPUID is not set
CONFIG_NOHIGHMEM=y
# CONFIG_HIGHMEM4G is not set
# CONFIG_HIGHMEM64G is not set
CONFIG_VMSPLIT_3G=y
# CONFIG_VMSPLIT_3G_OPT is not set
# CONFIG_VMSPLIT_2G is not set
# CONFIG_VMSPLIT_2G_OPT is not set
# CONFIG_VMSPLIT_1G is not set
CONFIG_PAGE_OFFSET=0xC0000000
# CONFIG_X86_PAE is not set
CONFIG_ARCH_FLATMEM_ENABLE=y
CONFIG_ARCH_SPARSEMEM_ENABLE=y
CONFIG_ARCH_SELECT_MEMORY_MODEL=y
CONFIG_SELECT_MEMORY_MODEL=y
CONFIG_FLATMEM_MANUAL=y
# CONFIG_DISCONTIGMEM_MANUAL is not set
# CONFIG_SPARSEMEM_MANUAL is not set
CONFIG_FLATMEM=y
CONFIG_FLAT_NODE_MEM_MAP=y
CONFIG_SPARSEMEM_STATIC=y
# CONFIG_SPARSEMEM_VMEMMAP_ENABLE is not set
CONFIG_PAGEFLAGS_EXTENDED=y
CONFIG_SPLIT_PTLOCK_CPUS=4
# CONFIG_RESOURCES_64BIT is not set
CONFIG_ZONE_DMA_FLAG=1
CONFIG_BOUNCE=y
CONFIG_VIRT_TO_BUS=y
# CONFIG_MATH_EMULATION is not set
CONFIG_MTRR=y
# CONFIG_X86_PAT is not set
# CONFIG_EFI is not set
# CONFIG_SECCOMP is not set
# CONFIG_HZ_100 is not set
CONFIG_HZ_250=y
# CONFIG_HZ_300 is not set
# CONFIG_HZ_1000 is not set
CONFIG_HZ=250
CONFIG_SCHED_HRTICK=y
CONFIG_KEXEC=y
CONFIG_PHYSICAL_START=0x100000
# CONFIG_RELOCATABLE is not set
CONFIG_PHYSICAL_ALIGN=0x100000
CONFIG_COMPAT_VDSO=y

#
# Power management options
#
CONFIG_PM=y
# CONFIG_PM_DEBUG is not set
# CONFIG_SUSPEND is not set
CONFIG_ACPI=y
CONFIG_ACPI_PROCFS=y
CONFIG_ACPI_PROCFS_POWER=y
CONFIG_ACPI_SYSFS_POWER=y
CONFIG_ACPI_PROC_EVENT=y
# CONFIG_ACPI_AC is not set
# CONFIG_ACPI_BATTERY is not set
CONFIG_ACPI_BUTTON=y
# CONFIG_ACPI_FAN is not set
# CONFIG_ACPI_DOCK is not set
CONFIG_ACPI_PROCESSOR=y
CONFIG_ACPI_THERMAL=y
# CONFIG_ACPI_WMI is not set
# CONFIG_ACPI_ASUS is not set
# CONFIG_ACPI_TOSHIBA is not set
# CONFIG_ACPI_CUSTOM_DSDT is not set
CONFIG_ACPI_BLACKLIST_YEAR=0
# CONFIG_ACPI_DEBUG is not set
CONFIG_ACPI_EC=y
CONFIG_ACPI_POWER=y
CONFIG_ACPI_SYSTEM=y
# CONFIG_X86_PM_TIMER is not set
# CONFIG_ACPI_CONTAINER is not set
# CONFIG_ACPI_SBS is not set

#
# CPU Frequency scaling
#
# CONFIG_CPU_FREQ is not set
# CONFIG_CPU_IDLE is not set

#
# Bus options (PCI etc.)
#
CONFIG_PCI=y
# CONFIG_PCI_GOBIOS is not set
# CONFIG_PCI_GOMMCONFIG is not set
CONFIG_PCI_GODIRECT=y
# CONFIG_PCI_GOOLPC is not set
# CONFIG_PCI_GOANY is not set
CONFIG_PCI_DIRECT=y
CONFIG_PCI_DOMAINS=y
# CONFIG_PCIEPORTBUS is not set
# CONFIG_ARCH_SUPPORTS_MSI is not set
CONFIG_PCI_LEGACY=y
CONFIG_ISA_DMA_API=y
# CONFIG_ISA is not set
# CONFIG_MCA is not set
# CONFIG_SCx200 is not set
# CONFIG_OLPC is not set
# CONFIG_PCCARD is not set
# CONFIG_HOTPLUG_PCI is not set
end of extract of my kernel config--->



^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: SCHED_FIFO and SCHED_RR broken by cfs
  2008-08-16  9:55 SCHED_FIFO and SCHED_RR broken by cfs Stefani Seibold
@ 2008-08-16 14:53 ` Peter Zijlstra
  2008-08-16 16:26   ` Stefani Seibold
                     ` (2 more replies)
  0 siblings, 3 replies; 17+ messages in thread
From: Peter Zijlstra @ 2008-08-16 14:53 UTC (permalink / raw)
  To: Stefani Seibold; +Cc: linux-kernel, mingo

On Sat, 2008-08-16 at 11:55 +0200, Stefani Seibold wrote:
> Hi kernel hackers,
> 
> it seems that the new completely fair scheduler breaks the SCHED_RR and
> SCHED_FIFO realtime scheduler.
> 
> In my opinion a high priority real time user process with SCHED_FIFO
> should be only interrupted by the kernel or a process with an higher
> priority. So a user process running under SCHED_FIFO and priority 99
> should never be interrupted by any other process.  This was true under
> kernel 2.6.20. 
> 
> On my pentium/celeron III/400 MHz system with kernel 2.6.20 a busy loop
> using the "time stamp counter" of the x86 cpu for delaying, this was
> very accurate. The max. jitter of the delaying was about 5 microseconds.
> 
> With the new kernel 2.6.26 the jitter is about 51177 microseconds or in
> other words 51 milliseconds or more the 10000 times greater than kernel
> 2.6.20. This huge latency is far away from realtime.
> 
> Below are the results of the attached test program. Maybe somebody else
> can confirm this results. All measurements was done with no other
> process running, only the busybox 1.11.1 shell and the init process was
> there.

Has nothing to do with CFS, but everything to do with the fact that we
now have a 95% bandwidth control by default.

Does doing:

echo -1 > /proc/sys/kernel/sched_rt_runtime_us

fix it?

So, up to 95% cpu usage (per sched_rt_period_us) FIFO and RR behave like
they always did, once they cross that line, they'll be throttled.

95% seemed like a sane default in that it leaves a little room to
recover from a run-away rt process (esp handy now that !root users can
also use RT scheduling classes), and should be enough for most
applications as they usually don't consume all that much time.




^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: SCHED_FIFO and SCHED_RR broken by cfs
  2008-08-16 14:53 ` Peter Zijlstra
@ 2008-08-16 16:26   ` Stefani Seibold
  2008-08-16 21:29   ` Stefani Seibold
  2008-08-17 13:04   ` SCHED_FIFO and SCHED_RR broken by cfs Nick Piggin
  2 siblings, 0 replies; 17+ messages in thread
From: Stefani Seibold @ 2008-08-16 16:26 UTC (permalink / raw)
  To: Peter Zijlstra; +Cc: linux-kernel, mingo

I have tried your suggestion on my 2.6.26 pentium notebook. Nothing
changes.

After applying 'echo -1> /proc/sys/kernel/sched_rt_runtime_us' the
SCHED_FIFO jitter is still higher than with SCHED_OTHER.

Here are the results of my notebook 

time chrt -f 99 /tmp/a.out      time chrt -o 0 /tmp/a.out
 average: 212                    average: 13
 min. jitter: 0 usec             min. jitter: 0 usec
 max. jitter: 50013 usec         max. jitter: 33 usec

The kernel was started with init=/bin/bash, so no other process is
running.

Thanx for supporting me.

Am Samstag, den 16.08.2008, 16:53 +0200 schrieb Peter Zijlstra: 
> On Sat, 2008-08-16 at 11:55 +0200, Stefani Seibold wrote:
> > Hi kernel hackers,
> > 
> > it seems that the new completely fair scheduler breaks the SCHED_RR and
> > SCHED_FIFO realtime scheduler.
> > 
> > In my opinion a high priority real time user process with SCHED_FIFO
> > should be only interrupted by the kernel or a process with an higher
> > priority. So a user process running under SCHED_FIFO and priority 99
> > should never be interrupted by any other process.  This was true under
> > kernel 2.6.20. 
> > 
> > On my pentium/celeron III/400 MHz system with kernel 2.6.20 a busy loop
> > using the "time stamp counter" of the x86 cpu for delaying, this was
> > very accurate. The max. jitter of the delaying was about 5 microseconds.
> > 
> > With the new kernel 2.6.26 the jitter is about 51177 microseconds or in
> > other words 51 milliseconds or more the 10000 times greater than kernel
> > 2.6.20. This huge latency is far away from realtime.
> > 
> > Below are the results of the attached test program. Maybe somebody else
> > can confirm this results. All measurements was done with no other
> > process running, only the busybox 1.11.1 shell and the init process was
> > there.
> 
> Has nothing to do with CFS, but everything to do with the fact that we
> now have a 95% bandwidth control by default.
> 
> Does doing:
> 
> echo -1 > /proc/sys/kernel/sched_rt_runtime_us
> 
> fix it?
> 
> So, up to 95% cpu usage (per sched_rt_period_us) FIFO and RR behave like
> they always did, once they cross that line, they'll be throttled.
> 
> 95% seemed like a sane default in that it leaves a little room to
> recover from a run-away rt process (esp handy now that !root users can
> also use RT scheduling classes), and should be enough for most
> applications as they usually don't consume all that much time.
> 
> 
> 


^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: SCHED_FIFO and SCHED_RR broken by cfs
  2008-08-16 14:53 ` Peter Zijlstra
  2008-08-16 16:26   ` Stefani Seibold
@ 2008-08-16 21:29   ` Stefani Seibold
  2008-08-17 22:15     ` Dario Faggioli
  2008-08-17 13:04   ` SCHED_FIFO and SCHED_RR broken by cfs Nick Piggin
  2 siblings, 1 reply; 17+ messages in thread
From: Stefani Seibold @ 2008-08-16 21:29 UTC (permalink / raw)
  To: Peter Zijlstra; +Cc: linux-kernel, mingo

After disabling kernel support for "Group CPU scheduler" and applying
'echo -1 > /proc/sys/kernel/sched_rt_runtime_us' the behaviour is as
expected.

chrt -f 99 ./a.out

 average:13 usec
 min. jitter:0 usec
 max. jitter:29 usec

chrt -o 0 ./a.out 
 average:153 usec
 min. jitter:0 usec
 max. jitter:37035 usec

So the problem is located first in the new sched_rt_runtime_us default
value and second in the "Group CPU scheduler".

A last question: I thought that the kernel will never break user space.
Would it not be better to make the old behaviour the default?

Greetings from Munich/Germany
Stefani

Am Samstag, den 16.08.2008, 16:53 +0200 schrieb Peter Zijlstra:
> On Sat, 2008-08-16 at 11:55 +0200, Stefani Seibold wrote:
> > Hi kernel hackers,
> > 
> > it seems that the new completely fair scheduler breaks the SCHED_RR and
> > SCHED_FIFO realtime scheduler.
> > 
> > In my opinion a high priority real time user process with SCHED_FIFO
> > should be only interrupted by the kernel or a process with an higher
> > priority. So a user process running under SCHED_FIFO and priority 99
> > should never be interrupted by any other process.  This was true under
> > kernel 2.6.20. 
> > 
> > On my pentium/celeron III/400 MHz system with kernel 2.6.20 a busy loop
> > using the "time stamp counter" of the x86 cpu for delaying, this was
> > very accurate. The max. jitter of the delaying was about 5 microseconds.
> > 
> > With the new kernel 2.6.26 the jitter is about 51177 microseconds or in
> > other words 51 milliseconds or more the 10000 times greater than kernel
> > 2.6.20. This huge latency is far away from realtime.
> > 
> > Below are the results of the attached test program. Maybe somebody else
> > can confirm this results. All measurements was done with no other
> > process running, only the busybox 1.11.1 shell and the init process was
> > there.
> 
> Has nothing to do with CFS, but everything to do with the fact that we
> now have a 95% bandwidth control by default.
> 
> Does doing:
> 
> echo -1 > /proc/sys/kernel/sched_rt_runtime_us
> 
> fix it?
> 
> So, up to 95% cpu usage (per sched_rt_period_us) FIFO and RR behave like
> they always did, once they cross that line, they'll be throttled.
> 
> 95% seemed like a sane default in that it leaves a little room to
> recover from a run-away rt process (esp handy now that !root users can
> also use RT scheduling classes), and should be enough for most
> applications as they usually don't consume all that much time.
> 
> 
> 


^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: SCHED_FIFO and SCHED_RR broken by cfs
  2008-08-16 14:53 ` Peter Zijlstra
  2008-08-16 16:26   ` Stefani Seibold
  2008-08-16 21:29   ` Stefani Seibold
@ 2008-08-17 13:04   ` Nick Piggin
  2008-08-18 10:50     ` Peter Zijlstra
  2 siblings, 1 reply; 17+ messages in thread
From: Nick Piggin @ 2008-08-17 13:04 UTC (permalink / raw)
  To: Peter Zijlstra; +Cc: Stefani Seibold, linux-kernel, mingo

On Sunday 17 August 2008 00:53, Peter Zijlstra wrote:
> On Sat, 2008-08-16 at 11:55 +0200, Stefani Seibold wrote:
> > Hi kernel hackers,
> >
> > it seems that the new completely fair scheduler breaks the SCHED_RR and
> > SCHED_FIFO realtime scheduler.
> >
> > In my opinion a high priority real time user process with SCHED_FIFO
> > should be only interrupted by the kernel or a process with an higher
> > priority. So a user process running under SCHED_FIFO and priority 99
> > should never be interrupted by any other process.  This was true under
> > kernel 2.6.20.
> >
> > On my pentium/celeron III/400 MHz system with kernel 2.6.20 a busy loop
> > using the "time stamp counter" of the x86 cpu for delaying, this was
> > very accurate. The max. jitter of the delaying was about 5 microseconds.
> >
> > With the new kernel 2.6.26 the jitter is about 51177 microseconds or in
> > other words 51 milliseconds or more the 10000 times greater than kernel
> > 2.6.20. This huge latency is far away from realtime.
> >
> > Below are the results of the attached test program. Maybe somebody else
> > can confirm this results. All measurements was done with no other
> > process running, only the busybox 1.11.1 shell and the init process was
> > there.
>
> Has nothing to do with CFS, but everything to do with the fact that we
> now have a 95% bandwidth control by default.
>
> Does doing:
>
> echo -1 > /proc/sys/kernel/sched_rt_runtime_us
>
> fix it?
>
> So, up to 95% cpu usage (per sched_rt_period_us) FIFO and RR behave like
> they always did, once they cross that line, they'll be throttled.
>
> 95% seemed like a sane default in that it leaves a little room to
> recover from a run-away rt process (esp handy now that !root users can
> also use RT scheduling classes), and should be enough for most
> applications as they usually don't consume all that much time.

Did it seem sane to break POSIX and backwards compatibility by default?

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: SCHED_FIFO and SCHED_RR broken by cfs
  2008-08-16 21:29   ` Stefani Seibold
@ 2008-08-17 22:15     ` Dario Faggioli
  2008-08-18 10:47       ` [PATCH] sched: rt-bandwidth disable fixes Peter Zijlstra
  0 siblings, 1 reply; 17+ messages in thread
From: Dario Faggioli @ 2008-08-17 22:15 UTC (permalink / raw)
  To: Stefani Seibold; +Cc: Peter Zijlstra, linux-kernel, mingo

[-- Attachment #1: Type: text/plain, Size: 1455 bytes --]

On Sat, 2008-08-16 at 23:29 +0200, Stefani Seibold wrote:
> After disabling kernel support for "Group CPU scheduler" and applying
> 'echo -1 > /proc/sys/kernel/sched_rt_runtime_us' the behaviour is as
> expected.
> 
> chrt -f 99 ./a.out
> 
>  average:13 usec
>  min. jitter:0 usec
>  max. jitter:29 usec
> 
> chrt -o 0 ./a.out 
>  average:153 usec
>  min. jitter:0 usec
>  max. jitter:37035 usec
> 
> So the problem is located first in the new sched_rt_runtime_us default
> value and second in the "Group CPU scheduler".
Well, if you have group scheduling configured I think you should do both
# echo -1 > /proc/sys/kernel/sched_rt_runtime_us
# echo -1 > /dev/cgroup/cpu.rt_runtime_us

if /dev/cgroup is the mount point of the cgroup file system.

In situations like the one you are describing, this worked for me...
Hope that it also helps you! :-)

Regards,
Dario

-- 
<<This happens because I choose it to happen!>>
(Raistlin Majere, DragonLance Chronicles -Dragons of Spring Drawning-)
----------------------------------------------------------------------
Dario Faggioli
GNU/Linux Registered User: #340657
Web: http://www.linux.it/~raistlin
Blog: http://blog.linux.it/raistlin
SIP Account: dario.faggioli@sipproxy.wengo.fr or
             raistlin@ekiga.net
Jabber Account: dario.faggioli@jabber.org/WengoPhone
GnuPG Key ID: 4DC83AC4
GnuPG Key Fingerprint: 2A78 AD5D B9CF A082 0836 08AD 9385 DA04 4DC8 3AC4

[-- Attachment #2: This is a digitally signed message part --]
[-- Type: application/pgp-signature, Size: 189 bytes --]

^ permalink raw reply	[flat|nested] 17+ messages in thread

* [PATCH] sched: rt-bandwidth disable fixes
  2008-08-17 22:15     ` Dario Faggioli
@ 2008-08-18 10:47       ` Peter Zijlstra
  2008-08-18 11:11         ` Peter Zijlstra
  0 siblings, 1 reply; 17+ messages in thread
From: Peter Zijlstra @ 2008-08-18 10:47 UTC (permalink / raw)
  To: Dario Faggioli; +Cc: Stefani Seibold, linux-kernel, mingo

On Mon, 2008-08-18 at 00:15 +0200, Dario Faggioli wrote:
> On Sat, 2008-08-16 at 23:29 +0200, Stefani Seibold wrote:
> > After disabling kernel support for "Group CPU scheduler" and applying
> > 'echo -1 > /proc/sys/kernel/sched_rt_runtime_us' the behaviour is as
> > expected.

> > So the problem is located first in the new sched_rt_runtime_us default
> > value and second in the "Group CPU scheduler".
> Well, if you have group scheduling configured I think you should do both
> # echo -1 > /proc/sys/kernel/sched_rt_runtime_us
> # echo -1 > /dev/cgroup/cpu.rt_runtime_us
> 
> if /dev/cgroup is the mount point of the cgroup file system.
> 
> In situations like the one you are describing, this worked for me...
> Hope that it also helps you! :-)

Ah, right - I knew I was forgetting something..

(compile tested only)

---
Subject: sched: rt-bandwidth disable fixes
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Mon Aug 18 12:39:07 CEST 2008

Currently there is no way to revert to the classical behaviour if
RT_GROUP_SCHED is set. Fix this by introducing rt_bandwidth_enabled(),
which will turn off all the bandwidth accounting if sched_rt_runtime_us
is set to a negative value.

Also fix a bug where we would still increase the used time when the limit
would be set to RUNTIME_INF - causing a long throttle period once it would
be lowered.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
---
 kernel/sched.c    |    9 ++++++++-
 kernel/sched_rt.c |   16 +++++++++-------
 2 files changed, 17 insertions(+), 8 deletions(-)

Index: linux-2.6/kernel/sched.c
===================================================================
--- linux-2.6.orig/kernel/sched.c
+++ linux-2.6/kernel/sched.c
@@ -204,11 +204,13 @@ void init_rt_bandwidth(struct rt_bandwid
 	rt_b->rt_period_timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ;
 }
 
+static inline int rt_bandwidth_enabled(void);
+
 static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
 {
 	ktime_t now;
 
-	if (rt_b->rt_runtime == RUNTIME_INF)
+	if (rt_bandwidth_enabled() && rt_b->rt_runtime == RUNTIME_INF)
 		return;
 
 	if (hrtimer_active(&rt_b->rt_period_timer))
@@ -839,6 +841,11 @@ static inline u64 global_rt_runtime(void
 	return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC;
 }
 
+static inline int rt_bandwidth_enabled(void)
+{
+	return sysctl_sched_rt_runtime >= 0;
+}
+
 #ifndef prepare_arch_switch
 # define prepare_arch_switch(next)	do { } while (0)
 #endif
Index: linux-2.6/kernel/sched_rt.c
===================================================================
--- linux-2.6.orig/kernel/sched_rt.c
+++ linux-2.6/kernel/sched_rt.c
@@ -386,7 +386,7 @@ static int do_sched_rt_period_timer(stru
 	int i, idle = 1;
 	cpumask_t span;
 
-	if (rt_b->rt_runtime == RUNTIME_INF)
+	if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF)
 		return 1;
 
 	span = sched_rt_period_mask();
@@ -438,9 +438,6 @@ static int sched_rt_runtime_exceeded(str
 {
 	u64 runtime = sched_rt_runtime(rt_rq);
 
-	if (runtime == RUNTIME_INF)
-		return 0;
-
 	if (rt_rq->rt_throttled)
 		return rt_rq_throttled(rt_rq);
 
@@ -487,13 +484,18 @@ static void update_curr_rt(struct rq *rq
 	curr->se.exec_start = rq->clock;
 	cpuacct_charge(curr, delta_exec);
 
+	if (!rt_bandwidth_enabled())
+		return;
+
 	for_each_sched_rt_entity(rt_se) {
 		rt_rq = rt_rq_of_se(rt_se);
 
 		spin_lock(&rt_rq->rt_runtime_lock);
-		rt_rq->rt_time += delta_exec;
-		if (sched_rt_runtime_exceeded(rt_rq))
-			resched_task(curr);
+		if (sched_rt_runtime(rt_rq) != RUNTIME_INF) {
+			rt_rq->rt_time += delta_exec;
+			if (sched_rt_runtime_exceeded(rt_rq))
+				resched_task(curr);
+		}
 		spin_unlock(&rt_rq->rt_runtime_lock);
 	}
 }



^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: SCHED_FIFO and SCHED_RR broken by cfs
  2008-08-17 13:04   ` SCHED_FIFO and SCHED_RR broken by cfs Nick Piggin
@ 2008-08-18 10:50     ` Peter Zijlstra
  2008-08-18 10:58       ` Nick Piggin
  0 siblings, 1 reply; 17+ messages in thread
From: Peter Zijlstra @ 2008-08-18 10:50 UTC (permalink / raw)
  To: Nick Piggin; +Cc: Stefani Seibold, linux-kernel, mingo

On Sun, 2008-08-17 at 23:04 +1000, Nick Piggin wrote:
> On Sunday 17 August 2008 00:53, Peter Zijlstra wrote:

> > Has nothing to do with CFS, but everything to do with the fact that we
> > now have a 95% bandwidth control by default.
> >
> > Does doing:
> >
> > echo -1 > /proc/sys/kernel/sched_rt_runtime_us
> >
> > fix it?
> >
> > So, up to 95% cpu usage (per sched_rt_period_us) FIFO and RR behave like
> > they always did, once they cross that line, they'll be throttled.
> >
> > 95% seemed like a sane default in that it leaves a little room to
> > recover from a run-away rt process (esp handy now that !root users can
> > also use RT scheduling classes), and should be enough for most
> > applications as they usually don't consume all that much time.
> 
> Did it seem sane to break POSIX and backwards compatiblity by default?

Up to a point, yes.

There were quite a few complaints that runaway RT tasks could render a
machine unusable - which made 'desktop' usage of the RT class unsafe.

This 95%/1s default allows most RT tasks to run without having to tinker
with the settings, and for those who do need something else, they can
get it too, but will have to turn a knob.

But I guess we could change the default back to unlimited and default to
unsafe if people feel strongly about this.



^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: SCHED_FIFO and SCHED_RR broken by cfs
  2008-08-18 10:50     ` Peter Zijlstra
@ 2008-08-18 10:58       ` Nick Piggin
  2008-08-18 11:09         ` Peter Zijlstra
  0 siblings, 1 reply; 17+ messages in thread
From: Nick Piggin @ 2008-08-18 10:58 UTC (permalink / raw)
  To: Peter Zijlstra, Torvalds, Linus; +Cc: Stefani Seibold, linux-kernel, mingo

On Monday 18 August 2008 20:50, Peter Zijlstra wrote:
> On Sun, 2008-08-17 at 23:04 +1000, Nick Piggin wrote:
> > On Sunday 17 August 2008 00:53, Peter Zijlstra wrote:
> > > Has nothing to do with CFS, but everything to do with the fact that we
> > > now have a 95% bandwidth control by default.
> > >
> > > Does doing:
> > >
> > > echo -1 > /proc/sys/kernel/sched_rt_runtime_us
> > >
> > > fix it?
> > >
> > > So, up to 95% cpu usage (per sched_rt_period_us) FIFO and RR behave
> > > like they always did, once they cross that line, they'll be throttled.
> > >
> > > 95% seemed like a sane default in that it leaves a little room to
> > > recover from a run-away rt process (esp handy now that !root users can
> > > also use RT scheduling classes), and should be enough for most
> > > applications as they usually don't consume all that much time.
> >
> > Did it seem sane to break POSIX and backwards compatiblity by default?
>
> Up to a point, yes.
>
> There were quite a few complaints that runaway RT tasks could render a
> machine unusable - which made 'desktop' usage of the RT class unsafe.

Right, but it is restricted to root, and if the task is run as root
then it can equally break the system in any number of ways. So the
complaints are just wrong.

I have no problems with having some non-default mode to throttle by
default. And we already have the sysrq which can downgrade RT tasks.


> This 95%/1s default allows most RT tasks to run without having to tinker
> with the settings, and for those who do need something else, they can
> get it too, but will have to turn a knob.

And that could also easily cause huge problems for code that does the
*right* thing.


> But I guess we could change the default back to unlimited and default to
> unsafe if people feel strongly about this.

Yes, you can't just break the API like this. Please do fix.

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: SCHED_FIFO and SCHED_RR broken by cfs
  2008-08-18 10:58       ` Nick Piggin
@ 2008-08-18 11:09         ` Peter Zijlstra
  2008-08-18 11:24           ` Nick Piggin
  0 siblings, 1 reply; 17+ messages in thread
From: Peter Zijlstra @ 2008-08-18 11:09 UTC (permalink / raw)
  To: Nick Piggin; +Cc: Torvalds, Linus, Stefani Seibold, linux-kernel, mingo

On Mon, 2008-08-18 at 20:58 +1000, Nick Piggin wrote:
> On Monday 18 August 2008 20:50, Peter Zijlstra wrote:
> > On Sun, 2008-08-17 at 23:04 +1000, Nick Piggin wrote:
> > > On Sunday 17 August 2008 00:53, Peter Zijlstra wrote:
> > > > Has nothing to do with CFS, but everything to do with the fact that we
> > > > now have a 95% bandwidth control by default.
> > > >
> > > > Does doing:
> > > >
> > > > echo -1 > /proc/sys/kernel/sched_rt_runtime_us
> > > >
> > > > fix it?
> > > >
> > > > So, up to 95% cpu usage (per sched_rt_period_us) FIFO and RR behave
> > > > like they always did, once they cross that line, they'll be throttled.
> > > >
> > > > 95% seemed like a sane default in that it leaves a little room to
> > > > recover from a run-away rt process (esp handy now that !root users can
> > > > also use RT scheduling classes), and should be enough for most
> > > > applications as they usually don't consume all that much time.
> > >
> > > Did it seem sane to break POSIX and backwards compatiblity by default?
> >
> > Up to a point, yes.
> >
> > There were quite a few complaints that runaway RT tasks could render a
> > machine unusable - which made 'desktop' usage of the RT class unsafe.
> 
> Right, but it is restricted to root, and if the task is run as root
> then it can equally break the system in any number of ways. So the
> complaints are just wrong.

Not so, we have RLIMIT_RTPRIO and quite a few people using it.

> I have no problems with having some non-default mode to throttle by
> default. And we already have the sysrq which can downgrade RT tasks.

Yeah - except that most distros disable sysrq and not a single desktop
user knows about it.

> > This 95%/1s default allows most RT tasks to run without having to tinker
> > with the settings, and for those who do need something else, they can
> > get it too, but will have to turn a knob.
> 
> And that could also easily cause huge problems for code that does the
> *right* thing.
> 
> 
> > But I guess we could change the default back to unlimited and default to
> > unsafe if people feel strongly about this.
> 
> Yes, you can't just break the API like this. Please do fix.

Sigh - I guess that means all distros will just set a limit in their
init scripts - leaving those above in the same situation.




^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH] sched: rt-bandwidth disable fixes
  2008-08-18 10:47       ` [PATCH] sched: rt-bandwidth disable fixes Peter Zijlstra
@ 2008-08-18 11:11         ` Peter Zijlstra
  0 siblings, 0 replies; 17+ messages in thread
From: Peter Zijlstra @ 2008-08-18 11:11 UTC (permalink / raw)
  To: Dario Faggioli; +Cc: Stefani Seibold, linux-kernel, mingo

On Mon, 2008-08-18 at 12:47 +0200, Peter Zijlstra wrote:
> On Mon, 2008-08-18 at 00:15 +0200, Dario Faggioli wrote:
> > On Sat, 2008-08-16 at 23:29 +0200, Stefani Seibold wrote:
> > > After disabling kernel support for "Group CPU scheduler" and applying
> > > 'echo -1 > /proc/sys/kernel/sched_rt_runtime_us' the behaviour is as
> > > expected.
> 
> > > So the problem is located first in the new sched_rt_runtime_us default
> > > value and second in the "Group CPU scheduler".
> > Well, if you have group scheduling configured I think you should do both
> > # echo -1 > /proc/sys/kernel/sched_rt_runtime_us
> > # echo -1 > /dev/cgroup/cpu.rt_runtime_us
> > 
> > if /dev/cgroup is the mount point of the cgroup file system.
> > 
> > In situations like the one you are describing, this worked for me...
> > Hope that it also helps you! :-)
> 
> Ah, right - I knew I was forgetting something..
> 
> (compile tested only)

Right - I knew I was forgetting something... this patch forgets:

  - to allow tasks to groups that have a 0 limit when bandwidth control
is disabled
  - deal with the trouble that gets us in on enabling it again after
that happens.

So, please skip this patch for now..

> ---
> Subject: sched: rt-bandwidth disable fixes
> From: Peter Zijlstra <a.p.zijlstra@chello.nl>
> Date: Mon Aug 18 12:39:07 CEST 2008
> 
> Currently there is no way to revert to the classical behaviour if
> RT_GROUP_SCHED is set. Fix this by introducing rt_bandwidth_enabled(),
> which will turn off all the bandwidth accounting if sched_rt_runtime_us
> is set to a negative value.
> 
> Also fix a bug where we would still increase the used time when the limit
> would be set to RUNTIME_INF - causing a long throttle period once it would
> be lowered.
> 
> Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
> ---
>  kernel/sched.c    |    9 ++++++++-
>  kernel/sched_rt.c |   16 +++++++++-------
>  2 files changed, 17 insertions(+), 8 deletions(-)
> 
> Index: linux-2.6/kernel/sched.c
> ===================================================================
> --- linux-2.6.orig/kernel/sched.c
> +++ linux-2.6/kernel/sched.c
> @@ -204,11 +204,13 @@ void init_rt_bandwidth(struct rt_bandwid
>  	rt_b->rt_period_timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ;
>  }
>  
> +static inline int rt_bandwidth_enabled(void);
> +
>  static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
>  {
>  	ktime_t now;
>  
> -	if (rt_b->rt_runtime == RUNTIME_INF)
> +	if (rt_bandwidth_enabled() && rt_b->rt_runtime == RUNTIME_INF)
>  		return;
>  
>  	if (hrtimer_active(&rt_b->rt_period_timer))
> @@ -839,6 +841,11 @@ static inline u64 global_rt_runtime(void
>  	return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC;
>  }
>  
> +static inline int rt_bandwidth_enabled(void)
> +{
> +	return sysctl_sched_rt_runtime >= 0;
> +}
> +
>  #ifndef prepare_arch_switch
>  # define prepare_arch_switch(next)	do { } while (0)
>  #endif
> Index: linux-2.6/kernel/sched_rt.c
> ===================================================================
> --- linux-2.6.orig/kernel/sched_rt.c
> +++ linux-2.6/kernel/sched_rt.c
> @@ -386,7 +386,7 @@ static int do_sched_rt_period_timer(stru
>  	int i, idle = 1;
>  	cpumask_t span;
>  
> -	if (rt_b->rt_runtime == RUNTIME_INF)
> +	if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF)
>  		return 1;
>  
>  	span = sched_rt_period_mask();
> @@ -438,9 +438,6 @@ static int sched_rt_runtime_exceeded(str
>  {
>  	u64 runtime = sched_rt_runtime(rt_rq);
>  
> -	if (runtime == RUNTIME_INF)
> -		return 0;
> -
>  	if (rt_rq->rt_throttled)
>  		return rt_rq_throttled(rt_rq);
>  
> @@ -487,13 +484,18 @@ static void update_curr_rt(struct rq *rq
>  	curr->se.exec_start = rq->clock;
>  	cpuacct_charge(curr, delta_exec);
>  
> +	if (!rt_bandwidth_enabled())
> +		return;
> +
>  	for_each_sched_rt_entity(rt_se) {
>  		rt_rq = rt_rq_of_se(rt_se);
>  
>  		spin_lock(&rt_rq->rt_runtime_lock);
> -		rt_rq->rt_time += delta_exec;
> -		if (sched_rt_runtime_exceeded(rt_rq))
> -			resched_task(curr);
> +		if (sched_rt_runtime(rt_rq) != RUNTIME_INF) {
> +			rt_rq->rt_time += delta_exec;
> +			if (sched_rt_runtime_exceeded(rt_rq))
> +				resched_task(curr);
> +		}
>  		spin_unlock(&rt_rq->rt_runtime_lock);
>  	}
>  }
> 
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at  http://www.tux.org/lkml/


^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: SCHED_FIFO and SCHED_RR broken by cfs
  2008-08-18 11:09         ` Peter Zijlstra
@ 2008-08-18 11:24           ` Nick Piggin
  2008-08-18 11:51             ` Peter Zijlstra
  0 siblings, 1 reply; 17+ messages in thread
From: Nick Piggin @ 2008-08-18 11:24 UTC (permalink / raw)
  To: Peter Zijlstra; +Cc: Torvalds, Linus, Stefani Seibold, linux-kernel, mingo

On Monday 18 August 2008 21:09, Peter Zijlstra wrote:
> On Mon, 2008-08-18 at 20:58 +1000, Nick Piggin wrote:
> > On Monday 18 August 2008 20:50, Peter Zijlstra wrote:
> > > On Sun, 2008-08-17 at 23:04 +1000, Nick Piggin wrote:
> > > > On Sunday 17 August 2008 00:53, Peter Zijlstra wrote:
> > > > > Has nothing to do with CFS, but everything to do with the fact that
> > > > > we now have a 95% bandwidth control by default.
> > > > >
> > > > > Does doing:
> > > > >
> > > > > echo -1 > /proc/sys/kernel/sched_rt_runtime_us
> > > > >
> > > > > fix it?
> > > > >
> > > > > So, up to 95% cpu usage (per sched_rt_period_us) FIFO and RR behave
> > > > > like they always did, once they cross that line, they'll be
> > > > > throttled.
> > > > >
> > > > > 95% seemed like a sane default in that it leaves a little room to
> > > > > recover from a run-away rt process (esp handy now that !root users
> > > > > can also use RT scheduling classes), and should be enough for most
> > > > > applications as they usually don't consume all that much time.
> > > >
> > > > Did it seem sane to break POSIX and backwards compatiblity by
> > > > default?
> > >
> > > Up to a point, yes.
> > >
> > > There were quite a few complaints that runaway RT tasks could render a
> > > machine unusable - which made 'desktop' usage of the RT class unsafe.
> >
> > Right, but it is restricted to root, and if the task is run as root
> > then it can equally break the system in any number of ways. So the
> > complaints are just wrong.
>
> Not so, we have RLIMIT_RTPRIO and quite a few people using it.

OK, but it's the same as any privilege granted. You have to be careful
with it.

TBH, it's pretty trivial to write a watchdog process in your RT app.
Actually, many *real* RT applications use one in order to help with
failover/failstop/etc. and I expect _all_ non trivial ones should be
using a watchdog when under development or running in a debugging
mode.


> > I have no problems with having some non-default mode to throttle by
> > default. And we already have the sysrq which can downgrade RT tasks.
>
> Yeah - except that most distros disable sysrq and not a single desktop
> user knows about it.

What are they doing writing RT apps then?


> > > This 95%/1s default allows most RT tasks to run without having to
> > > tinker with the settings, and for those who do need something else,
> > > they can get it too, but will have to turn a knob.
> >
> > And that could also easily cause huge problems for code that does the
> > *right* thing.
> >
> > > But I guess we could change the default back to unlimited and default
> > > to unsafe if people feel strongly about this.
> >
> > Yes, you can't just break the API like this. Please do fix.
>
> Sigh - I guess that means all distros will just set a limit in their
> init scripts - leaving those above in the same situation.

Really, you think the enterprise distros will willingly break POSIX
and their own backwards compatibility by default? I wouldn't have
thought so, but anyway I guess they are free to make that choice, so
where's the problem?

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: SCHED_FIFO and SCHED_RR broken by cfs
  2008-08-18 11:24           ` Nick Piggin
@ 2008-08-18 11:51             ` Peter Zijlstra
  2008-08-18 12:14               ` Nick Piggin
  0 siblings, 1 reply; 17+ messages in thread
From: Peter Zijlstra @ 2008-08-18 11:51 UTC (permalink / raw)
  To: Nick Piggin; +Cc: Torvalds, Linus, Stefani Seibold, linux-kernel, mingo

On Mon, 2008-08-18 at 21:24 +1000, Nick Piggin wrote:

> Really, you think the enterprise distros will willingly break POSIX
> and their own backwards compatiblity by default? I wouldn't have
> thought so, but anyway I guess they are free to make that choice, so
> where's the problem?

I'm not seeing why you're making such a big fuss over this - IMO it's not
such a significant breakage. Esp since very few realtime apps will
require such large amounts of time to ever run into the throttle.

If their usage is 95%+ cpu they must have magic WCET estimates - or like
in this case, be a benchmark app which IMHO just abuses the real-time
class.

It's like running your real-time code on a 5% slower cpu - if it runs
correctly on the 5% slower cpu, it will run correctly here too.

Note that correctness from a RT pov is making your deadline.


^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: SCHED_FIFO and SCHED_RR broken by cfs
  2008-08-18 11:51             ` Peter Zijlstra
@ 2008-08-18 12:14               ` Nick Piggin
  2008-08-18 18:01                 ` Max Krasnyansky
  0 siblings, 1 reply; 17+ messages in thread
From: Nick Piggin @ 2008-08-18 12:14 UTC (permalink / raw)
  To: Peter Zijlstra; +Cc: Torvalds, Linus, Stefani Seibold, linux-kernel, mingo

On Monday 18 August 2008 21:51, Peter Zijlstra wrote:
> On Mon, 2008-08-18 at 21:24 +1000, Nick Piggin wrote:
> > Really, you think the enterprise distros will willingly break POSIX
> > and their own backwards compatiblity by default? I wouldn't have
> > thought so, but anyway I guess they are free to make that choice, so
> > where's the problem?
>
> I'm not seeing why you're making such a big fuss over this - IMO its not
> such a significant breakage. Esp since very few realtime apps will
> require such large amounts of time to ever run into the throttle.
>
> If their usage is 95%+ cpu they must have magic WCET estamates - or like
> in this case, be a benchmark app which IMHO just abuses the real-time
> class.

Note that this certainly does not have to be the case. It is perfectly
valid to dynamically scale the work performed according to the amount
of CPU time available but still be sensitive to latency.

video decoding would be a really simple example. But you can't just
"know" how all RT apps are coded and think this is no problem.


> It's like running your real-time code on a 5% slower cpu - if it runs
> correctly on the 5% slower cpu, it will run correctly here too.

Aside from the latency issue which makes this statement incorrect...
If the code does not run correctly on a 5% slower CPU, it will break.
How is that OK?

You might expect many systems would include at least a 5% margin of
error, but if the kernel takes 5%, then that's 5% of the safety
margin gone, so while the app might "work", it might no longer
meet requirements.


> Note that correctness from a RT pov is making your deadline.

Correctness from the kernel's POV is implementing APIs as advertised,
and just as importantly, not changing them. We can argue about how RT
apps work, but there is no argument that the kernel has broken
backwards compatibility and standards.

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: SCHED_FIFO and SCHED_RR broken by cfs
  2008-08-18 12:14               ` Nick Piggin
@ 2008-08-18 18:01                 ` Max Krasnyansky
  2008-08-18 19:46                   ` Peter Zijlstra
  2008-08-19  7:44                   ` Nick Piggin
  0 siblings, 2 replies; 17+ messages in thread
From: Max Krasnyansky @ 2008-08-18 18:01 UTC (permalink / raw)
  To: Nick Piggin
  Cc: Peter Zijlstra, Torvalds, Linus, Stefani Seibold, linux-kernel, mingo

Nick Piggin wrote:
> On Monday 18 August 2008 21:51, Peter Zijlstra wrote:
>> On Mon, 2008-08-18 at 21:24 +1000, Nick Piggin wrote:
>>> Really, you think the enterprise distros will willingly break POSIX
>>> and their own backwards compatiblity by default? I wouldn't have
>>> thought so, but anyway I guess they are free to make that choice, so
>>> where's the problem?
>> I'm not seeing why you're making such a big fuss over this - IMO its not
>> such a significant breakage. Esp since very few realtime apps will
>> require such large amounts of time to ever run into the throttle.
>>
>> If their usage is 95%+ cpu they must have magic WCET estamates - or like
>> in this case, be a benchmark app which IMHO just abuses the real-time
>> class.
> 
> Note that this certainly does not have to be the case. It is perfectly
> valid to dynamically scale the work performed according to the amount
> of CPU time available but still be sensitive to latency.
> 
> video decoding would be a really simple example. But you can't just
> "know" how all RT apps are coded and think this is no problem.
> 
> 
>> It's like running your real-time code on a 5% slower cpu - if it runs
>> correctly on the 5% slower cpu, it will run correctly here too.
> 
> Aside from the latency issue which makes this statement incorrect...
> If the code does not run correctly on a 5% slower CPU, it will break.
> How is that OK?
> 
> You might expect many systems would include at least a 5% margin of
> error, but if the kernel takes 5%, then that's 5% of the safety
> margin gone, so while the app might "work", it might no longer
> meet requirements.
> 
> 
>> Note that correctness from a RT pov is making your deadline.
> 
> Correctness from the kernel's POV is implementing APIs as advertised,
> and just as importantly, not changing them. We can argue about how RT
> apps work, but there is no argument that the kernel has broken
> backwards compatibility and standards.

Just wanted to mention that I'm with Nick on this one. I pointed this 
(ie POSIX breakage) out as soon as the change went in. I do have a valid 
  (which some people disagree with ;-)) workload that uses 100% of the 
CPU. So my unit-tests caught this right away.

Anyway, "RT bandwidth throttling" has been in and enabled by default 
since 2.6.25. So I'm not sure if it makes sense to revert the default at 
this point.
If we do change the default maybe we can add a CONFIG_ option for this 
so that it can be compiled out completely.

Max





^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: SCHED_FIFO and SCHED_RR broken by cfs
  2008-08-18 18:01                 ` Max Krasnyansky
@ 2008-08-18 19:46                   ` Peter Zijlstra
  2008-08-19  7:44                   ` Nick Piggin
  1 sibling, 0 replies; 17+ messages in thread
From: Peter Zijlstra @ 2008-08-18 19:46 UTC (permalink / raw)
  To: Max Krasnyansky
  Cc: Nick Piggin, Torvalds, Linus, Stefani Seibold, linux-kernel, mingo

On Mon, 2008-08-18 at 11:01 -0700, Max Krasnyansky wrote:

> Just wanted to mention that I'm with Nick on this one. I pointed this 
> (ie POSIX breakage) out as soon as the change went in. I do have a valid 
>   (which some people disagree with ;-)) workload that uses 100% of the 
> CPU. So my unit-tests caught this right away.
> 
> Anyway, "RT bandwidth throttling" has been in and enabled be default 
> since 2.6.25. So I'm not sure if it makes sense to revert the default at 
> this point.

Already working on patches to make it so..

The patch to change the default is simple enough - but I spotted a few
other bugs while poking at the issue Dario pointed out.


^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: SCHED_FIFO and SCHED_RR broken by cfs
  2008-08-18 18:01                 ` Max Krasnyansky
  2008-08-18 19:46                   ` Peter Zijlstra
@ 2008-08-19  7:44                   ` Nick Piggin
  1 sibling, 0 replies; 17+ messages in thread
From: Nick Piggin @ 2008-08-19  7:44 UTC (permalink / raw)
  To: Max Krasnyansky
  Cc: Peter Zijlstra, Torvalds, Linus, Stefani Seibold, linux-kernel, mingo

On Tuesday 19 August 2008 04:01, Max Krasnyansky wrote:
> Nick Piggin wrote:
> > On Monday 18 August 2008 21:51, Peter Zijlstra wrote:

> > Correctness from the kernel's POV is implementing APIs as advertised,
> > and just as importantly, not changing them. We can argue about how RT
> > apps work, but there is no argument that the kernel has broken
> > backwards compatibility and standards.
>
> Just wanted to mention that I'm with Nick on this one. I pointed this
> (ie POSIX breakage) out as soon as the change went in. I do have a valid
>   (which some people disagree with ;-)) workload that uses 100% of the
> CPU. So my unit-tests caught this right away.

Ouch. Yep, so much for making assumptions about how apps will use the
API.


> Anyway, "RT bandwidth throttling" has been in and enabled be default
> since 2.6.25. So I'm not sure if it makes sense to revert the default at
> this point.
> If we do change the default maybe we can add a CONFIG_ option for this
> so that it can be compiled out completely.

It definitely does. Most serious users deploying real realtime code
will not be close to 2.6.25. If we leave it until they start complaining,
the problem will be much bigger because then we'll have sets of people
that rely on both behaviours.

Please fix this Peter.

^ permalink raw reply	[flat|nested] 17+ messages in thread

end of thread, other threads:[~2008-08-19  7:44 UTC | newest]

Thread overview: 17+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2008-08-16  9:55 SCHED_FIFO and SCHED_RR broken by cfs Stefani Seibold
2008-08-16 14:53 ` Peter Zijlstra
2008-08-16 16:26   ` Stefani Seibold
2008-08-16 21:29   ` Stefani Seibold
2008-08-17 22:15     ` Dario Faggioli
2008-08-18 10:47       ` [PATCH] sched: rt-bandwidth disable fixes Peter Zijlstra
2008-08-18 11:11         ` Peter Zijlstra
2008-08-17 13:04   ` SCHED_FIFO and SCHED_RR broken by cfs Nick Piggin
2008-08-18 10:50     ` Peter Zijlstra
2008-08-18 10:58       ` Nick Piggin
2008-08-18 11:09         ` Peter Zijlstra
2008-08-18 11:24           ` Nick Piggin
2008-08-18 11:51             ` Peter Zijlstra
2008-08-18 12:14               ` Nick Piggin
2008-08-18 18:01                 ` Max Krasnyansky
2008-08-18 19:46                   ` Peter Zijlstra
2008-08-19  7:44                   ` Nick Piggin

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).