* Network slowdown due to CFS
@ 2007-09-26  8:52 Martin Michlmayr
  2007-09-26  9:34 ` Ingo Molnar
                   ` (2 more replies)
  0 siblings, 3 replies; 71+ messages in thread
From: Martin Michlmayr @ 2007-09-26  8:52 UTC (permalink / raw)
  To: Ingo Molnar, Srivatsa Vaddagiri; +Cc: linux-kernel

[-- Attachment #1: Type: text/plain, Size: 805 bytes --]

I noticed that my network performance has gone down from 2.6.22
from   [  3]  0.0-10.0 sec    113 MBytes  95.0 Mbits/sec
to     [  3]  0.0-10.0 sec   75.7 MBytes  63.3 Mbits/sec
with 2.6.23-rc1 (and 2.6.23-rc8), as measured with iperf.

I did a git bisect today and tracked it back to the commit where CFS
was enabled ("sched: cfs core code; apply the CFS core code",
commit dd41f596cda0d7d6e4a8b139ffdfabcefdd46528).  I also compiled a
kernel from
git://git.kernel.org/pub/scm/linux/kernel/git/mingo/linux-2.6-sched-devel.git
but things don't improve.

This is on a Thecus N2100, an ARM (Intel IOP32x) based storage device
with a r8169 card, SATA disks and 512 MB RAM.  My config is attached.

What kind of information can I supply so you can track this down?
-- 
Martin Michlmayr
http://www.cyrius.com/

[-- Attachment #2: .config --]
[-- Type: text/plain, Size: 31150 bytes --]

#
# Automatically generated make config: don't edit
# Linux kernel version: 2.6.22
# Wed Sep 26 07:56:08 2007
#
CONFIG_ARM=y
CONFIG_SYS_SUPPORTS_APM_EMULATION=y
# CONFIG_GENERIC_GPIO is not set
# CONFIG_GENERIC_TIME is not set
# CONFIG_GENERIC_CLOCKEVENTS is not set
CONFIG_MMU=y
# CONFIG_NO_IOPORT is not set
CONFIG_GENERIC_HARDIRQS=y
CONFIG_STACKTRACE_SUPPORT=y
CONFIG_LOCKDEP_SUPPORT=y
CONFIG_TRACE_IRQFLAGS_SUPPORT=y
CONFIG_HARDIRQS_SW_RESEND=y
CONFIG_GENERIC_IRQ_PROBE=y
CONFIG_RWSEM_GENERIC_SPINLOCK=y
# CONFIG_ARCH_HAS_ILOG2_U32 is not set
# CONFIG_ARCH_HAS_ILOG2_U64 is not set
CONFIG_GENERIC_HWEIGHT=y
CONFIG_GENERIC_CALIBRATE_DELAY=y
CONFIG_ZONE_DMA=y
CONFIG_VECTORS_BASE=0xffff0000
CONFIG_DEFCONFIG_LIST="/lib/modules/$UNAME_RELEASE/.config"

#
# Code maturity level options
#
CONFIG_EXPERIMENTAL=y
CONFIG_BROKEN_ON_SMP=y
CONFIG_INIT_ENV_ARG_LIMIT=32

#
# General setup
#
CONFIG_LOCALVERSION=""
CONFIG_LOCALVERSION_AUTO=y
CONFIG_SWAP=y
CONFIG_SYSVIPC=y
# CONFIG_IPC_NS is not set
CONFIG_SYSVIPC_SYSCTL=y
# CONFIG_POSIX_MQUEUE is not set
CONFIG_BSD_PROCESS_ACCT=y
# CONFIG_BSD_PROCESS_ACCT_V3 is not set
# CONFIG_TASKSTATS is not set
# CONFIG_UTS_NS is not set
# CONFIG_AUDIT is not set
# CONFIG_IKCONFIG is not set
CONFIG_LOG_BUF_SHIFT=14
CONFIG_SYSFS_DEPRECATED=y
# CONFIG_RELAY is not set
CONFIG_BLK_DEV_INITRD=y
CONFIG_INITRAMFS_SOURCE=""
CONFIG_CC_OPTIMIZE_FOR_SIZE=y
CONFIG_SYSCTL=y
# CONFIG_EMBEDDED is not set
CONFIG_UID16=y
CONFIG_SYSCTL_SYSCALL=y
CONFIG_KALLSYMS=y
CONFIG_KALLSYMS_ALL=y
# CONFIG_KALLSYMS_EXTRA_PASS is not set
CONFIG_HOTPLUG=y
CONFIG_PRINTK=y
CONFIG_BUG=y
CONFIG_ELF_CORE=y
CONFIG_BASE_FULL=y
CONFIG_FUTEX=y
CONFIG_ANON_INODES=y
CONFIG_EPOLL=y
CONFIG_SIGNALFD=y
CONFIG_TIMERFD=y
CONFIG_EVENTFD=y
CONFIG_SHMEM=y
CONFIG_VM_EVENT_COUNTERS=y
CONFIG_SLAB=y
# CONFIG_SLUB is not set
# CONFIG_SLOB is not set
CONFIG_RT_MUTEXES=y
# CONFIG_TINY_SHMEM is not set
CONFIG_BASE_SMALL=0

#
# Loadable module support
#
CONFIG_MODULES=y
CONFIG_MODULE_UNLOAD=y
# CONFIG_MODULE_FORCE_UNLOAD is not set
# CONFIG_MODVERSIONS is not set
# CONFIG_MODULE_SRCVERSION_ALL is not set
CONFIG_KMOD=y

#
# Block layer
#
CONFIG_BLOCK=y
# CONFIG_LBD is not set
# CONFIG_BLK_DEV_IO_TRACE is not set
# CONFIG_LSF is not set

#
# IO Schedulers
#
CONFIG_IOSCHED_NOOP=y
CONFIG_IOSCHED_AS=y
CONFIG_IOSCHED_DEADLINE=y
CONFIG_IOSCHED_CFQ=y
# CONFIG_DEFAULT_AS is not set
# CONFIG_DEFAULT_DEADLINE is not set
CONFIG_DEFAULT_CFQ=y
# CONFIG_DEFAULT_NOOP is not set
CONFIG_DEFAULT_IOSCHED="cfq"

#
# System Type
#
# CONFIG_ARCH_AAEC2000 is not set
# CONFIG_ARCH_INTEGRATOR is not set
# CONFIG_ARCH_REALVIEW is not set
# CONFIG_ARCH_VERSATILE is not set
# CONFIG_ARCH_AT91 is not set
# CONFIG_ARCH_CLPS7500 is not set
# CONFIG_ARCH_CLPS711X is not set
# CONFIG_ARCH_CO285 is not set
# CONFIG_ARCH_EBSA110 is not set
# CONFIG_ARCH_EP93XX is not set
# CONFIG_ARCH_FOOTBRIDGE is not set
# CONFIG_ARCH_NETX is not set
# CONFIG_ARCH_H720X is not set
# CONFIG_ARCH_IMX is not set
# CONFIG_ARCH_IOP13XX is not set
CONFIG_ARCH_IOP32X=y
# CONFIG_ARCH_IOP33X is not set
# CONFIG_ARCH_IXP23XX is not set
# CONFIG_ARCH_IXP2000 is not set
# CONFIG_ARCH_IXP4XX is not set
# CONFIG_ARCH_L7200 is not set
# CONFIG_ARCH_KS8695 is not set
# CONFIG_ARCH_NS9XXX is not set
# CONFIG_ARCH_PNX4008 is not set
# CONFIG_ARCH_PXA is not set
# CONFIG_ARCH_RPC is not set
# CONFIG_ARCH_SA1100 is not set
# CONFIG_ARCH_S3C2410 is not set
# CONFIG_ARCH_SHARK is not set
# CONFIG_ARCH_LH7A40X is not set
# CONFIG_ARCH_DAVINCI is not set
# CONFIG_ARCH_OMAP is not set

#
# IOP32x Implementation Options
#

#
# IOP32x Platform Types
#
CONFIG_MACH_EP80219=y
CONFIG_MACH_GLANTANK=y
CONFIG_ARCH_IQ80321=y
CONFIG_ARCH_IQ31244=y
CONFIG_MACH_N2100=y
CONFIG_IOP3XX_ATU=y
CONFIG_PLAT_IOP=y

#
# Processor Type
#
CONFIG_CPU_32=y
CONFIG_CPU_XSCALE=y
CONFIG_CPU_32v5=y
CONFIG_CPU_ABRT_EV5T=y
CONFIG_CPU_CACHE_VIVT=y
CONFIG_CPU_TLB_V4WBI=y
CONFIG_CPU_CP15=y
CONFIG_CPU_CP15_MMU=y

#
# Processor Features
#
# CONFIG_ARM_THUMB is not set
# CONFIG_CPU_DCACHE_DISABLE is not set
# CONFIG_OUTER_CACHE is not set
# CONFIG_IWMMXT is not set
CONFIG_XSCALE_PMU=y

#
# Bus support
#
CONFIG_PCI=y
# CONFIG_ARCH_SUPPORTS_MSI is not set
# CONFIG_PCI_DEBUG is not set

#
# PCCARD (PCMCIA/CardBus) support
#
# CONFIG_PCCARD is not set

#
# Kernel Features
#
# CONFIG_TICK_ONESHOT is not set
# CONFIG_PREEMPT is not set
# CONFIG_NO_IDLE_HZ is not set
CONFIG_HZ=100
# CONFIG_AEABI is not set
# CONFIG_ARCH_DISCONTIGMEM_ENABLE is not set
CONFIG_SELECT_MEMORY_MODEL=y
CONFIG_FLATMEM_MANUAL=y
# CONFIG_DISCONTIGMEM_MANUAL is not set
# CONFIG_SPARSEMEM_MANUAL is not set
CONFIG_FLATMEM=y
CONFIG_FLAT_NODE_MEM_MAP=y
# CONFIG_SPARSEMEM_STATIC is not set
CONFIG_SPLIT_PTLOCK_CPUS=4096
# CONFIG_RESOURCES_64BIT is not set
CONFIG_ZONE_DMA_FLAG=1
CONFIG_ALIGNMENT_TRAP=y

#
# Boot options
#
CONFIG_ZBOOT_ROM_TEXT=0x0
CONFIG_ZBOOT_ROM_BSS=0x0
CONFIG_CMDLINE="console=ttyS0,115200 root=/dev/nfs ip=bootp cachepolicy=writealloc"
# CONFIG_XIP_KERNEL is not set
# CONFIG_KEXEC is not set

#
# Floating point emulation
#

#
# At least one emulation must be selected
#
CONFIG_FPE_NWFPE=y
# CONFIG_FPE_NWFPE_XP is not set
# CONFIG_FPE_FASTFPE is not set

#
# Userspace binary formats
#
CONFIG_BINFMT_ELF=y
CONFIG_BINFMT_AOUT=y
# CONFIG_BINFMT_MISC is not set
# CONFIG_ARTHUR is not set

#
# Power management options
#
# CONFIG_PM is not set

#
# Networking
#
CONFIG_NET=y

#
# Networking options
#
CONFIG_PACKET=y
CONFIG_PACKET_MMAP=y
CONFIG_UNIX=y
CONFIG_XFRM=y
# CONFIG_XFRM_USER is not set
# CONFIG_XFRM_SUB_POLICY is not set
# CONFIG_XFRM_MIGRATE is not set
# CONFIG_NET_KEY is not set
CONFIG_INET=y
CONFIG_IP_MULTICAST=y
# CONFIG_IP_ADVANCED_ROUTER is not set
CONFIG_IP_FIB_HASH=y
CONFIG_IP_PNP=y
# CONFIG_IP_PNP_DHCP is not set
CONFIG_IP_PNP_BOOTP=y
# CONFIG_IP_PNP_RARP is not set
# CONFIG_NET_IPIP is not set
# CONFIG_NET_IPGRE is not set
# CONFIG_IP_MROUTE is not set
# CONFIG_ARPD is not set
# CONFIG_SYN_COOKIES is not set
# CONFIG_INET_AH is not set
# CONFIG_INET_ESP is not set
# CONFIG_INET_IPCOMP is not set
# CONFIG_INET_XFRM_TUNNEL is not set
# CONFIG_INET_TUNNEL is not set
CONFIG_INET_XFRM_MODE_TRANSPORT=y
CONFIG_INET_XFRM_MODE_TUNNEL=y
CONFIG_INET_XFRM_MODE_BEET=y
CONFIG_INET_DIAG=y
CONFIG_INET_TCP_DIAG=y
# CONFIG_TCP_CONG_ADVANCED is not set
CONFIG_TCP_CONG_CUBIC=y
CONFIG_DEFAULT_TCP_CONG="cubic"
# CONFIG_TCP_MD5SIG is not set
CONFIG_IPV6=y
# CONFIG_IPV6_PRIVACY is not set
# CONFIG_IPV6_ROUTER_PREF is not set
# CONFIG_IPV6_OPTIMISTIC_DAD is not set
# CONFIG_INET6_AH is not set
# CONFIG_INET6_ESP is not set
# CONFIG_INET6_IPCOMP is not set
# CONFIG_IPV6_MIP6 is not set
# CONFIG_INET6_XFRM_TUNNEL is not set
# CONFIG_INET6_TUNNEL is not set
# CONFIG_INET6_XFRM_MODE_TRANSPORT is not set
# CONFIG_INET6_XFRM_MODE_TUNNEL is not set
# CONFIG_INET6_XFRM_MODE_BEET is not set
# CONFIG_INET6_XFRM_MODE_ROUTEOPTIMIZATION is not set
# CONFIG_IPV6_SIT is not set
# CONFIG_IPV6_TUNNEL is not set
# CONFIG_IPV6_MULTIPLE_TABLES is not set
# CONFIG_NETWORK_SECMARK is not set
# CONFIG_NETFILTER is not set
# CONFIG_IP_DCCP is not set
# CONFIG_IP_SCTP is not set
# CONFIG_TIPC is not set
# CONFIG_ATM is not set
# CONFIG_BRIDGE is not set
# CONFIG_VLAN_8021Q is not set
# CONFIG_DECNET is not set
# CONFIG_LLC2 is not set
# CONFIG_IPX is not set
# CONFIG_ATALK is not set
# CONFIG_X25 is not set
# CONFIG_LAPB is not set
# CONFIG_ECONET is not set
# CONFIG_WAN_ROUTER is not set

#
# QoS and/or fair queueing
#
# CONFIG_NET_SCHED is not set

#
# Network testing
#
# CONFIG_NET_PKTGEN is not set
# CONFIG_HAMRADIO is not set
# CONFIG_IRDA is not set
# CONFIG_BT is not set
# CONFIG_AF_RXRPC is not set

#
# Wireless
#
# CONFIG_CFG80211 is not set
# CONFIG_WIRELESS_EXT is not set
# CONFIG_MAC80211 is not set
# CONFIG_IEEE80211 is not set
# CONFIG_RFKILL is not set

#
# Device Drivers
#

#
# Generic Driver Options
#
CONFIG_STANDALONE=y
CONFIG_PREVENT_FIRMWARE_BUILD=y
# CONFIG_FW_LOADER is not set
# CONFIG_DEBUG_DRIVER is not set
# CONFIG_DEBUG_DEVRES is not set
# CONFIG_SYS_HYPERVISOR is not set

#
# Connector - unified userspace <-> kernelspace linker
#
# CONFIG_CONNECTOR is not set
CONFIG_MTD=y
# CONFIG_MTD_DEBUG is not set
# CONFIG_MTD_CONCAT is not set
CONFIG_MTD_PARTITIONS=y
CONFIG_MTD_REDBOOT_PARTS=y
CONFIG_MTD_REDBOOT_DIRECTORY_BLOCK=-1
CONFIG_MTD_REDBOOT_PARTS_UNALLOCATED=y
CONFIG_MTD_REDBOOT_PARTS_READONLY=y
# CONFIG_MTD_CMDLINE_PARTS is not set
# CONFIG_MTD_AFS_PARTS is not set

#
# User Modules And Translation Layers
#
CONFIG_MTD_CHAR=y
CONFIG_MTD_BLKDEVS=y
CONFIG_MTD_BLOCK=y
# CONFIG_FTL is not set
# CONFIG_NFTL is not set
# CONFIG_INFTL is not set
# CONFIG_RFD_FTL is not set
# CONFIG_SSFDC is not set

#
# RAM/ROM/Flash chip drivers
#
CONFIG_MTD_CFI=y
# CONFIG_MTD_JEDECPROBE is not set
CONFIG_MTD_GEN_PROBE=y
# CONFIG_MTD_CFI_ADV_OPTIONS is not set
CONFIG_MTD_MAP_BANK_WIDTH_1=y
CONFIG_MTD_MAP_BANK_WIDTH_2=y
CONFIG_MTD_MAP_BANK_WIDTH_4=y
# CONFIG_MTD_MAP_BANK_WIDTH_8 is not set
# CONFIG_MTD_MAP_BANK_WIDTH_16 is not set
# CONFIG_MTD_MAP_BANK_WIDTH_32 is not set
CONFIG_MTD_CFI_I1=y
CONFIG_MTD_CFI_I2=y
# CONFIG_MTD_CFI_I4 is not set
# CONFIG_MTD_CFI_I8 is not set
CONFIG_MTD_CFI_INTELEXT=y
# CONFIG_MTD_CFI_AMDSTD is not set
# CONFIG_MTD_CFI_STAA is not set
CONFIG_MTD_CFI_UTIL=y
# CONFIG_MTD_RAM is not set
# CONFIG_MTD_ROM is not set
# CONFIG_MTD_ABSENT is not set

#
# Mapping drivers for chip access
#
# CONFIG_MTD_COMPLEX_MAPPINGS is not set
CONFIG_MTD_PHYSMAP=y
CONFIG_MTD_PHYSMAP_START=0x0
CONFIG_MTD_PHYSMAP_LEN=0x0
CONFIG_MTD_PHYSMAP_BANKWIDTH=1
# CONFIG_MTD_ARM_INTEGRATOR is not set
# CONFIG_MTD_PLATRAM is not set

#
# Self-contained MTD device drivers
#
# CONFIG_MTD_PMC551 is not set
# CONFIG_MTD_SLRAM is not set
# CONFIG_MTD_PHRAM is not set
# CONFIG_MTD_MTDRAM is not set
# CONFIG_MTD_BLOCK2MTD is not set

#
# Disk-On-Chip Device Drivers
#
# CONFIG_MTD_DOC2000 is not set
# CONFIG_MTD_DOC2001 is not set
# CONFIG_MTD_DOC2001PLUS is not set
# CONFIG_MTD_NAND is not set
# CONFIG_MTD_ONENAND is not set

#
# UBI - Unsorted block images
#
# CONFIG_MTD_UBI is not set

#
# Parallel port support
#
# CONFIG_PARPORT is not set

#
# Plug and Play support
#
# CONFIG_PNPACPI is not set

#
# Block devices
#
# CONFIG_BLK_CPQ_DA is not set
# CONFIG_BLK_CPQ_CISS_DA is not set
# CONFIG_BLK_DEV_DAC960 is not set
# CONFIG_BLK_DEV_UMEM is not set
# CONFIG_BLK_DEV_COW_COMMON is not set
CONFIG_BLK_DEV_LOOP=y
# CONFIG_BLK_DEV_CRYPTOLOOP is not set
CONFIG_BLK_DEV_NBD=y
# CONFIG_BLK_DEV_SX8 is not set
# CONFIG_BLK_DEV_UB is not set
CONFIG_BLK_DEV_RAM=y
CONFIG_BLK_DEV_RAM_COUNT=16
CONFIG_BLK_DEV_RAM_SIZE=8192
CONFIG_BLK_DEV_RAM_BLOCKSIZE=1024
# CONFIG_CDROM_PKTCDVD is not set
# CONFIG_ATA_OVER_ETH is not set
# CONFIG_IDE is not set

#
# SCSI device support
#
# CONFIG_RAID_ATTRS is not set
CONFIG_SCSI=y
# CONFIG_SCSI_TGT is not set
# CONFIG_SCSI_NETLINK is not set
CONFIG_SCSI_PROC_FS=y

#
# SCSI support type (disk, tape, CD-ROM)
#
CONFIG_BLK_DEV_SD=y
# CONFIG_CHR_DEV_ST is not set
# CONFIG_CHR_DEV_OSST is not set
# CONFIG_BLK_DEV_SR is not set
CONFIG_CHR_DEV_SG=y
# CONFIG_CHR_DEV_SCH is not set

#
# Some SCSI devices (e.g. CD jukebox) support multiple LUNs
#
# CONFIG_SCSI_MULTI_LUN is not set
# CONFIG_SCSI_CONSTANTS is not set
# CONFIG_SCSI_LOGGING is not set
# CONFIG_SCSI_SCAN_ASYNC is not set
CONFIG_SCSI_WAIT_SCAN=m

#
# SCSI Transports
#
# CONFIG_SCSI_SPI_ATTRS is not set
# CONFIG_SCSI_FC_ATTRS is not set
# CONFIG_SCSI_ISCSI_ATTRS is not set
# CONFIG_SCSI_SAS_ATTRS is not set
# CONFIG_SCSI_SAS_LIBSAS is not set

#
# SCSI low-level drivers
#
# CONFIG_ISCSI_TCP is not set
# CONFIG_BLK_DEV_3W_XXXX_RAID is not set
# CONFIG_SCSI_3W_9XXX is not set
# CONFIG_SCSI_ACARD is not set
# CONFIG_SCSI_AACRAID is not set
# CONFIG_SCSI_AIC7XXX is not set
# CONFIG_SCSI_AIC7XXX_OLD is not set
# CONFIG_SCSI_AIC79XX is not set
# CONFIG_SCSI_AIC94XX is not set
# CONFIG_SCSI_DPT_I2O is not set
# CONFIG_SCSI_ARCMSR is not set
# CONFIG_MEGARAID_NEWGEN is not set
# CONFIG_MEGARAID_LEGACY is not set
# CONFIG_MEGARAID_SAS is not set
# CONFIG_SCSI_HPTIOP is not set
# CONFIG_SCSI_DMX3191D is not set
# CONFIG_SCSI_FUTURE_DOMAIN is not set
# CONFIG_SCSI_IPS is not set
# CONFIG_SCSI_INITIO is not set
# CONFIG_SCSI_INIA100 is not set
# CONFIG_SCSI_STEX is not set
# CONFIG_SCSI_SYM53C8XX_2 is not set
# CONFIG_SCSI_IPR is not set
# CONFIG_SCSI_QLOGIC_1280 is not set
# CONFIG_SCSI_QLA_FC is not set
# CONFIG_SCSI_QLA_ISCSI is not set
# CONFIG_SCSI_LPFC is not set
# CONFIG_SCSI_DC395x is not set
# CONFIG_SCSI_DC390T is not set
# CONFIG_SCSI_NSP32 is not set
# CONFIG_SCSI_DEBUG is not set
# CONFIG_SCSI_SRP is not set
CONFIG_ATA=y
# CONFIG_ATA_NONSTANDARD is not set
# CONFIG_SATA_AHCI is not set
# CONFIG_SATA_SVW is not set
# CONFIG_ATA_PIIX is not set
# CONFIG_SATA_MV is not set
# CONFIG_SATA_NV is not set
# CONFIG_PDC_ADMA is not set
# CONFIG_SATA_QSTOR is not set
# CONFIG_SATA_PROMISE is not set
# CONFIG_SATA_SX4 is not set
CONFIG_SATA_SIL=y
# CONFIG_SATA_SIL24 is not set
# CONFIG_SATA_SIS is not set
# CONFIG_SATA_ULI is not set
# CONFIG_SATA_VIA is not set
CONFIG_SATA_VITESSE=y
# CONFIG_SATA_INIC162X is not set
# CONFIG_PATA_ALI is not set
# CONFIG_PATA_AMD is not set
# CONFIG_PATA_ARTOP is not set
# CONFIG_PATA_ATIIXP is not set
# CONFIG_PATA_CMD640_PCI is not set
# CONFIG_PATA_CMD64X is not set
# CONFIG_PATA_CS5520 is not set
# CONFIG_PATA_CS5530 is not set
# CONFIG_PATA_CYPRESS is not set
# CONFIG_PATA_EFAR is not set
# CONFIG_ATA_GENERIC is not set
# CONFIG_PATA_HPT366 is not set
# CONFIG_PATA_HPT37X is not set
# CONFIG_PATA_HPT3X2N is not set
# CONFIG_PATA_HPT3X3 is not set
# CONFIG_PATA_IT821X is not set
# CONFIG_PATA_IT8213 is not set
# CONFIG_PATA_JMICRON is not set
# CONFIG_PATA_TRIFLEX is not set
# CONFIG_PATA_MARVELL is not set
# CONFIG_PATA_MPIIX is not set
# CONFIG_PATA_OLDPIIX is not set
# CONFIG_PATA_NETCELL is not set
# CONFIG_PATA_NS87410 is not set
# CONFIG_PATA_OPTI is not set
# CONFIG_PATA_OPTIDMA is not set
# CONFIG_PATA_PDC_OLD is not set
# CONFIG_PATA_RADISYS is not set
# CONFIG_PATA_RZ1000 is not set
# CONFIG_PATA_SC1200 is not set
# CONFIG_PATA_SERVERWORKS is not set
# CONFIG_PATA_PDC2027X is not set
# CONFIG_PATA_SIL680 is not set
# CONFIG_PATA_SIS is not set
# CONFIG_PATA_VIA is not set
# CONFIG_PATA_WINBOND is not set

#
# Multi-device support (RAID and LVM)
#
CONFIG_MD=y
CONFIG_BLK_DEV_MD=y
# CONFIG_MD_LINEAR is not set
CONFIG_MD_RAID0=y
CONFIG_MD_RAID1=y
CONFIG_MD_RAID10=y
CONFIG_MD_RAID456=y
# CONFIG_MD_RAID5_RESHAPE is not set
# CONFIG_MD_MULTIPATH is not set
# CONFIG_MD_FAULTY is not set
CONFIG_BLK_DEV_DM=y
# CONFIG_DM_DEBUG is not set
# CONFIG_DM_CRYPT is not set
# CONFIG_DM_SNAPSHOT is not set
# CONFIG_DM_MIRROR is not set
# CONFIG_DM_ZERO is not set
# CONFIG_DM_MULTIPATH is not set
# CONFIG_DM_DELAY is not set

#
# Fusion MPT device support
#
# CONFIG_FUSION is not set
# CONFIG_FUSION_SPI is not set
# CONFIG_FUSION_FC is not set
# CONFIG_FUSION_SAS is not set

#
# IEEE 1394 (FireWire) support
#
# CONFIG_FIREWIRE is not set
# CONFIG_IEEE1394 is not set

#
# I2O device support
#
# CONFIG_I2O is not set

#
# Network device support
#
CONFIG_NETDEVICES=y
# CONFIG_DUMMY is not set
# CONFIG_BONDING is not set
# CONFIG_EQUALIZER is not set
# CONFIG_TUN is not set
# CONFIG_ARCNET is not set
# CONFIG_PHYLIB is not set

#
# Ethernet (10 or 100Mbit)
#
CONFIG_NET_ETHERNET=y
CONFIG_MII=y
# CONFIG_HAPPYMEAL is not set
# CONFIG_SUNGEM is not set
# CONFIG_CASSINI is not set
# CONFIG_NET_VENDOR_3COM is not set
# CONFIG_SMC91X is not set
# CONFIG_DM9000 is not set

#
# Tulip family network device support
#
# CONFIG_NET_TULIP is not set
# CONFIG_HP100 is not set
CONFIG_NET_PCI=y
# CONFIG_PCNET32 is not set
# CONFIG_AMD8111_ETH is not set
# CONFIG_ADAPTEC_STARFIRE is not set
# CONFIG_B44 is not set
# CONFIG_FORCEDETH is not set
# CONFIG_DGRS is not set
# CONFIG_EEPRO100 is not set
CONFIG_E100=y
# CONFIG_FEALNX is not set
# CONFIG_NATSEMI is not set
# CONFIG_NE2K_PCI is not set
# CONFIG_8139CP is not set
# CONFIG_8139TOO is not set
# CONFIG_SIS900 is not set
# CONFIG_EPIC100 is not set
# CONFIG_SUNDANCE is not set
# CONFIG_TLAN is not set
# CONFIG_VIA_RHINE is not set
# CONFIG_SC92031 is not set
CONFIG_NETDEV_1000=y
# CONFIG_ACENIC is not set
# CONFIG_DL2K is not set
CONFIG_E1000=y
CONFIG_E1000_NAPI=y
# CONFIG_E1000_DISABLE_PACKET_SPLIT is not set
# CONFIG_NS83820 is not set
# CONFIG_HAMACHI is not set
# CONFIG_YELLOWFIN is not set
CONFIG_R8169=y
# CONFIG_R8169_NAPI is not set
# CONFIG_SIS190 is not set
# CONFIG_SKGE is not set
# CONFIG_SKY2 is not set
# CONFIG_SK98LIN is not set
# CONFIG_VIA_VELOCITY is not set
# CONFIG_TIGON3 is not set
# CONFIG_BNX2 is not set
# CONFIG_QLA3XXX is not set
# CONFIG_ATL1 is not set
CONFIG_NETDEV_10000=y
# CONFIG_CHELSIO_T1 is not set
# CONFIG_CHELSIO_T3 is not set
# CONFIG_IXGB is not set
# CONFIG_S2IO is not set
# CONFIG_MYRI10GE is not set
# CONFIG_NETXEN_NIC is not set
# CONFIG_MLX4_CORE is not set
# CONFIG_TR is not set

#
# Wireless LAN
#
# CONFIG_WLAN_PRE80211 is not set
# CONFIG_WLAN_80211 is not set

#
# USB Network Adapters
#
# CONFIG_USB_CATC is not set
# CONFIG_USB_KAWETH is not set
# CONFIG_USB_PEGASUS is not set
# CONFIG_USB_RTL8150 is not set
# CONFIG_USB_USBNET_MII is not set
# CONFIG_USB_USBNET is not set
# CONFIG_WAN is not set
# CONFIG_FDDI is not set
# CONFIG_HIPPI is not set
# CONFIG_PPP is not set
# CONFIG_SLIP is not set
# CONFIG_NET_FC is not set
# CONFIG_SHAPER is not set
# CONFIG_NETCONSOLE is not set
# CONFIG_NETPOLL is not set
# CONFIG_NET_POLL_CONTROLLER is not set

#
# ISDN subsystem
#
# CONFIG_ISDN is not set

#
# Input device support
#
CONFIG_INPUT=y
# CONFIG_INPUT_FF_MEMLESS is not set
# CONFIG_INPUT_POLLDEV is not set

#
# Userland interfaces
#
CONFIG_INPUT_MOUSEDEV=y
# CONFIG_INPUT_MOUSEDEV_PSAUX is not set
CONFIG_INPUT_MOUSEDEV_SCREEN_X=1024
CONFIG_INPUT_MOUSEDEV_SCREEN_Y=768
# CONFIG_INPUT_JOYDEV is not set
# CONFIG_INPUT_TSDEV is not set
# CONFIG_INPUT_EVDEV is not set
# CONFIG_INPUT_EVBUG is not set

#
# Input Device Drivers
#
# CONFIG_INPUT_KEYBOARD is not set
# CONFIG_INPUT_MOUSE is not set
# CONFIG_INPUT_JOYSTICK is not set
# CONFIG_INPUT_TABLET is not set
# CONFIG_INPUT_TOUCHSCREEN is not set
# CONFIG_INPUT_MISC is not set

#
# Hardware I/O ports
#
# CONFIG_SERIO is not set
# CONFIG_GAMEPORT is not set

#
# Character devices
#
CONFIG_VT=y
CONFIG_VT_CONSOLE=y
CONFIG_HW_CONSOLE=y
# CONFIG_VT_HW_CONSOLE_BINDING is not set
# CONFIG_SERIAL_NONSTANDARD is not set

#
# Serial drivers
#
CONFIG_SERIAL_8250=y
CONFIG_SERIAL_8250_CONSOLE=y
CONFIG_SERIAL_8250_PCI=y
CONFIG_SERIAL_8250_NR_UARTS=4
CONFIG_SERIAL_8250_RUNTIME_UARTS=4
# CONFIG_SERIAL_8250_EXTENDED is not set

#
# Non-8250 serial port support
#
CONFIG_SERIAL_CORE=y
CONFIG_SERIAL_CORE_CONSOLE=y
# CONFIG_SERIAL_JSM is not set
CONFIG_UNIX98_PTYS=y
CONFIG_LEGACY_PTYS=y
CONFIG_LEGACY_PTY_COUNT=256

#
# IPMI
#
# CONFIG_IPMI_HANDLER is not set
# CONFIG_WATCHDOG is not set
CONFIG_HW_RANDOM=y
# CONFIG_NVRAM is not set
# CONFIG_R3964 is not set
# CONFIG_APPLICOM is not set
# CONFIG_DRM is not set
# CONFIG_RAW_DRIVER is not set

#
# TPM devices
#
# CONFIG_TCG_TPM is not set
CONFIG_DEVPORT=y
CONFIG_I2C=y
CONFIG_I2C_BOARDINFO=y
CONFIG_I2C_CHARDEV=y

#
# I2C Algorithms
#
# CONFIG_I2C_ALGOBIT is not set
# CONFIG_I2C_ALGOPCF is not set
# CONFIG_I2C_ALGOPCA is not set

#
# I2C Hardware Bus support
#
# CONFIG_I2C_ALI1535 is not set
# CONFIG_I2C_ALI1563 is not set
# CONFIG_I2C_ALI15X3 is not set
# CONFIG_I2C_AMD756 is not set
# CONFIG_I2C_AMD8111 is not set
# CONFIG_I2C_I801 is not set
# CONFIG_I2C_I810 is not set
# CONFIG_I2C_PIIX4 is not set
CONFIG_I2C_IOP3XX=y
# CONFIG_I2C_NFORCE2 is not set
# CONFIG_I2C_OCORES is not set
# CONFIG_I2C_PARPORT_LIGHT is not set
# CONFIG_I2C_PROSAVAGE is not set
# CONFIG_I2C_SAVAGE4 is not set
# CONFIG_I2C_SIMTEC is not set
# CONFIG_I2C_SIS5595 is not set
# CONFIG_I2C_SIS630 is not set
# CONFIG_I2C_SIS96X is not set
# CONFIG_I2C_STUB is not set
# CONFIG_I2C_TINY_USB is not set
# CONFIG_I2C_VIA is not set
# CONFIG_I2C_VIAPRO is not set
# CONFIG_I2C_VOODOO3 is not set

#
# Miscellaneous I2C Chip support
#
# CONFIG_SENSORS_DS1337 is not set
# CONFIG_SENSORS_DS1374 is not set
# CONFIG_SENSORS_EEPROM is not set
# CONFIG_SENSORS_PCF8574 is not set
# CONFIG_SENSORS_PCA9539 is not set
# CONFIG_SENSORS_PCF8591 is not set
# CONFIG_SENSORS_MAX6875 is not set
# CONFIG_I2C_DEBUG_CORE is not set
# CONFIG_I2C_DEBUG_ALGO is not set
# CONFIG_I2C_DEBUG_BUS is not set
# CONFIG_I2C_DEBUG_CHIP is not set

#
# SPI support
#
# CONFIG_SPI is not set
# CONFIG_SPI_MASTER is not set

#
# Dallas's 1-wire bus
#
# CONFIG_W1 is not set
CONFIG_HWMON=y
# CONFIG_HWMON_VID is not set
# CONFIG_SENSORS_ABITUGURU is not set
# CONFIG_SENSORS_AD7418 is not set
# CONFIG_SENSORS_ADM1021 is not set
# CONFIG_SENSORS_ADM1025 is not set
# CONFIG_SENSORS_ADM1026 is not set
# CONFIG_SENSORS_ADM1029 is not set
# CONFIG_SENSORS_ADM1031 is not set
# CONFIG_SENSORS_ADM9240 is not set
# CONFIG_SENSORS_ASB100 is not set
# CONFIG_SENSORS_ATXP1 is not set
# CONFIG_SENSORS_DS1621 is not set
# CONFIG_SENSORS_F71805F is not set
# CONFIG_SENSORS_FSCHER is not set
# CONFIG_SENSORS_FSCPOS is not set
# CONFIG_SENSORS_GL518SM is not set
# CONFIG_SENSORS_GL520SM is not set
# CONFIG_SENSORS_IT87 is not set
# CONFIG_SENSORS_LM63 is not set
# CONFIG_SENSORS_LM75 is not set
# CONFIG_SENSORS_LM77 is not set
# CONFIG_SENSORS_LM78 is not set
# CONFIG_SENSORS_LM80 is not set
# CONFIG_SENSORS_LM83 is not set
# CONFIG_SENSORS_LM85 is not set
# CONFIG_SENSORS_LM87 is not set
# CONFIG_SENSORS_LM90 is not set
# CONFIG_SENSORS_LM92 is not set
# CONFIG_SENSORS_MAX1619 is not set
# CONFIG_SENSORS_MAX6650 is not set
# CONFIG_SENSORS_PC87360 is not set
# CONFIG_SENSORS_PC87427 is not set
# CONFIG_SENSORS_SIS5595 is not set
# CONFIG_SENSORS_SMSC47M1 is not set
# CONFIG_SENSORS_SMSC47M192 is not set
# CONFIG_SENSORS_SMSC47B397 is not set
# CONFIG_SENSORS_VIA686A is not set
# CONFIG_SENSORS_VT1211 is not set
# CONFIG_SENSORS_VT8231 is not set
# CONFIG_SENSORS_W83781D is not set
# CONFIG_SENSORS_W83791D is not set
# CONFIG_SENSORS_W83792D is not set
# CONFIG_SENSORS_W83793 is not set
# CONFIG_SENSORS_W83L785TS is not set
# CONFIG_SENSORS_W83627HF is not set
# CONFIG_SENSORS_W83627EHF is not set
# CONFIG_HWMON_DEBUG_CHIP is not set

#
# Misc devices
#
# CONFIG_PHANTOM is not set
# CONFIG_SGI_IOC4 is not set
# CONFIG_TIFM_CORE is not set

#
# Multifunction device drivers
#
# CONFIG_MFD_SM501 is not set

#
# LED devices
#
# CONFIG_NEW_LEDS is not set

#
# LED drivers
#

#
# LED Triggers
#

#
# Multimedia devices
#
# CONFIG_VIDEO_DEV is not set
# CONFIG_DVB_CORE is not set
CONFIG_DAB=y
# CONFIG_USB_DABUSB is not set

#
# Graphics support
#
# CONFIG_BACKLIGHT_LCD_SUPPORT is not set

#
# Display device support
#
# CONFIG_DISPLAY_SUPPORT is not set
# CONFIG_VGASTATE is not set
# CONFIG_FB is not set

#
# Console display driver support
#
# CONFIG_VGA_CONSOLE is not set
CONFIG_DUMMY_CONSOLE=y

#
# Sound
#
# CONFIG_SOUND is not set

#
# HID Devices
#
CONFIG_HID=y
# CONFIG_HID_DEBUG is not set

#
# USB Input Devices
#
# CONFIG_USB_HID is not set

#
# USB HID Boot Protocol drivers
#
# CONFIG_USB_KBD is not set
# CONFIG_USB_MOUSE is not set

#
# USB support
#
CONFIG_USB_ARCH_HAS_HCD=y
CONFIG_USB_ARCH_HAS_OHCI=y
CONFIG_USB_ARCH_HAS_EHCI=y
CONFIG_USB=y
# CONFIG_USB_DEBUG is not set

#
# Miscellaneous USB options
#
# CONFIG_USB_DEVICEFS is not set
CONFIG_USB_DEVICE_CLASS=y
# CONFIG_USB_DYNAMIC_MINORS is not set
# CONFIG_USB_OTG is not set

#
# USB Host Controller Drivers
#
CONFIG_USB_EHCI_HCD=y
CONFIG_USB_EHCI_SPLIT_ISO=y
CONFIG_USB_EHCI_ROOT_HUB_TT=y
CONFIG_USB_EHCI_TT_NEWSCHED=y
# CONFIG_USB_EHCI_BIG_ENDIAN_MMIO is not set
# CONFIG_USB_ISP116X_HCD is not set
# CONFIG_USB_OHCI_HCD is not set
CONFIG_USB_UHCI_HCD=y
# CONFIG_USB_SL811_HCD is not set

#
# USB Device Class drivers
#
# CONFIG_USB_ACM is not set
# CONFIG_USB_PRINTER is not set

#
# NOTE: USB_STORAGE enables SCSI, and 'SCSI disk support'
#

#
# may also be needed; see USB_STORAGE Help for more information
#
CONFIG_USB_STORAGE=y
# CONFIG_USB_STORAGE_DEBUG is not set
# CONFIG_USB_STORAGE_DATAFAB is not set
# CONFIG_USB_STORAGE_FREECOM is not set
# CONFIG_USB_STORAGE_DPCM is not set
# CONFIG_USB_STORAGE_USBAT is not set
# CONFIG_USB_STORAGE_SDDR09 is not set
# CONFIG_USB_STORAGE_SDDR55 is not set
# CONFIG_USB_STORAGE_JUMPSHOT is not set
# CONFIG_USB_STORAGE_ALAUDA is not set
# CONFIG_USB_STORAGE_KARMA is not set
# CONFIG_USB_LIBUSUAL is not set

#
# USB Imaging devices
#
# CONFIG_USB_MDC800 is not set
# CONFIG_USB_MICROTEK is not set
CONFIG_USB_MON=y

#
# USB port drivers
#

#
# USB Serial Converter support
#
# CONFIG_USB_SERIAL is not set

#
# USB Miscellaneous drivers
#
# CONFIG_USB_EMI62 is not set
# CONFIG_USB_EMI26 is not set
# CONFIG_USB_ADUTUX is not set
# CONFIG_USB_AUERSWALD is not set
# CONFIG_USB_RIO500 is not set
# CONFIG_USB_LEGOTOWER is not set
# CONFIG_USB_LCD is not set
# CONFIG_USB_BERRY_CHARGE is not set
# CONFIG_USB_LED is not set
# CONFIG_USB_CYPRESS_CY7C63 is not set
# CONFIG_USB_CYTHERM is not set
# CONFIG_USB_PHIDGET is not set
# CONFIG_USB_IDMOUSE is not set
# CONFIG_USB_FTDI_ELAN is not set
# CONFIG_USB_APPLEDISPLAY is not set
# CONFIG_USB_SISUSBVGA is not set
# CONFIG_USB_LD is not set
# CONFIG_USB_TRANCEVIBRATOR is not set
# CONFIG_USB_IOWARRIOR is not set

#
# USB DSL modem support
#

#
# USB Gadget Support
#
# CONFIG_USB_GADGET is not set
# CONFIG_MMC is not set

#
# Real Time Clock
#
CONFIG_RTC_LIB=y
# CONFIG_RTC_CLASS is not set

#
# File systems
#
CONFIG_EXT2_FS=y
# CONFIG_EXT2_FS_XATTR is not set
# CONFIG_EXT2_FS_XIP is not set
CONFIG_EXT3_FS=y
CONFIG_EXT3_FS_XATTR=y
# CONFIG_EXT3_FS_POSIX_ACL is not set
# CONFIG_EXT3_FS_SECURITY is not set
# CONFIG_EXT4DEV_FS is not set
CONFIG_JBD=y
# CONFIG_JBD_DEBUG is not set
CONFIG_FS_MBCACHE=y
# CONFIG_REISERFS_FS is not set
# CONFIG_JFS_FS is not set
# CONFIG_FS_POSIX_ACL is not set
# CONFIG_XFS_FS is not set
# CONFIG_GFS2_FS is not set
# CONFIG_OCFS2_FS is not set
# CONFIG_MINIX_FS is not set
# CONFIG_ROMFS_FS is not set
CONFIG_INOTIFY=y
CONFIG_INOTIFY_USER=y
# CONFIG_QUOTA is not set
CONFIG_DNOTIFY=y
# CONFIG_AUTOFS_FS is not set
# CONFIG_AUTOFS4_FS is not set
# CONFIG_FUSE_FS is not set

#
# CD-ROM/DVD Filesystems
#
# CONFIG_ISO9660_FS is not set
# CONFIG_UDF_FS is not set

#
# DOS/FAT/NT Filesystems
#
# CONFIG_MSDOS_FS is not set
# CONFIG_VFAT_FS is not set
# CONFIG_NTFS_FS is not set

#
# Pseudo filesystems
#
CONFIG_PROC_FS=y
CONFIG_PROC_SYSCTL=y
CONFIG_SYSFS=y
CONFIG_TMPFS=y
# CONFIG_TMPFS_POSIX_ACL is not set
# CONFIG_HUGETLB_PAGE is not set
CONFIG_RAMFS=y
# CONFIG_CONFIGFS_FS is not set

#
# Miscellaneous filesystems
#
# CONFIG_ADFS_FS is not set
# CONFIG_AFFS_FS is not set
CONFIG_ECRYPT_FS=y
# CONFIG_HFS_FS is not set
# CONFIG_HFSPLUS_FS is not set
# CONFIG_BEFS_FS is not set
# CONFIG_BFS_FS is not set
# CONFIG_EFS_FS is not set
CONFIG_JFFS2_FS=y
CONFIG_JFFS2_FS_DEBUG=0
CONFIG_JFFS2_FS_WRITEBUFFER=y
# CONFIG_JFFS2_SUMMARY is not set
# CONFIG_JFFS2_FS_XATTR is not set
# CONFIG_JFFS2_COMPRESSION_OPTIONS is not set
CONFIG_JFFS2_ZLIB=y
CONFIG_JFFS2_RTIME=y
# CONFIG_JFFS2_RUBIN is not set
CONFIG_CRAMFS=y
# CONFIG_VXFS_FS is not set
# CONFIG_HPFS_FS is not set
# CONFIG_QNX4FS_FS is not set
# CONFIG_SYSV_FS is not set
# CONFIG_UFS_FS is not set

#
# Network File Systems
#
CONFIG_NFS_FS=y
CONFIG_NFS_V3=y
# CONFIG_NFS_V3_ACL is not set
# CONFIG_NFS_V4 is not set
# CONFIG_NFS_DIRECTIO is not set
CONFIG_NFSD=y
CONFIG_NFSD_V3=y
# CONFIG_NFSD_V3_ACL is not set
# CONFIG_NFSD_V4 is not set
CONFIG_NFSD_TCP=y
CONFIG_ROOT_NFS=y
CONFIG_LOCKD=y
CONFIG_LOCKD_V4=y
CONFIG_EXPORTFS=y
CONFIG_NFS_COMMON=y
CONFIG_SUNRPC=y
# CONFIG_SUNRPC_BIND34 is not set
# CONFIG_RPCSEC_GSS_KRB5 is not set
# CONFIG_RPCSEC_GSS_SPKM3 is not set
# CONFIG_SMB_FS is not set
# CONFIG_CIFS is not set
# CONFIG_NCP_FS is not set
# CONFIG_CODA_FS is not set
# CONFIG_AFS_FS is not set
# CONFIG_9P_FS is not set

#
# Partition Types
#
CONFIG_PARTITION_ADVANCED=y
# CONFIG_ACORN_PARTITION is not set
# CONFIG_OSF_PARTITION is not set
# CONFIG_AMIGA_PARTITION is not set
# CONFIG_ATARI_PARTITION is not set
# CONFIG_MAC_PARTITION is not set
CONFIG_MSDOS_PARTITION=y
# CONFIG_BSD_DISKLABEL is not set
# CONFIG_MINIX_SUBPARTITION is not set
# CONFIG_SOLARIS_X86_PARTITION is not set
# CONFIG_UNIXWARE_DISKLABEL is not set
# CONFIG_LDM_PARTITION is not set
# CONFIG_SGI_PARTITION is not set
# CONFIG_ULTRIX_PARTITION is not set
# CONFIG_SUN_PARTITION is not set
# CONFIG_KARMA_PARTITION is not set
# CONFIG_EFI_PARTITION is not set
# CONFIG_SYSV68_PARTITION is not set

#
# Native Language Support
#
# CONFIG_NLS is not set

#
# Distributed Lock Manager
#
# CONFIG_DLM is not set

#
# Profiling support
#
# CONFIG_PROFILING is not set

#
# Kernel hacking
#
# CONFIG_PRINTK_TIME is not set
CONFIG_ENABLE_MUST_CHECK=y
CONFIG_MAGIC_SYSRQ=y
# CONFIG_UNUSED_SYMBOLS is not set
# CONFIG_DEBUG_FS is not set
# CONFIG_HEADERS_CHECK is not set
CONFIG_DEBUG_KERNEL=y
# CONFIG_DEBUG_SHIRQ is not set
CONFIG_DETECT_SOFTLOCKUP=y
# CONFIG_SCHEDSTATS is not set
# CONFIG_TIMER_STATS is not set
# CONFIG_DEBUG_SLAB is not set
# CONFIG_DEBUG_RT_MUTEXES is not set
# CONFIG_RT_MUTEX_TESTER is not set
# CONFIG_DEBUG_SPINLOCK is not set
# CONFIG_DEBUG_MUTEXES is not set
# CONFIG_DEBUG_LOCK_ALLOC is not set
# CONFIG_PROVE_LOCKING is not set
# CONFIG_DEBUG_SPINLOCK_SLEEP is not set
# CONFIG_DEBUG_LOCKING_API_SELFTESTS is not set
# CONFIG_DEBUG_KOBJECT is not set
CONFIG_DEBUG_BUGVERBOSE=y
# CONFIG_DEBUG_INFO is not set
# CONFIG_DEBUG_VM is not set
# CONFIG_DEBUG_LIST is not set
CONFIG_FRAME_POINTER=y
# CONFIG_FORCED_INLINING is not set
# CONFIG_RCU_TORTURE_TEST is not set
# CONFIG_FAULT_INJECTION is not set
CONFIG_DEBUG_USER=y
# CONFIG_DEBUG_ERRORS is not set
CONFIG_DEBUG_LL=y
# CONFIG_DEBUG_ICEDCC is not set

#
# Security options
#
CONFIG_KEYS=y
CONFIG_KEYS_DEBUG_PROC_KEYS=y
# CONFIG_SECURITY is not set

#
# Cryptographic options
#
CONFIG_CRYPTO=y
CONFIG_CRYPTO_ALGAPI=y
CONFIG_CRYPTO_BLKCIPHER=y
CONFIG_CRYPTO_HASH=y
CONFIG_CRYPTO_MANAGER=y
CONFIG_CRYPTO_HMAC=y
CONFIG_CRYPTO_XCBC=y
CONFIG_CRYPTO_NULL=y
CONFIG_CRYPTO_MD4=y
CONFIG_CRYPTO_MD5=y
CONFIG_CRYPTO_SHA1=y
CONFIG_CRYPTO_SHA256=y
CONFIG_CRYPTO_SHA512=y
CONFIG_CRYPTO_WP512=y
CONFIG_CRYPTO_TGR192=y
CONFIG_CRYPTO_GF128MUL=y
CONFIG_CRYPTO_ECB=y
CONFIG_CRYPTO_CBC=y
CONFIG_CRYPTO_PCBC=m
CONFIG_CRYPTO_LRW=y
# CONFIG_CRYPTO_CRYPTD is not set
CONFIG_CRYPTO_DES=y
# CONFIG_CRYPTO_FCRYPT is not set
CONFIG_CRYPTO_BLOWFISH=y
CONFIG_CRYPTO_TWOFISH=y
CONFIG_CRYPTO_TWOFISH_COMMON=y
CONFIG_CRYPTO_SERPENT=y
CONFIG_CRYPTO_AES=y
CONFIG_CRYPTO_CAST5=y
CONFIG_CRYPTO_CAST6=y
CONFIG_CRYPTO_TEA=y
CONFIG_CRYPTO_ARC4=y
CONFIG_CRYPTO_KHAZAD=y
CONFIG_CRYPTO_ANUBIS=y
CONFIG_CRYPTO_DEFLATE=y
CONFIG_CRYPTO_MICHAEL_MIC=y
CONFIG_CRYPTO_CRC32C=y
# CONFIG_CRYPTO_CAMELLIA is not set
# CONFIG_CRYPTO_TEST is not set

#
# Hardware crypto devices
#

#
# Library routines
#
CONFIG_BITREVERSE=y
# CONFIG_CRC_CCITT is not set
# CONFIG_CRC16 is not set
# CONFIG_CRC_ITU_T is not set
CONFIG_CRC32=y
CONFIG_LIBCRC32C=y
CONFIG_ZLIB_INFLATE=y
CONFIG_ZLIB_DEFLATE=y
CONFIG_PLIST=y
CONFIG_HAS_IOMEM=y
CONFIG_HAS_IOPORT=y
CONFIG_HAS_DMA=y


* Re: Network slowdown due to CFS
  2007-09-26  8:52 Network slowdown due to CFS Martin Michlmayr
@ 2007-09-26  9:34 ` Ingo Molnar
  2007-09-26  9:47   ` Ingo Molnar
  2007-09-26 10:20 ` Mike Galbraith
  2007-09-26 10:23 ` Mike Galbraith
  2 siblings, 1 reply; 71+ messages in thread
From: Ingo Molnar @ 2007-09-26  9:34 UTC (permalink / raw)
  To: Martin Michlmayr; +Cc: Srivatsa Vaddagiri, linux-kernel


* Martin Michlmayr <tbm@cyrius.com> wrote:

> I noticed that my network performance has gone down from 2.6.22
> from   [  3]  0.0-10.0 sec    113 MBytes  95.0 Mbits/sec
> to     [  3]  0.0-10.0 sec   75.7 MBytes  63.3 Mbits/sec
> with 2.6.23-rc1 (and 2.6.23-rc8), as measured with iperf.
> 
> I did a git bisect today and tracked it back to the commit where CFS
> was enabled ("sched: cfs core code; apply the CFS core code",
> commit dd41f596cda0d7d6e4a8b139ffdfabcefdd46528).  I also compiled a
> kernel from
> git://git.kernel.org/pub/scm/linux/kernel/git/mingo/linux-2.6-sched-devel.git
> but things don't improve.
> 
> This is on a Thecus N2100, an ARM (Intel IOP32x) based storage device 
> with a r8169 card, SATA disks and 512 MB RAM.  My config is attached.
> 
> What kind of information can I supply so you can track this down?

as a starter, could you boot the sched-devel.git kernel, with 
CONFIG_SCHED_DEBUG=y and CONFIG_SCHEDSTATS=y enabled and could you run 
this script while the iperf test is in the middle of its testrun:

  http://people.redhat.com/mingo/cfs-scheduler/tools/cfs-debug-info.sh

this will gather a good deal of info about the workload in question. 
Please send me the resulting debug file.

	Ingo


* Re: Network slowdown due to CFS
  2007-09-26  9:34 ` Ingo Molnar
@ 2007-09-26  9:47   ` Ingo Molnar
  2007-09-26 10:08     ` Martin Michlmayr
  0 siblings, 1 reply; 71+ messages in thread
From: Ingo Molnar @ 2007-09-26  9:47 UTC (permalink / raw)
  To: Martin Michlmayr; +Cc: Srivatsa Vaddagiri, linux-kernel


* Ingo Molnar <mingo@elte.hu> wrote:

> > What kind of information can I supply so you can track this down?
> 
> as a starter, could you boot the sched-devel.git kernel, with 
> CONFIG_SCHED_DEBUG=y and CONFIG_SCHEDSTATS=y enabled and could you run 
> this script while the iperf test is in the middle of its testrun:
> 
>   http://people.redhat.com/mingo/cfs-scheduler/tools/cfs-debug-info.sh
> 
> this will gather a good deal of info about the workload in question. 
> Please send me the resulting debug file.

Another thing: please also do the same with the vanilla v2.6.22 kernel, 
and send me that file too. (so that the two cases can be compared)

	Ingo


* Re: Network slowdown due to CFS
  2007-09-26  9:47   ` Ingo Molnar
@ 2007-09-26 10:08     ` Martin Michlmayr
  2007-09-26 10:18       ` Ingo Molnar
  0 siblings, 1 reply; 71+ messages in thread
From: Martin Michlmayr @ 2007-09-26 10:08 UTC (permalink / raw)
  To: Ingo Molnar; +Cc: Srivatsa Vaddagiri, linux-kernel

* Ingo Molnar <mingo@elte.hu> [2007-09-26 11:47]:
> > this will gather a good deal of info about the workload in question. 
> > Please send me the resulting debug file.
> Another thing: please also do the same with the vanilla v2.6.22 kernel, 
> and send me that file too. (so that the two cases can be compared)

I put the log files here:
http://www.cyrius.com/tmp/2.6.22
http://www.cyrius.com/tmp/2.6.23-rc8-sched-devel

I increased the time iperf ran to 30 secs since your script runs for
over 15 secs.  I got:

[  3]  0.0-30.0 sec    331 MBytes  92.6 Mbits/sec   2.6.22
vs
[  3]  0.0-30.0 sec    222 MBytes  62.1 Mbits/sec   2.6.23-rc8-sched-devel

-- 
Martin Michlmayr
http://www.cyrius.com/


* Re: Network slowdown due to CFS
  2007-09-26 10:08     ` Martin Michlmayr
@ 2007-09-26 10:18       ` Ingo Molnar
  0 siblings, 0 replies; 71+ messages in thread
From: Ingo Molnar @ 2007-09-26 10:18 UTC (permalink / raw)
  To: Martin Michlmayr
  Cc: Srivatsa Vaddagiri, linux-kernel, Peter Zijlstra, Mike Galbraith


* Martin Michlmayr <tbm@cyrius.com> wrote:

> * Ingo Molnar <mingo@elte.hu> [2007-09-26 11:47]:
> > > this will gather a good deal of info about the workload in question. 
> > > Please send me the resulting debug file.
> > Another thing: please also do the same with the vanilla v2.6.22 kernel, 
> > and send me that file too. (so that the two cases can be compared)
> 
> I put the log files here:
> http://www.cyrius.com/tmp/2.6.22
> http://www.cyrius.com/tmp/2.6.23-rc8-sched-devel
> 
> I increased the time iperf ran to 30 secs since your script runs for
> over 15 secs.  I got:
> 
> [  3]  0.0-30.0 sec    331 MBytes  92.6 Mbits/sec   2.6.22
> vs
> [  3]  0.0-30.0 sec    222 MBytes  62.1 Mbits/sec   2.6.23-rc8-sched-devel

thanks!

the test does almost no context switches:

procs -----------memory---------- ---swap-- -----io---- -system-- ----cpu----
 r  b   swpd   free   buff  cache   si   so    bi    bo   in   cs us sy id wa
 2  0      0 462928   3280  37216    0    0   137    75 2306   83 13 38 47  2
 2  0      0 462928   3280  37216    0    0     0     0 8600   54  6 94  0  0
 2  0      0 462928   3280  37216    0    0     0    36 8667   55  7 93  0  0
 2  0      0 462928   3280  37216    0    0     0     0 8592   53  5 95  0  0
 2  0      0 462928   3280  37216    0    0     0     0 8638   52  7 93  0  0

(the 'cs' column shows 50-80 context switches per second.)

so there must be some other side-effect, not raw scheduling overhead or 
some other direct scheduler performance problem.

	Ingo


* Re: Network slowdown due to CFS
  2007-09-26  8:52 Network slowdown due to CFS Martin Michlmayr
  2007-09-26  9:34 ` Ingo Molnar
@ 2007-09-26 10:20 ` Mike Galbraith
  2007-09-26 10:23 ` Mike Galbraith
  2 siblings, 0 replies; 71+ messages in thread
From: Mike Galbraith @ 2007-09-26 10:20 UTC (permalink / raw)
  To: Martin Michlmayr; +Cc: Ingo Molnar, Srivatsa Vaddagiri, linux-kernel

On Wed, 2007-09-26 at 10:52 +0200, Martin Michlmayr wrote:
> I noticed that my network performance has gone down from 2.6.22
> from   [  3]  0.0-10.0 sec    113 MBytes  95.0 Mbits/sec
> to     [  3]  0.0-10.0 sec   75.7 MBytes  63.3 Mbits/sec
> with 2.6.23-rc1 (and 2.6.23-rc8), as measured with iperf.

FWIW, on my box blasting localhost, I see the opposite, repeatably.

root@Homer: iperf -v
iperf version 2.0.2 (03 May 2005) pthreads

2.6.22.1-smp
for i in `seq 1 3`; do iperf -c localhost; done >> /xx

------------------------------------------------------------
Client connecting to localhost, TCP port 5001
TCP window size: 49.2 KByte (default)
------------------------------------------------------------
[  3] local 127.0.0.1 port 21383 connected with 127.0.0.1 port 5001
[  3]  0.0-10.1 sec    518 MBytes    432 Mbits/sec
------------------------------------------------------------
Client connecting to localhost, TCP port 5001
TCP window size: 49.2 KByte (default)
------------------------------------------------------------
[  3] local 127.0.0.1 port 21384 connected with 127.0.0.1 port 5001
[  3]  0.0-10.0 sec    325 MBytes    273 Mbits/sec
------------------------------------------------------------
Client connecting to localhost, TCP port 5001
TCP window size: 49.2 KByte (default)
------------------------------------------------------------
[  3] local 127.0.0.1 port 21385 connected with 127.0.0.1 port 5001
[  3]  0.0-10.0 sec    434 MBytes    363 Mbits/sec

2.6.23-rc8-smp-d
for i in `seq 1 3`; do iperf -c localhost; done >> /xx

------------------------------------------------------------
Client connecting to localhost, TCP port 5001
TCP window size: 49.2 KByte (default)
------------------------------------------------------------
[  3] local 127.0.0.1 port 11650 connected with 127.0.0.1 port 5001
[  3]  0.0-10.0 sec  2.01 GBytes  1.72 Gbits/sec
------------------------------------------------------------
Client connecting to localhost, TCP port 5001
TCP window size: 49.2 KByte (default)
------------------------------------------------------------
[  3] local 127.0.0.1 port 11651 connected with 127.0.0.1 port 5001
[  3]  0.0-10.0 sec  2.02 GBytes  1.74 Gbits/sec
------------------------------------------------------------
Client connecting to localhost, TCP port 5001
TCP window size: 49.2 KByte (default)
------------------------------------------------------------
[  3] local 127.0.0.1 port 11652 connected with 127.0.0.1 port 5001
[  3]  0.0-10.0 sec  2.10 GBytes  1.81 Gbits/sec




* Re: Network slowdown due to CFS
  2007-09-26  8:52 Network slowdown due to CFS Martin Michlmayr
  2007-09-26  9:34 ` Ingo Molnar
  2007-09-26 10:20 ` Mike Galbraith
@ 2007-09-26 10:23 ` Mike Galbraith
  2007-09-26 10:48   ` Martin Michlmayr
  2 siblings, 1 reply; 71+ messages in thread
From: Mike Galbraith @ 2007-09-26 10:23 UTC (permalink / raw)
  To: Martin Michlmayr; +Cc: Ingo Molnar, Srivatsa Vaddagiri, linux-kernel

On Wed, 2007-09-26 at 10:52 +0200, Martin Michlmayr wrote:
> I noticed that my network performance has gone down from 2.6.22
> from   [  3]  0.0-10.0 sec    113 MBytes  95.0 Mbits/sec
> to     [  3]  0.0-10.0 sec   75.7 MBytes  63.3 Mbits/sec
> with 2.6.23-rc1 (and 2.6.23-rc8), as measured with iperf.
> 
> I did a git bisect today and tracked it back to the commit where CFS
> was enabled ("sched: cfs core code; apply the CFS core code",
> commit dd41f596cda0d7d6e4a8b139ffdfabcefdd46528).  I also compiled a
> kernel from
> git://git.kernel.org/pub/scm/linux/kernel/git/mingo/linux-2.6-sched-devel.git
> but things don't improve.
> 
> This is on a Thecus N2100, an ARM (Intel IOP32x) based storage device
> with a r8169 card, SATA disks and 512 MB RAM.  My config is attached.
> 
> What kind of information can I supply so you can track this down?

I noticed on the iperf website a patch which contains sched_yield().

http://dast.nlanr.net/Projects/Iperf2.0/patch-iperf-linux-2.6.21.txt

Do you have that patch applied by any chance?  If so, it might be
worthwhile to try it without it.

	-Mike



* Re: Network slowdown due to CFS
  2007-09-26 10:23 ` Mike Galbraith
@ 2007-09-26 10:48   ` Martin Michlmayr
  2007-09-26 11:21     ` Ingo Molnar
  0 siblings, 1 reply; 71+ messages in thread
From: Martin Michlmayr @ 2007-09-26 10:48 UTC (permalink / raw)
  To: Mike Galbraith; +Cc: Ingo Molnar, Srivatsa Vaddagiri, linux-kernel

* Mike Galbraith <efault@gmx.de> [2007-09-26 12:23]:
> I noticed on the iperf website a patch which contains sched_yield().
> http://dast.nlanr.net/Projects/Iperf2.0/patch-iperf-linux-2.6.21.txt
> 
> Do you have that patch applied by any chance?  If so, it might be
> worthwhile to try it without it.

Yes, this patch was applied.  When I revert it, I get the same (high)
performance with both kernels.
-- 
Martin Michlmayr
http://www.cyrius.com/


* Re: Network slowdown due to CFS
  2007-09-26 10:48   ` Martin Michlmayr
@ 2007-09-26 11:21     ` Ingo Molnar
  2007-09-26 11:29       ` Martin Michlmayr
  0 siblings, 1 reply; 71+ messages in thread
From: Ingo Molnar @ 2007-09-26 11:21 UTC (permalink / raw)
  To: Martin Michlmayr; +Cc: Mike Galbraith, Srivatsa Vaddagiri, linux-kernel


* Martin Michlmayr <tbm@cyrius.com> wrote:

> * Mike Galbraith <efault@gmx.de> [2007-09-26 12:23]:
> > I noticed on the iperf website a patch which contains sched_yield().
> > http://dast.nlanr.net/Projects/Iperf2.0/patch-iperf-linux-2.6.21.txt
> > 
> > Do you have that patch applied by any chance?  If so, it might be
> > worthwhile to try it without it.
> 
> Yes, this patch was applied.  When I revert it, I get the same (high) 
> performance with both kernels.

great! Could you try this too:

   echo 1 > /proc/sys/kernel/sched_compat_yield

does it fix iperf performance too (with the yield patch applied to 
iperf)?

I think the real fix would be for iperf to use blocking network IO 
though, or maybe to use a POSIX mutex or POSIX semaphores.

	Ingo


* Re: Network slowdown due to CFS
  2007-09-26 11:21     ` Ingo Molnar
@ 2007-09-26 11:29       ` Martin Michlmayr
  2007-09-26 12:00         ` David Schwartz
  2007-09-27  9:49         ` Ingo Molnar
  0 siblings, 2 replies; 71+ messages in thread
From: Martin Michlmayr @ 2007-09-26 11:29 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Mike Galbraith, Srivatsa Vaddagiri, linux-kernel, Stephen Hemminger

* Ingo Molnar <mingo@elte.hu> [2007-09-26 13:21]:
> > > I noticed on the iperf website a patch which contains sched_yield().
> > > http://dast.nlanr.net/Projects/Iperf2.0/patch-iperf-linux-2.6.21.txt
> 
> great! Could you try this too:
>    echo 1 > /proc/sys/kernel/sched_compat_yield
> 
> does it fix iperf performance too (with the yield patch applied to
> iperf)?

Yes, this gives me good performance too.

> I think the real fix would be for iperf to use blocking network IO
> though, or maybe to use a POSIX mutex or POSIX semaphores.

So it's definitely not a bug in the kernel, only in iperf?

(CCing Stephen Hemminger who wrote the iperf patch.)
-- 
Martin Michlmayr
http://www.cyrius.com/


* RE: Network slowdown due to CFS
  2007-09-26 11:29       ` Martin Michlmayr
@ 2007-09-26 12:00         ` David Schwartz
  2007-09-26 13:31           ` Ingo Molnar
  2007-09-27  9:49         ` Ingo Molnar
  1 sibling, 1 reply; 71+ messages in thread
From: David Schwartz @ 2007-09-26 12:00 UTC (permalink / raw)
  To: Linux-Kernel@Vger. Kernel. Org


> > I think the real fix would be for iperf to use blocking network IO
> > though, or maybe to use a POSIX mutex or POSIX semaphores.
>
> So it's definitely not a bug in the kernel, only in iperf?

Martin:

Actually, in this case I think iperf is doing the right thing (though not
the best thing) and the kernel is doing the wrong thing. It's calling
'sched_yield' to ensure that every other thread gets a chance to run before
the current thread runs again. CFS is not doing that, allowing the yielding
thread to hog the CPU to the exclusion of the other threads. (It can allow
the yielding thread to hog the CPU, of course, just not to the exclusion of
other threads.)

It's still better to use some kind of rational synchronization primitive
(like mutex/semaphore) so that the other threads can tell you when there's
something for you to do. It's still better to use blocking network IO, so
the kernel will let you know exactly when to try I/O and your dynamic
priority can rise.
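
A minimal sketch of what that looks like with plain pthreads (illustrative
only; none of these names come from the iperf source):

/*
 * Illustrative only: the waiting thread sleeps on a condition variable
 * until another thread tells it there is work, instead of spinning on
 * sched_yield()/usleep(0).
 */
#include <pthread.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  work_ready = PTHREAD_COND_INITIALIZER;
static int work_pending;

void post_work(void)                    /* producer side */
{
    pthread_mutex_lock(&lock);
    work_pending = 1;
    pthread_cond_signal(&work_ready);   /* wake the waiter, no polling */
    pthread_mutex_unlock(&lock);
}

void wait_for_work(void)                /* consumer side */
{
    pthread_mutex_lock(&lock);
    while (!work_pending)               /* re-check: spurious wakeups are legal */
        pthread_cond_wait(&work_ready, &lock);
    work_pending = 0;
    pthread_mutex_unlock(&lock);
}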

Ingo:

Can you clarify what CFS' current default sched_yield implementation is and
what setting sched_compat_yield to 1 does? Which way do we get the right
semantics (all threads of equal static priority are scheduled, with some
possible SMP fuzziness, before this thread is scheduled again)?

DS




* Re: Network slowdown due to CFS
  2007-09-26 12:00         ` David Schwartz
@ 2007-09-26 13:31           ` Ingo Molnar
  2007-09-26 15:40             ` Stephen Hemminger
                               ` (2 more replies)
  0 siblings, 3 replies; 71+ messages in thread
From: Ingo Molnar @ 2007-09-26 13:31 UTC (permalink / raw)
  To: David Schwartz
  Cc: Linux-Kernel@Vger. Kernel. Org, Mike Galbraith, Peter Zijlstra,
	Martin Michlmayr, Srivatsa Vaddagiri, Stephen Hemminger


* David Schwartz <davids@webmaster.com> wrote:

> > > I think the real fix would be for iperf to use blocking network IO 
> > > though, or maybe to use a POSIX mutex or POSIX semaphores.
> >
> > So it's definitely not a bug in the kernel, only in iperf?
> 
> Martin:
> 
> Actually, in this case I think iperf is doing the right thing (though not
> the best thing) and the kernel is doing the wrong thing. [...]

it's not doing the right thing at all. I had a quick look at the source 
code, and the reason for that weird yield usage was that there's a 
locking bug in iperf's "Reporter thread" abstraction and apparently 
instead of fixing the bug it was worked around via a horrible yield() 
based user-space lock.

the (small) patch below fixes the iperf locking bug and removes the 
yield() use. There are numerous immediate benefits of this patch:

 - iperf uses _much_ less CPU time. On my Core2Duo test system, before 
   the patch it used up 100% CPU time to saturate 1 gigabit of network 
   traffic to another box. With the patch applied it now uses 9% of 
   CPU time.

 - sys_sched_yield() is removed altogether

 - i was able to measure much higher bandwidth over localhost for 
   example. This is the case for over-the-network measurements as well.

 - the results are also more consistent and more deterministic, hence 
   more reliable as a benchmarking tool. (the reason for that is that
   more CPU time is spent on actually delivering packets, instead of
   mindlessly polling on the user-space "lock", so we actually max out
   the CPU, instead of relying on the random proportion the workload was
   able to make progress versus wasting CPU time on polling.)

sched_yield() is almost always a symptom of broken locking or some other
bug. In that sense CFS does the right thing by exposing such bugs B-)
 
	Ingo

------------------------->
Subject: iperf: fix locking
From: Ingo Molnar <mingo@elte.hu>

fix iperf locking - it was burning CPU time while polling
unnecessarily, instead of using the proper wait primitives.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 compat/Thread.c |    3 ---
 src/Reporter.c  |   13 +++++++++----
 src/main.cpp    |    2 ++
 3 files changed, 11 insertions(+), 7 deletions(-)

Index: iperf-2.0.2/compat/Thread.c
===================================================================
--- iperf-2.0.2.orig/compat/Thread.c
+++ iperf-2.0.2/compat/Thread.c
@@ -405,9 +405,6 @@ int thread_numuserthreads( void ) {
 void thread_rest ( void ) {
 #if defined( HAVE_THREAD )
 #if defined( HAVE_POSIX_THREAD )
-    // TODO add checks for sched_yield or pthread_yield and call that
-    // if available
-    usleep( 0 );
 #else // Win32
     SwitchToThread( );
 #endif
Index: iperf-2.0.2/src/Reporter.c
===================================================================
--- iperf-2.0.2.orig/src/Reporter.c
+++ iperf-2.0.2/src/Reporter.c
@@ -111,6 +111,7 @@ report_statistics multiple_reports[kRepo
 char buffer[64]; // Buffer for printing
 ReportHeader *ReportRoot = NULL;
 extern Condition ReportCond;
+extern Condition ReportDoneCond;
 int reporter_process_report ( ReportHeader *report );
 void process_report ( ReportHeader *report );
 int reporter_handle_packet( ReportHeader *report );
@@ -338,7 +339,7 @@ void ReportPacket( ReportHeader* agent, 
             // item
             while ( index == 0 ) {
                 Condition_Signal( &ReportCond );
-                thread_rest();
+                Condition_Wait( &ReportDoneCond );
                 index = agent->reporterindex;
             }
             agent->agentindex = 0;
@@ -346,7 +347,7 @@ void ReportPacket( ReportHeader* agent, 
         // Need to make sure that reporter is not about to be "lapped"
         while ( index - 1 == agent->agentindex ) {
             Condition_Signal( &ReportCond );
-            thread_rest();
+            Condition_Wait( &ReportDoneCond );
             index = agent->reporterindex;
         }
         
@@ -553,6 +554,7 @@ void reporter_spawn( thread_Settings *th
         }
         Condition_Unlock ( ReportCond );
 
+again:
         if ( ReportRoot != NULL ) {
             ReportHeader *temp = ReportRoot;
             //Condition_Unlock ( ReportCond );
@@ -575,9 +577,12 @@ void reporter_spawn( thread_Settings *th
                 // finished with report so free it
                 free( temp );
                 Condition_Unlock ( ReportCond );
+            	Condition_Signal( &ReportDoneCond );
+		if (ReportRoot)
+			goto again;
             }
-            // yield control of CPU is another thread is waiting
-            thread_rest();
+            Condition_Signal( &ReportDoneCond );
+            usleep(10000);
         } else {
             //Condition_Unlock ( ReportCond );
         }
Index: iperf-2.0.2/src/main.cpp
===================================================================
--- iperf-2.0.2.orig/src/main.cpp
+++ iperf-2.0.2/src/main.cpp
@@ -96,6 +96,7 @@ extern "C" {
     // records being accessed in a report and also to
     // serialize modification of the report list
     Condition ReportCond;
+    Condition ReportDoneCond;
 }
 
 // global variables only accessed within this file
@@ -141,6 +142,7 @@ int main( int argc, char **argv ) {
 
     // Initialize global mutexes and conditions
     Condition_Initialize ( &ReportCond );
+    Condition_Initialize ( &ReportDoneCond );
     Mutex_Initialize( &groupCond );
     Mutex_Initialize( &clients_mutex );
 


* Re: Network slowdown due to CFS
  2007-09-26 13:31           ` Ingo Molnar
@ 2007-09-26 15:40             ` Stephen Hemminger
  2007-09-26 15:46             ` Stephen Hemminger
  2007-09-27  9:30             ` Jarek Poplawski
  2 siblings, 0 replies; 71+ messages in thread
From: Stephen Hemminger @ 2007-09-26 15:40 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: David Schwartz, Linux-Kernel@Vger. Kernel. Org, Mike Galbraith,
	Peter Zijlstra, Martin Michlmayr, Srivatsa Vaddagiri

On Wed, 26 Sep 2007 15:31:38 +0200
Ingo Molnar <mingo@elte.hu> wrote:

> 
> * David Schwartz <davids@webmaster.com> wrote:
> 
> > > > I think the real fix would be for iperf to use blocking network
> > > > IO though, or maybe to use a POSIX mutex or POSIX semaphores.
> > >
> > > So it's definitely not a bug in the kernel, only in iperf?
> > 
> > Martin:
> > 
> > Actually, in this case I think iperf is doing the right thing
> > (though not the best thing) and the kernel is doing the wrong
> > thing. [...]
> 
> it's not doing the right thing at all. I had a quick look at the
> source code, and the reason for that weird yield usage was that
> there's a locking bug in iperf's "Reporter thread" abstraction and
> apparently instead of fixing the bug it was worked around via a
> horrible yield() based user-space lock.
> 
> the (small) patch below fixes the iperf locking bug and removes the 
> yield() use. There are numerous immediate benefits of this patch:
> 
>  - iperf uses _much_ less CPU time. On my Core2Duo test system,
> before the patch it used up 100% CPU time to saturate 1 gigabit of
> network traffic to another box. With the patch applied it now uses 9%
> of CPU time.
> 
>  - sys_sched_yield() is removed altogether
> 
>  - i was able to measure much higher bandwidth over localhost for 
>    example. This is the case for over-the-network measurements as
> well.
> 
>  - the results are also more consistent and more deterministic, hence 
>    more reliable as a benchmarking tool. (the reason for that is that
>    more CPU time is spent on actually delivering packets, instead of
>    mindlessly polling on the user-space "lock", so we actually max out
>    the CPU, instead of relying on the random proportion the workload
> was able to make progress versus wasting CPU time on polling.)
> 
> sched_yield() is almost always the symptom of broken locking or other 
> bug. In that sense CFS does the right thing by exposing such bugs =B-)
>  
> 	Ingo

A similar patch has already been submitted, since BSD wouldn't work
without it.


* Re: Network slowdown due to CFS
  2007-09-26 13:31           ` Ingo Molnar
  2007-09-26 15:40             ` Stephen Hemminger
@ 2007-09-26 15:46             ` Stephen Hemminger
  2007-09-27  9:30             ` Jarek Poplawski
  2 siblings, 0 replies; 71+ messages in thread
From: Stephen Hemminger @ 2007-09-26 15:46 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: David Schwartz, Linux-Kernel@Vger. Kernel. Org, Mike Galbraith,
	Peter Zijlstra, Martin Michlmayr, Srivatsa Vaddagiri

[-- Attachment #1: Type: text/plain, Size: 2061 bytes --]

Here is the combined fixes from iperf-users list.

Begin forwarded message:

Date: Thu, 30 Aug 2007 15:55:22 -0400
From: "Andrew Gallatin" <gallatin@gmail.com>
To: iperf-users@dast.nlanr.net
Subject: [PATCH] performance fixes for non-linux


Hi,

I've attached a patch which gives iperf similar performance to netperf
on my FreeBSD, MacOSX and Solaris hosts.  It does not seem to
negatively impact Linux.  I only started looking at the iperf source
yesterday, so I don't really expect this to be integrated as is, but a
patch is worth a 1000 words :)

Background: On both Solaris and FreeBSD, there are two things slowing
iperf down: the gettimeofday() timestamp around each socket read/write
is terribly expensive, and the sched_yield() or usleep(0) calls cause
iperf to consume 100% CPU time (system time on BSD, split user/system
time on Solaris and MacOSX), which slows things down and confuses the
scheduler.

To address the gettimeofday() issue, I treat TCP differently from UDP,
and TCP tests behave as though only a single (huge) packet was sent.
Rather than ending the test based on polling gettimeofday()
timestamps, an interval timer / SIGALRM handler is used.  I had
to change packetLen from an int to a max_size_t.
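
For illustration, here is a minimal standalone sketch of that
setitimer()/SIGALRM pattern.  It is not the actual iperf code;
run_timed_send(), stop_flag and the buffer arguments are made-up names
used only to show the shape of the change:

#include <signal.h>
#include <string.h>
#include <sys/time.h>
#include <unistd.h>

/* Set by the SIGALRM handler when the requested test duration expires. */
static volatile sig_atomic_t stop_flag = 0;

static void on_alarm( int sig ) { (void) sig; stop_flag = 1; }

static void run_timed_send( int sock, const char *buf, size_t len, double secs )
{
    struct sigaction sa;
    struct itimerval it;

    /* no SA_RESTART: a blocking write() returns EINTR when the alarm fires */
    memset( &sa, 0, sizeof( sa ) );
    sa.sa_handler = on_alarm;
    sigaction( SIGALRM, &sa, NULL );

    /* one timer for the whole test instead of gettimeofday() per write() */
    memset( &it, 0, sizeof( it ) );
    it.it_value.tv_sec  = (long) secs;
    it.it_value.tv_usec = (long) ((secs - (long) secs) * 1e6);
    setitimer( ITIMER_REAL, &it, NULL );

    while ( !stop_flag ) {
        if ( write( sock, buf, len ) < 0 )
            break;
    }
}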

To address the sched_yield/usleep issue, I put the reporter thread
to sleep on a condition variable.  For the TCP tests at least, there
is no reason to have it running during the test and it is best
to just get it out of the way rather than burning CPU in a tight
loop.
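
A minimal sketch of that idea in plain pthreads (iperf has its own
Condition_* wrappers; report_lock, report_cond and work_pending below
are illustrative placeholders rather than iperf's real data
structures):

#include <pthread.h>

static pthread_mutex_t report_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  report_cond = PTHREAD_COND_INITIALIZER;
static int work_pending = 0;
static int test_done    = 0;

/* reporter thread: sleep until there is something to report */
static void *reporter( void *arg )
{
    (void) arg;
    pthread_mutex_lock( &report_lock );
    while ( !test_done ) {
        while ( !work_pending && !test_done )
            pthread_cond_wait( &report_cond, &report_lock );
        work_pending = 0;
        /* ... drain and print the pending statistics here ... */
    }
    pthread_mutex_unlock( &report_lock );
    return NULL;
}

/* traffic thread: wake the reporter instead of sched_yield()/usleep(0) */
static void post_report( void )
{
    pthread_mutex_lock( &report_lock );
    work_pending = 1;
    pthread_cond_signal( &report_cond );
    pthread_mutex_unlock( &report_lock );
}

This is roughly what the Condition_TimedWait() call in the attached
patch does, with a one-second timeout as a safety net.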

I've also incorporated some fixes from the FreeBSD ports collection:

--- include/headers.h
use a 64-bit type for max_size_t

--- compat/Thread.c
oldTID is not declared anywhere.  Make this compile
(seems needed for at least FreeBSD & MacOSX)

--- src/Client.cpp
BSDs can return ENOBUFS during a UDP test when the socket
buffer fills. Don't exit when this happens.
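
For illustration, the ENOBUFS handling described in the last item
amounts to something like the sketch below; send_datagram() is a
made-up helper, not code from the patch:

#include <errno.h>
#include <unistd.h>

/* Returns the number of bytes written, 0 if the datagram was dropped
 * because the socket buffer was full (BSD ENOBUFS), or -1 on real errors. */
static long send_datagram( int sock, const char *buf, size_t len )
{
    long currLen = write( sock, buf, len );
    if ( currLen < 0 && errno == ENOBUFS )
        return 0;       /* transient: don't abort the UDP test */
    return currLen;
}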

I've run the resulting iperf on FreeBSD, Solaris, MacOSX and Linux,
and it seems to work for me.  It is nice not to have a 100% CPU
load when running an iperf test across a 100Mb/s network.

Drew



[-- Attachment #2: non-linux.diff --]
[-- Type: text/x-patch, Size: 9187 bytes --]

Index: include/Reporter.h
===================================================================
--- include/Reporter.h	(revision 11)
+++ include/Reporter.h	(working copy)
@@ -74,7 +74,7 @@
  */
 typedef struct ReportStruct {
     int packetID;
-    int packetLen;
+    max_size_t packetLen;
     struct timeval packetTime;
     struct timeval sentTime;
 } ReportStruct;
Index: include/headers.h
===================================================================
--- include/headers.h	(revision 11)
+++ include/headers.h	(working copy)
@@ -180,7 +180,7 @@
 // from the gnu archive
 
 #include <iperf-int.h>
-typedef uintmax_t max_size_t;
+typedef uint64_t max_size_t;
 
 /* in case the OS doesn't have these, we provide our own implementations */
 #include "gettimeofday.h"
Index: include/Client.hpp
===================================================================
--- include/Client.hpp	(revision 11)
+++ include/Client.hpp	(working copy)
@@ -69,6 +69,9 @@
     // connects and sends data
     void Run( void );
 
+    // TCP specific version of above
+    void RunTCP( void );
+
     void InitiateServer();
 
     // UDP / TCP
Index: compat/Thread.c
===================================================================
--- compat/Thread.c	(revision 11)
+++ compat/Thread.c	(working copy)
@@ -202,7 +202,7 @@
 #if   defined( HAVE_POSIX_THREAD )
             // Cray J90 doesn't have pthread_cancel; Iperf works okay without
 #ifdef HAVE_PTHREAD_CANCEL
-            pthread_cancel( oldTID );
+            pthread_cancel( thread->mTID );
 #endif
 #else // Win32
             // this is a somewhat dangerous function; it's not
Index: src/Reporter.c
===================================================================
--- src/Reporter.c	(revision 11)
+++ src/Reporter.c	(working copy)
@@ -110,6 +110,8 @@
 
 char buffer[64]; // Buffer for printing
 ReportHeader *ReportRoot = NULL;
+int threadWait = 0;
+int threadSleeping = 0;
 extern Condition ReportCond;
 int reporter_process_report ( ReportHeader *report );
 void process_report ( ReportHeader *report );
@@ -349,7 +351,9 @@
             thread_rest();
             index = agent->reporterindex;
         }
-        
+	if (threadSleeping)
+           Condition_Signal( &ReportCond );
+
         // Put the information there
         memcpy( agent->data + agent->agentindex, packet, sizeof(ReportStruct) );
         
@@ -378,6 +382,9 @@
         packet->packetLen = 0;
         ReportPacket( agent, packet );
         packet->packetID = agent->report.cntDatagrams;
+	if (threadSleeping)
+           Condition_Signal( &ReportCond );
+
     }
 }
 
@@ -389,6 +396,9 @@
 void EndReport( ReportHeader *agent ) {
     if ( agent != NULL ) {
         int index = agent->reporterindex;
+	if (threadSleeping)
+           Condition_Signal( &ReportCond );
+
         while ( index != -1 ) {
             thread_rest();
             index = agent->reporterindex;
@@ -457,6 +467,10 @@
              * Update the ReportRoot to include this report.
              */
             Condition_Lock( ReportCond );
+	    if ( isUDP(agent) )
+	      threadWait = 0;
+	    else
+	      threadWait = 1;
             reporthdr->next = ReportRoot;
             ReportRoot = reporthdr;
             Condition_Signal( &ReportCond );
@@ -577,7 +591,17 @@
                 Condition_Unlock ( ReportCond );
             }
             // yield control of CPU is another thread is waiting
-            thread_rest();
+	    // sleep on a condition variable, as it is much cheaper
+	    // on most platforms than issuing schedyield or usleep
+	    // syscalls
+	    Condition_Lock ( ReportCond );
+	    if ( threadWait && ReportRoot != NULL) {
+	      threadSleeping = 1;
+	      Condition_TimedWait (& ReportCond, 1 );
+	      threadSleeping = 0;
+	    }
+	    Condition_Unlock ( ReportCond );
+	    
         } else {
             //Condition_Unlock ( ReportCond );
         }
Index: src/Server.cpp
===================================================================
--- src/Server.cpp	(revision 11)
+++ src/Server.cpp	(working copy)
@@ -98,6 +98,7 @@
  * ------------------------------------------------------------------- */ 
 void Server::Run( void ) {
     long currLen; 
+    max_size_t totLen = 0;
     struct UDP_datagram* mBuf_UDP  = (struct UDP_datagram*) mBuf; 
 
     ReportStruct *reportstruct = NULL;
@@ -115,22 +116,28 @@
                 reportstruct->packetID = ntohl( mBuf_UDP->id ); 
                 reportstruct->sentTime.tv_sec = ntohl( mBuf_UDP->tv_sec  );
                 reportstruct->sentTime.tv_usec = ntohl( mBuf_UDP->tv_usec ); 
-            }
+		reportstruct->packetLen = currLen;
+		gettimeofday( &(reportstruct->packetTime), NULL );
+            } else {
+		totLen += currLen;
+	    }
         
-            reportstruct->packetLen = currLen;
-            gettimeofday( &(reportstruct->packetTime), NULL );
-        
             // terminate when datagram begins with negative index 
             // the datagram ID should be correct, just negated 
             if ( reportstruct->packetID < 0 ) {
                 reportstruct->packetID = -reportstruct->packetID;
                 currLen = -1; 
             }
-            ReportPacket( mSettings->reporthdr, reportstruct );
+	    if ( isUDP (mSettings))
+		ReportPacket( mSettings->reporthdr, reportstruct );
         } while ( currLen > 0 ); 
         
         // stop timing 
         gettimeofday( &(reportstruct->packetTime), NULL );
+	if ( !isUDP (mSettings)) {
+		reportstruct->packetLen = totLen;
+		ReportPacket( mSettings->reporthdr, reportstruct );
+	}
         CloseReport( mSettings->reporthdr, reportstruct );
         
         // send a acknowledgement back only if we're NOT receiving multicast 
Index: src/Client.cpp
===================================================================
--- src/Client.cpp	(revision 11)
+++ src/Client.cpp	(working copy)
@@ -115,6 +115,79 @@
 const double kSecs_to_usecs = 1e6; 
 const int    kBytes_to_Bits = 8; 
 
+void Client::RunTCP( void ) {
+    long currLen = 0; 
+    struct itimerval it;
+    max_size_t totLen = 0;
+
+    int delay_target = 0; 
+    int delay = 0; 
+    int adjust = 0; 
+    int secs;
+    int usecs;
+    int err;
+
+    char* readAt = mBuf;
+
+    // Indicates if the stream is readable 
+    bool canRead = true, mMode_Time = isModeTime( mSettings ); 
+
+    ReportStruct *reportstruct = NULL;
+
+    // InitReport handles Barrier for multiple Streams
+    mSettings->reporthdr = InitReport( mSettings );
+    reportstruct = new ReportStruct;
+    reportstruct->packetID = 0;
+
+    lastPacketTime.setnow();
+    if ( mMode_Time ) {
+	memset (&it, 0, sizeof (it));
+	it.it_value.tv_sec = (int) (mSettings->mAmount / 100.0);
+	it.it_value.tv_usec = (int) 10000 * (mSettings->mAmount -
+	    it.it_value.tv_sec * 100.0);
+	err = setitimer( ITIMER_REAL, &it, NULL );
+	if ( err != 0 ) {
+	    perror("setitimer");
+	    exit(1);
+	}
+    }
+    do {
+        // Read the next data block from 
+        // the file if it's file input 
+        if ( isFileInput( mSettings ) ) {
+            Extractor_getNextDataBlock( readAt, mSettings ); 
+            canRead = Extractor_canRead( mSettings ) != 0; 
+        } else
+            canRead = true; 
+
+        // perform write 
+        currLen = write( mSettings->mSock, mBuf, mSettings->mBufLen ); 
+        if ( currLen < 0 ) {
+            WARN_errno( currLen < 0, "write2" ); 
+            break; 
+        }
+	totLen += currLen;
+
+        if ( delay > 0 ) {
+            delay_loop( delay ); 
+        }
+        if ( !mMode_Time ) {
+            mSettings->mAmount -= currLen;
+        }
+
+    } while ( ! (sInterupted  || 
+                   (!mMode_Time  &&  0 >= mSettings->mAmount)) && canRead ); 
+
+    // stop timing
+    gettimeofday( &(reportstruct->packetTime), NULL );
+    reportstruct->packetLen = totLen;
+    ReportPacket( mSettings->reporthdr, reportstruct );
+    CloseReport( mSettings->reporthdr, reportstruct );
+
+    DELETE_PTR( reportstruct );
+    EndReport( mSettings->reporthdr );
+}
+
 /* ------------------------------------------------------------------- 
  * Send data using the connected UDP/TCP socket, 
  * until a termination flag is reached. 
@@ -130,6 +203,13 @@
     int adjust = 0; 
 
     char* readAt = mBuf;
+
+#if HAVE_THREAD
+    if ( !isUDP( mSettings ) ) {
+	RunTCP();
+	return;
+    }
+#endif
     
     // Indicates if the stream is readable 
     bool canRead = true, mMode_Time = isModeTime( mSettings ); 
@@ -215,7 +295,7 @@
 
         // perform write 
         currLen = write( mSettings->mSock, mBuf, mSettings->mBufLen ); 
-        if ( currLen < 0 ) {
+        if ( currLen < 0 && errno != ENOBUFS ) {
             WARN_errno( currLen < 0, "write2" ); 
             break; 
         }
Index: src/main.cpp
===================================================================
--- src/main.cpp	(revision 11)
+++ src/main.cpp	(working copy)
@@ -123,6 +123,7 @@
     // Set SIGTERM and SIGINT to call our user interrupt function
     my_signal( SIGTERM, Sig_Interupt );
     my_signal( SIGINT,  Sig_Interupt );
+    my_signal( SIGALRM,  Sig_Interupt );
 
 #ifndef WIN32
     // Ignore broken pipes

^ permalink raw reply	[flat|nested] 71+ messages in thread

* Re: Network slowdown due to CFS
  2007-09-26 13:31           ` Ingo Molnar
  2007-09-26 15:40             ` Stephen Hemminger
  2007-09-26 15:46             ` Stephen Hemminger
@ 2007-09-27  9:30             ` Jarek Poplawski
  2007-09-27  9:46               ` Ingo Molnar
  2 siblings, 1 reply; 71+ messages in thread
From: Jarek Poplawski @ 2007-09-27  9:30 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: David Schwartz, Linux-Kernel@Vger. Kernel. Org, Mike Galbraith,
	Peter Zijlstra, Martin Michlmayr, Srivatsa Vaddagiri,
	Stephen Hemminger

On 26-09-2007 15:31, Ingo Molnar wrote:
> * David Schwartz <davids@webmaster.com> wrote:
> 
>>>> I think the real fix would be for iperf to use blocking network IO 
>>>> though, or maybe to use a POSIX mutex or POSIX semaphores.
>>> So it's definitely not a bug in the kernel, only in iperf?
>> Martin:
>>
>> Actually, in this case I think iperf is doing the right thing (though not
>> the best thing) and the kernel is doing the wrong thing. [...]
> 
> it's not doing the right thing at all. I had a quick look at the source 
> code, and the reason for that weird yield usage was that there's a 
> locking bug in iperf's "Reporter thread" abstraction and apparently 
> instead of fixing the bug it was worked around via a horrible yield() 
> based user-space lock.
> 
> the (small) patch below fixes the iperf locking bug and removes the 
> yield() use. There are numerous immediate benefits of this patch:
...
> 
> sched_yield() is almost always the symptom of broken locking or other 
> bug. In that sense CFS does the right thing by exposing such bugs =B-)

...Only if it were under some DEBUG option. Even if iperf is doing
the wrong thing there is no explanation for such a big difference in
the behavior between sched_compat_yield 1 vs. 0. It seems common
interfaces should work similarly and predictably on various
systems, and here, if I didn't miss something, linux looks like a
different kind?

Regards,
Jarek P.

^ permalink raw reply	[flat|nested] 71+ messages in thread

* Re: Network slowdown due to CFS
  2007-09-27  9:30             ` Jarek Poplawski
@ 2007-09-27  9:46               ` Ingo Molnar
  2007-09-27 12:27                 ` Jarek Poplawski
  0 siblings, 1 reply; 71+ messages in thread
From: Ingo Molnar @ 2007-09-27  9:46 UTC (permalink / raw)
  To: Jarek Poplawski
  Cc: David Schwartz, Linux-Kernel@Vger. Kernel. Org, Mike Galbraith,
	Peter Zijlstra, Martin Michlmayr, Srivatsa Vaddagiri,
	Stephen Hemminger


* Jarek Poplawski <jarkao2@o2.pl> wrote:

> > the (small) patch below fixes the iperf locking bug and removes the 
> > yield() use. There are numerous immediate benefits of this patch:
> ...
> > 
> > sched_yield() is almost always the symptom of broken locking or other 
> > bug. In that sense CFS does the right thing by exposing such bugs =B-)
> 
> ...Only if it were under some DEBUG option. [...]

note that i qualified my sentence both via "In that sense" and via a 
smiley! So i was not suggesting that this is a general rule at all and i 
was also joking :-)

> [...] Even if iperf is doing the wrong thing there is no explanation 
> for such a big difference in the behavior between sched_compat_yield 1 
> vs. 0. It seems common interfaces should work similarly and 
> predictably on various systems, and here, if I didn't miss something, 
> linux looks like a different kind?

What you missed is that there is no such thing as "predictable yield 
behavior" for anything but SCHED_FIFO/RR tasks (for which tasks CFS does 
keep the behavior). Please read this thread on lkml for a more detailed 
background:

   CFS: some bad numbers with Java/database threading [FIXED]

   http://lkml.org/lkml/2007/9/19/357
   http://lkml.org/lkml/2007/9/19/328

in short: the yield implementation was tied to the O(1) scheduler, so 
the only way to have the exact same behavior would be to have the exact 
same core scheduler again. If what you said was true we would not be 
able to change the scheduler, ever. For something as vaguely defined of 
an API as yield, there's just no way to have a different core scheduler 
and still behave the same way.

So _generally_ i'd agree with you that normally we want to be bug for 
bug compatible, but in this specific (iperf) case there's just no point 
in preserving behavior that papers over this _clearly_ broken user-space 
app/thread locking (for which now two fixes exist already, plus a third 
fix is the twiddling of that sysctl).

	Ingo

^ permalink raw reply	[flat|nested] 71+ messages in thread

* Re: Network slowdown due to CFS
  2007-09-26 11:29       ` Martin Michlmayr
  2007-09-26 12:00         ` David Schwartz
@ 2007-09-27  9:49         ` Ingo Molnar
  2007-09-27 10:54           ` Martin Michlmayr
  1 sibling, 1 reply; 71+ messages in thread
From: Ingo Molnar @ 2007-09-27  9:49 UTC (permalink / raw)
  To: Martin Michlmayr
  Cc: Mike Galbraith, Srivatsa Vaddagiri, linux-kernel, Stephen Hemminger


* Martin Michlmayr <tbm@cyrius.com> wrote:

> > I think the real fix would be for iperf to use blocking network IO 
> > though, or maybe to use a POSIX mutex or POSIX semaphores.
> 
> So it's definitely not a bug in the kernel, only in iperf?
> 
> (CCing Stephen Hemminger who wrote the iperf patch.)

Martin, could you check the iperf patch below instead of the yield patch 
- does it solve the iperf performance problem equally well, and does CPU 
utilization drop for you too?

	Ingo

-------------------------->
Subject: iperf: fix locking
From: Ingo Molnar <mingo@elte.hu>

fix iperf locking - it was burning CPU time while polling
unnecessarily, instead of using the proper wait primitives.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 compat/Thread.c |    3 ---
 src/Reporter.c  |   13 +++++++++----
 src/main.cpp    |    2 ++
 3 files changed, 11 insertions(+), 7 deletions(-)

Index: iperf-2.0.2/compat/Thread.c
===================================================================
--- iperf-2.0.2.orig/compat/Thread.c
+++ iperf-2.0.2/compat/Thread.c
@@ -405,9 +405,6 @@ int thread_numuserthreads( void ) {
 void thread_rest ( void ) {
 #if defined( HAVE_THREAD )
 #if defined( HAVE_POSIX_THREAD )
-    // TODO add checks for sched_yield or pthread_yield and call that
-    // if available
-    usleep( 0 );
 #else // Win32
     SwitchToThread( );
 #endif
Index: iperf-2.0.2/src/Reporter.c
===================================================================
--- iperf-2.0.2.orig/src/Reporter.c
+++ iperf-2.0.2/src/Reporter.c
@@ -111,6 +111,7 @@ report_statistics multiple_reports[kRepo
 char buffer[64]; // Buffer for printing
 ReportHeader *ReportRoot = NULL;
 extern Condition ReportCond;
+extern Condition ReportDoneCond;
 int reporter_process_report ( ReportHeader *report );
 void process_report ( ReportHeader *report );
 int reporter_handle_packet( ReportHeader *report );
@@ -338,7 +339,7 @@ void ReportPacket( ReportHeader* agent, 
             // item
             while ( index == 0 ) {
                 Condition_Signal( &ReportCond );
-                thread_rest();
+                Condition_Wait( &ReportDoneCond );
                 index = agent->reporterindex;
             }
             agent->agentindex = 0;
@@ -346,7 +347,7 @@ void ReportPacket( ReportHeader* agent, 
         // Need to make sure that reporter is not about to be "lapped"
         while ( index - 1 == agent->agentindex ) {
             Condition_Signal( &ReportCond );
-            thread_rest();
+            Condition_Wait( &ReportDoneCond );
             index = agent->reporterindex;
         }
         
@@ -553,6 +554,7 @@ void reporter_spawn( thread_Settings *th
         }
         Condition_Unlock ( ReportCond );
 
+again:
         if ( ReportRoot != NULL ) {
             ReportHeader *temp = ReportRoot;
             //Condition_Unlock ( ReportCond );
@@ -575,9 +577,12 @@ void reporter_spawn( thread_Settings *th
                 // finished with report so free it
                 free( temp );
                 Condition_Unlock ( ReportCond );
+            	Condition_Signal( &ReportDoneCond );
+		if (ReportRoot)
+			goto again;
             }
-            // yield control of CPU is another thread is waiting
-            thread_rest();
+            Condition_Signal( &ReportDoneCond );
+            usleep(10000);
         } else {
             //Condition_Unlock ( ReportCond );
         }
Index: iperf-2.0.2/src/main.cpp
===================================================================
--- iperf-2.0.2.orig/src/main.cpp
+++ iperf-2.0.2/src/main.cpp
@@ -96,6 +96,7 @@ extern "C" {
     // records being accessed in a report and also to
     // serialize modification of the report list
     Condition ReportCond;
+    Condition ReportDoneCond;
 }
 
 // global variables only accessed within this file
@@ -141,6 +142,7 @@ int main( int argc, char **argv ) {
 
     // Initialize global mutexes and conditions
     Condition_Initialize ( &ReportCond );
+    Condition_Initialize ( &ReportDoneCond );
     Mutex_Initialize( &groupCond );
     Mutex_Initialize( &clients_mutex );
 


^ permalink raw reply	[flat|nested] 71+ messages in thread

* Re: Network slowdown due to CFS
  2007-09-27  9:49         ` Ingo Molnar
@ 2007-09-27 10:54           ` Martin Michlmayr
  2007-09-27 10:56             ` Ingo Molnar
  0 siblings, 1 reply; 71+ messages in thread
From: Martin Michlmayr @ 2007-09-27 10:54 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Mike Galbraith, Srivatsa Vaddagiri, linux-kernel, Stephen Hemminger

* Ingo Molnar <mingo@elte.hu> [2007-09-27 11:49]:
> Martin, could you check the iperf patch below instead of the yield
> patch - does it solve the iperf performance problem equally well,
> and does CPU utilization drop for you too?

Yes, it works and CPU goes down too.
-- 
Martin Michlmayr
http://www.cyrius.com/

^ permalink raw reply	[flat|nested] 71+ messages in thread

* Re: Network slowdown due to CFS
  2007-09-27 10:54           ` Martin Michlmayr
@ 2007-09-27 10:56             ` Ingo Molnar
  2007-09-27 11:12               ` Martin Michlmayr
  0 siblings, 1 reply; 71+ messages in thread
From: Ingo Molnar @ 2007-09-27 10:56 UTC (permalink / raw)
  To: Martin Michlmayr
  Cc: Mike Galbraith, Srivatsa Vaddagiri, linux-kernel, Stephen Hemminger


* Martin Michlmayr <tbm@cyrius.com> wrote:

> * Ingo Molnar <mingo@elte.hu> [2007-09-27 11:49]:
> > Martin, could you check the iperf patch below instead of the yield
> > patch - does it solve the iperf performance problem equally well,
> > and does CPU utilization drop for you too?
> 
> Yes, it works and CPU goes down too.

i'm curious by how much CPU usage goes down, and what's the output of 
iperf? (does it saturate the full 100mbit network bandwidth?)

	Ingo

^ permalink raw reply	[flat|nested] 71+ messages in thread

* Re: Network slowdown due to CFS
  2007-09-27 10:56             ` Ingo Molnar
@ 2007-09-27 11:12               ` Martin Michlmayr
  0 siblings, 0 replies; 71+ messages in thread
From: Martin Michlmayr @ 2007-09-27 11:12 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Mike Galbraith, Srivatsa Vaddagiri, linux-kernel, Stephen Hemminger

* Ingo Molnar <mingo@elte.hu> [2007-09-27 12:56]:
> i'm curious by how much CPU usage goes down, and what's the output of
> iperf? (does it saturate the full 100mbit network bandwidth?)

I get about 94-95 Mbits/sec and CPU drops from 99% to about 82% (this
is with a 600 MHz ARM CPU).
-- 
Martin Michlmayr
http://www.cyrius.com/

^ permalink raw reply	[flat|nested] 71+ messages in thread

* Re: Network slowdown due to CFS
  2007-09-27  9:46               ` Ingo Molnar
@ 2007-09-27 12:27                 ` Jarek Poplawski
  2007-09-27 13:31                   ` Ingo Molnar
  0 siblings, 1 reply; 71+ messages in thread
From: Jarek Poplawski @ 2007-09-27 12:27 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: David Schwartz, Linux-Kernel@Vger. Kernel. Org, Mike Galbraith,
	Peter Zijlstra, Martin Michlmayr, Srivatsa Vaddagiri,
	Stephen Hemminger

On Thu, Sep 27, 2007 at 11:46:03AM +0200, Ingo Molnar wrote:
> 
> * Jarek Poplawski <jarkao2@o2.pl> wrote:
> 
> > > the (small) patch below fixes the iperf locking bug and removes the 
> > > yield() use. There are numerous immediate benefits of this patch:
> > ...
> > > 
> > > sched_yield() is almost always the symptom of broken locking or other 
> > > bug. In that sense CFS does the right thing by exposing such bugs =B-)
> > 
> > ...Only if it were under some DEBUG option. [...]
> 
> note that i qualified my sentence both via "In that sense" and via a 
> smiley! So i was not suggesting that this is a general rule at all and i 
> was also joking :-)

Actually, I've analyzed this smiley for some time but these scheduler
jokes are really hard, and I definitely need more time...

> 
> > [...] Even if iperf is doing the wrong thing there is no explanation 
> > for such a big difference in the behavior between sched_compat_yield 1 
> > vs. 0. It seems common interfaces should work similarly and 
> > predictably on various systems, and here, if I didn't miss something, 
> > linux looks like a different kind?
> 
> What you missed is that there is no such thing as "predictable yield 
> behavior" for anything but SCHED_FIFO/RR tasks (for which tasks CFS does 
> keep the behavior). Please read this thread on lkml for a more detailed 
> background:
> 
>    CFS: some bad numbers with Java/database threading [FIXED]
> 
>    http://lkml.org/lkml/2007/9/19/357
>    http://lkml.org/lkml/2007/9/19/328
> 
> in short: the yield implementation was tied to the O(1) scheduler, so 
> the only way to have the exact same behavior would be to have the exact 
> same core scheduler again. If what you said was true we would not be 
> able to change the scheduler, ever. For something as vaguely defined of 
> an API as yield, there's just no way to have a different core scheduler 
> and still behave the same way.
> 
> So _generally_ i'd agree with you that normally we want to be bug for 
> bug compatible, but in this specific (iperf) case there's just no point 
> in preserving behavior that papers over this _clearly_ broken user-space 
> app/thread locking (for which now two fixes exist already, plus a third 
> fix is the twiddling of that sysctl).
> 

OK, but let's forget about fixing iperf. Probably I got this wrong,
but I've thought this "bad" iperf patch was tested on a few nixes and
linux was the most different one. The main point is: even if there is
no standard here, it should be a common interest to try to not differ
too much at least. So, it's not about exactness, but 50% (63 -> 95)
change in linux own 'definition' after upgrading seems to be a lot.
So, IMHO, maybe some 'compatibility' test could be prepared to
compare a few different ideas on this yield and some average value
could be a kind of at least linux' own standard, which should be
emulated within some limits by next kernels?

Thanks,
Jarek P.

^ permalink raw reply	[flat|nested] 71+ messages in thread

* Re: Network slowdown due to CFS
  2007-09-27 12:27                 ` Jarek Poplawski
@ 2007-09-27 13:31                   ` Ingo Molnar
  2007-09-27 14:42                     ` Jarek Poplawski
  0 siblings, 1 reply; 71+ messages in thread
From: Ingo Molnar @ 2007-09-27 13:31 UTC (permalink / raw)
  To: Jarek Poplawski
  Cc: David Schwartz, Linux-Kernel@Vger. Kernel. Org, Mike Galbraith,
	Peter Zijlstra, Martin Michlmayr, Srivatsa Vaddagiri,
	Stephen Hemminger


* Jarek Poplawski <jarkao2@o2.pl> wrote:

> On Thu, Sep 27, 2007 at 11:46:03AM +0200, Ingo Molnar wrote:
[...]
> > What you missed is that there is no such thing as "predictable yield 
> > behavior" for anything but SCHED_FIFO/RR tasks (for which tasks CFS does 
> > keep the behavior). Please read this thread on lkml for a more detailed 
> > background:
> > 
> >    CFS: some bad numbers with Java/database threading [FIXED]
> > 
> >    http://lkml.org/lkml/2007/9/19/357
> >    http://lkml.org/lkml/2007/9/19/328
> > 
> > in short: the yield implementation was tied to the O(1) scheduler, so 
> > the only way to have the exact same behavior would be to have the exact 
> > same core scheduler again. If what you said was true we would not be 
> > able to change the scheduler, ever. For something as vaguely defined of 
> > an API as yield, there's just no way to have a different core scheduler 
> > and still behave the same way.
> > 
> > So _generally_ i'd agree with you that normally we want to be bug for 
> > bug compatible, but in this specific (iperf) case there's just no point 
> > in preserving behavior that papers over this _clearly_ broken user-space 
> > app/thread locking (for which now two fixes exist already, plus a third 
> > fix is the twiddling of that sysctl).
> > 
> 
> OK, but let's forget about fixing iperf. Probably I got this wrong, 
> but I've thought this "bad" iperf patch was tested on a few nixes and 
> linux was the most different one. The main point is: even if there is 
> no standard here, it should be a common interest to try to not differ 
> too much at least. So, it's not about exactness, but 50% (63 -> 95) 
> change in linux own 'definition' after upgrading seems to be a lot. 
> So, IMHO, maybe some 'compatibility' test could be prepared to compare 
> a few different ideas on this yield and some average value could be a 
> kind of at least linux' own standard, which should be emulated within 
> some limits by next kernels?

you repeat your point of "emulating yield", and i can only repeat my 
point that you should please read this:

    http://lkml.org/lkml/2007/9/19/357

because, once you read that, i think you'll agree with me that what you 
say is simply not possible in a sane way at this stage. We went through 
a number of yield implementations already and each will change behavior 
for _some_ category of apps. So right now we offer two implementations, 
and the default was chosen empirically to minimize the amount of 
complaints. (but it's not possible to eliminate them altogether, for the 
reasons outlined above - hence the switch.)

	Ingo

^ permalink raw reply	[flat|nested] 71+ messages in thread

* Re: Network slowdown due to CFS
  2007-09-27 13:31                   ` Ingo Molnar
@ 2007-09-27 14:42                     ` Jarek Poplawski
  2007-09-28  6:10                       ` Nick Piggin
  0 siblings, 1 reply; 71+ messages in thread
From: Jarek Poplawski @ 2007-09-27 14:42 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: David Schwartz, Linux-Kernel@Vger. Kernel. Org, Mike Galbraith,
	Peter Zijlstra, Martin Michlmayr, Srivatsa Vaddagiri,
	Stephen Hemminger

On Thu, Sep 27, 2007 at 03:31:23PM +0200, Ingo Molnar wrote:
> 
> * Jarek Poplawski <jarkao2@o2.pl> wrote:
...
> > OK, but let's forget about fixing iperf. Probably I got this wrong, 
> > but I've thought this "bad" iperf patch was tested on a few nixes and 
> > linux was the most different one. The main point is: even if there is 
> > no standard here, it should be a common interest to try to not differ 
> > too much at least. So, it's not about exactness, but 50% (63 -> 95) 
> > change in linux own 'definition' after upgrading seems to be a lot. 
> > So, IMHO, maybe some 'compatibility' test could be prepared to compare 
> > a few different ideas on this yield and some average value could be a 
> > kind of at least linux' own standard, which should be emulated within 
> > some limits by next kernels?
> 
> you repeat your point of "emulating yield", and i can only repeat my 
> point that you should please read this:
> 
>     http://lkml.org/lkml/2007/9/19/357
> 
> because, once you read that, i think you'll agree with me that what you 
> say is simply not possible in a sane way at this stage. We went through 
> a number of yield implementations already and each will change behavior 
> for _some_ category of apps. So right now we offer two implementations, 
> and the default was chosen empirically to minimize the amount of 
> complaints. (but it's not possible to eliminate them altogether, for the 
> reasons outlined above - hence the switch.)

Sorry, but I think you got me wrong: I didn't mean emulation of any
implementation, but probably the same thing you write above: emulation
of time/performance. In my opinion this should be done experimentally
too, but with something more objective and constant than current
"complaints counter". And the first thing could be a try to set some
kind of linux internal "standard of yield" for the future by averaging
a few most popular systems in a test doing things like this iperf or
preferably more.

Jarek P.

^ permalink raw reply	[flat|nested] 71+ messages in thread

* Re: Network slowdown due to CFS
  2007-09-27 14:42                     ` Jarek Poplawski
@ 2007-09-28  6:10                       ` Nick Piggin
  2007-10-01  8:43                         ` Jarek Poplawski
  0 siblings, 1 reply; 71+ messages in thread
From: Nick Piggin @ 2007-09-28  6:10 UTC (permalink / raw)
  To: Jarek Poplawski
  Cc: Ingo Molnar, David Schwartz, linux-kernel, Mike Galbraith,
	Peter Zijlstra, Martin Michlmayr, Srivatsa Vaddagiri,
	Stephen Hemminger

On Friday 28 September 2007 00:42, Jarek Poplawski wrote:
> On Thu, Sep 27, 2007 at 03:31:23PM +0200, Ingo Molnar wrote:
> > * Jarek Poplawski <jarkao2@o2.pl> wrote:
>
> ...
>
> > > OK, but let's forget about fixing iperf. Probably I got this wrong,
> > > but I've thought this "bad" iperf patch was tested on a few nixes and
> > > linux was the most different one. The main point is: even if there is
> > > no standard here, it should be a common interest to try to not differ
> > > too much at least. So, it's not about exactness, but 50% (63 -> 95)
> > > change in linux own 'definition' after upgrading seems to be a lot.
> > > So, IMHO, maybe some 'compatibility' test could be prepared to compare
> > > a few different ideas on this yield and some average value could be a
> > > kind of at least linux' own standard, which should be emulated within
> > > some limits by next kernels?
> >
> > you repeat your point of "emulating yield", and i can only repeat my
> > point that you should please read this:
> >
> >     http://lkml.org/lkml/2007/9/19/357
> >
> > because, once you read that, i think you'll agree with me that what you
> > say is simply not possible in a sane way at this stage. We went through
> > a number of yield implementations already and each will change behavior
> > for _some_ category of apps. So right now we offer two implementations,
> > and the default was chosen empirically to minimize the amount of
> > complaints. (but it's not possible to eliminate them altogether, for the
> > reasons outlined above - hence the switch.)
>
> Sorry, but I think you got me wrong: I didn't mean emulation of any
> implementation, but probably the same thing you write above: emulation
> of time/performance. In my opinion this should be done experimentally
> too, but with something more objective and constant than current
> "complaints counter". And the first thing could be a try to set some
> kind of linux internal "standard of yield" for the future by averaging
> a few most popular systems in a test doing things like this iperf or
> preferably more.

By definition, yield is essentially undefined as to the behaviour between
SCHED_OTHER tasks at the same priority level (ie. all of them), because
SCHED_OTHER scheduling behaviour itself is undefined.

It's never going to do exactly what everybody wants, except those using
it for legitimate reasons in realtime applications.

^ permalink raw reply	[flat|nested] 71+ messages in thread

* Re: Network slowdown due to CFS
  2007-09-28  6:10                       ` Nick Piggin
@ 2007-10-01  8:43                         ` Jarek Poplawski
  2007-10-01 16:25                           ` Ingo Molnar
  2007-10-02  9:26                           ` Jarek Poplawski
  0 siblings, 2 replies; 71+ messages in thread
From: Jarek Poplawski @ 2007-10-01  8:43 UTC (permalink / raw)
  To: Nick Piggin
  Cc: Ingo Molnar, David Schwartz, linux-kernel, Mike Galbraith,
	Peter Zijlstra, Martin Michlmayr, Srivatsa Vaddagiri,
	Stephen Hemminger

On Fri, Sep 28, 2007 at 04:10:00PM +1000, Nick Piggin wrote:
> On Friday 28 September 2007 00:42, Jarek Poplawski wrote:
> > On Thu, Sep 27, 2007 at 03:31:23PM +0200, Ingo Molnar wrote:
> > > * Jarek Poplawski <jarkao2@o2.pl> wrote:
> >
> > ...
> >
> > > > OK, but let's forget about fixing iperf. Probably I got this wrong,
> > > > but I've thought this "bad" iperf patch was tested on a few nixes and
> > > > linux was the most different one. The main point is: even if there is
> > > > no standard here, it should be a common interest to try to not differ
> > > > too much at least. So, it's not about exactness, but 50% (63 -> 95)
> > > > change in linux own 'definition' after upgrading seems to be a lot.
> > > > So, IMHO, maybe some 'compatibility' test could be prepared to compare
> > > > a few different ideas on this yield and some average value could be a
> > > > kind of at least linux' own standard, which should be emulated within
> > > > some limits by next kernels?
> > >
> > > you repeat your point of "emulating yield", and i can only repeat my
> > > point that you should please read this:
> > >
> > >     http://lkml.org/lkml/2007/9/19/357
> > >
> > > because, once you read that, i think you'll agree with me that what you
> > > say is simply not possible in a sane way at this stage. We went through
> > > a number of yield implementations already and each will change behavior
> > > for _some_ category of apps. So right now we offer two implementations,
> > > and the default was chosen empirically to minimize the amount of
> > > complaints. (but it's not possible to eliminate them altogether, for the
> > > reasons outlined above - hence the switch.)
> >
> > Sorry, but I think you got me wrong: I didn't mean emulation of any
> > implementation, but probably the same thing you write above: emulation
> > of time/performance. In my opinion this should be done experimentally
> > too, but with something more objective and constant than current
> > "complaints counter". And the first thing could be a try to set some
> > kind of linux internal "standard of yield" for the future by averaging
> > a few most popular systems in a test doing things like this iperf or
> > preferably more.
> 
> By definition, yield is essentially undefined as to the behaviour between
> SCHED_OTHER tasks at the same priority level (ie. all of them), because
> SCHED_OTHER scheduling behaviour itself is undefined.
> 
> It's never going to do exactly what everybody wants, except those using
> it for legitimate reasons in realtime applications.
> 

That's why I've used words like: "not differ too much" and "within
some limits" above. So, it's only about being reasonable, compared
to our previous versions, and to others, if possible.

It should not be impossible to additionally control (delay or
accelerate) yielding tasks wrt. the current load/weight/number_of_tasks
etc., if we know (after testing) e.g. the average expedition time of
such tasks with various schedulers. Of course, such tests and
controlling parameters can change for some time until the problem is
explored enough, and there is still no aim for exactness or to please
everybody.

BTW, it looks like risky to criticise sched_yield too much: some
people can misinterpret such discussions and stop using this at
all, even where it's right.

Regards,
Jarek P.

^ permalink raw reply	[flat|nested] 71+ messages in thread

* Re: Network slowdown due to CFS
  2007-10-01  8:43                         ` Jarek Poplawski
@ 2007-10-01 16:25                           ` Ingo Molnar
  2007-10-01 16:49                             ` David Schwartz
                                               ` (2 more replies)
  2007-10-02  9:26                           ` Jarek Poplawski
  1 sibling, 3 replies; 71+ messages in thread
From: Ingo Molnar @ 2007-10-01 16:25 UTC (permalink / raw)
  To: Jarek Poplawski
  Cc: Nick Piggin, David Schwartz, linux-kernel, Mike Galbraith,
	Peter Zijlstra, Martin Michlmayr, Srivatsa Vaddagiri,
	Stephen Hemminger


* Jarek Poplawski <jarkao2@o2.pl> wrote:

> BTW, it looks like risky to criticise sched_yield too much: some 
> people can misinterpret such discussions and stop using this at all, 
> even where it's right.

Really, i have never seen a _single_ mainstream app where the use of 
sched_yield() was the right choice.

Fortunately, the sched_yield() API is already one of the most rarely 
used scheduler functionalities, so it does not really matter. [ In my 
experience a Linux scheduler is stabilizing pretty well when the 
discussion shifts to yield behavior, because that shows that everything 
else is pretty much fine ;-) ]

But, because you assert that it's risky to "criticise sched_yield() 
too much", you sure must know at least one real example where it's right 
to use it (and cite the line and code where it's used, with 
specificity)?

	Ingo

^ permalink raw reply	[flat|nested] 71+ messages in thread

* RE: Network slowdown due to CFS
  2007-10-01 16:25                           ` Ingo Molnar
@ 2007-10-01 16:49                             ` David Schwartz
  2007-10-01 17:31                               ` Ingo Molnar
                                                 ` (3 more replies)
  2007-10-01 16:55                             ` Chris Friesen
  2007-10-02  9:03                             ` Network slowdown due to CFS Jarek Poplawski
  2 siblings, 4 replies; 71+ messages in thread
From: David Schwartz @ 2007-10-01 16:49 UTC (permalink / raw)
  To: Ingo Molnar; +Cc: linux-kernel


> * Jarek Poplawski <jarkao2@o2.pl> wrote:
>
> > BTW, it looks like risky to criticise sched_yield too much: some
> > people can misinterpret such discussions and stop using this at all,
> > even where it's right.

> Really, i have never seen a _single_ mainstream app where the use of
> sched_yield() was the right choice.

It can occasionally be an optimization. You may have a case where you can do
something very efficiently if a lock is not held, but you cannot afford to
wait for the lock to be released. So you check the lock, if it's held, you
yield and then check again. If that fails, you do it the less optimal way
(for example, dispatching it to a thread that *can* afford to wait).

It is also sometimes used in the implementation of spinlock-type primitives.
After spinning fails, yielding is tried.
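
For what it's worth, here is a minimal sketch of the two patterns
described above (trylock, spin, yield, then fall back), assuming a
plain pthread mutex; try_fast_path() and SPIN_COUNT are invented names
and the constants are arbitrary:

#include <pthread.h>
#include <sched.h>

#define SPIN_COUNT 100          /* arbitrary; tune for the workload */

/* Returns 1 if the lock was taken on the fast path, 0 if the caller
 * should fall back to the "less optimal" way (e.g. hand the work to a
 * thread that can afford to block on the mutex). */
static int try_fast_path( pthread_mutex_t *m )
{
    int i;

    for ( i = 0; i < SPIN_COUNT; i++ )
        if ( pthread_mutex_trylock( m ) == 0 )
            return 1;

    sched_yield();              /* give the lock holder a chance to run */

    return pthread_mutex_trylock( m ) == 0;
}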

I think it's also sometimes appropriate when a thread may monopolize a
mutex. For example, consider a rarely-run task that cleans up some expensive
structures. It may need to hold locks that are only held during this complex
clean up.

One example I know of is a defragmenter for a multi-threaded memory
allocator, and it has to lock whole pools. When it releases these locks, it
calls yield before re-acquiring them to go back to work. The idea is to "go
to the back of the line" if any threads are blocking on those mutexes.
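
A stripped-down sketch of that defragmenter pattern, assuming
pthreads; pool_lock and defrag_one_step() are hypothetical stand-ins
for the allocator's real pool lock and work function:

#include <pthread.h>
#include <sched.h>

extern pthread_mutex_t pool_lock;       /* hypothetical per-pool lock */
extern int defrag_one_step( void );     /* hypothetical; returns 0 when done */

void defrag_pool( void )
{
    pthread_mutex_lock( &pool_lock );
    while ( defrag_one_step() ) {
        /* drop the lock and "go to the back of the line" so that any
         * allocating threads blocked on pool_lock can run first */
        pthread_mutex_unlock( &pool_lock );
        sched_yield();
        pthread_mutex_lock( &pool_lock );
    }
    pthread_mutex_unlock( &pool_lock );
}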

There are certainly other ways to do these things, but I have seen cases
where, IMO, yielding was the best solution. Doing nothing would have been
okay too.

> Fortunately, the sched_yield() API is already one of the most rarely
> used scheduler functionalities, so it does not really matter. [ In my
> experience a Linux scheduler is stabilizing pretty well when the
> discussion shifts to yield behavior, because that shows that everything
> else is pretty much fine ;-) ]

Can you explain what the current sched_yield behavior *is* for CFS and what
the tunable does to change it?

The desired behavior is for the current thread to not be rescheduled until
every thread at the same static priority as this thread has had a chance to
be scheduled.

Of course, it's not clear exactly what a "chance" is.

The semantics with respect to threads at other static priority levels is not
clear. Ditto for SMP issues. It's also not clear whether threads that yield
should be rewarded or punished for doing so.

DS



^ permalink raw reply	[flat|nested] 71+ messages in thread

* Re: Network slowdown due to CFS
  2007-10-01 16:25                           ` Ingo Molnar
  2007-10-01 16:49                             ` David Schwartz
@ 2007-10-01 16:55                             ` Chris Friesen
  2007-10-01 17:09                               ` Ingo Molnar
  2007-10-02  9:03                             ` Network slowdown due to CFS Jarek Poplawski
  2 siblings, 1 reply; 71+ messages in thread
From: Chris Friesen @ 2007-10-01 16:55 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Jarek Poplawski, Nick Piggin, David Schwartz, linux-kernel,
	Mike Galbraith, Peter Zijlstra, Martin Michlmayr,
	Srivatsa Vaddagiri, Stephen Hemminger

Ingo Molnar wrote:

> But, because you assert that it's risky to "criticise sched_yield() 
> too much", you sure must know at least one real example where it's right 
> to use it (and cite the line and code where it's used, with 
> specificity)?

It's fine to criticise sched_yield().  I agree that new apps should 
generally be written to use proper completion mechanisms or to wait for 
specific events.

However, there are closed-source and/or frozen-source apps where it's 
not practical to rewrite or rebuild the app.  Does it make sense to 
break the behaviour of all of these?

Chris

^ permalink raw reply	[flat|nested] 71+ messages in thread

* Re: Network slowdown due to CFS
  2007-10-01 16:55                             ` Chris Friesen
@ 2007-10-01 17:09                               ` Ingo Molnar
  2007-10-01 17:45                                 ` Chris Friesen
  0 siblings, 1 reply; 71+ messages in thread
From: Ingo Molnar @ 2007-10-01 17:09 UTC (permalink / raw)
  To: Chris Friesen
  Cc: Jarek Poplawski, Nick Piggin, David Schwartz, linux-kernel,
	Mike Galbraith, Peter Zijlstra, Martin Michlmayr,
	Srivatsa Vaddagiri, Stephen Hemminger


* Chris Friesen <cfriesen@nortel.com> wrote:

> Ingo Molnar wrote:
> 
> >But, because you assert that it's risky to "criticise sched_yield() 
> >too much", you sure must know at least one real example where it's right 
> >to use it (and cite the line and code where it's used, with 
> >specificity)?
> 
> It's fine to criticise sched_yield().  I agree that new apps should 
> generally be written to use proper completion mechanisms or to wait 
> for specific events.

yes.

> However, there are closed-source and/or frozen-source apps where it's 
> not practical to rewrite or rebuild the app.  Does it make sense to 
> break the behaviour of all of these?

See the background and answers to that in:

   http://lkml.org/lkml/2007/9/19/357
   http://lkml.org/lkml/2007/9/19/328

there's plenty of recourse possible to all possible kinds of apps. Tune 
the sysctl flag in one direction or another, depending on which behavior 
the app is expecting.

	Ingo

^ permalink raw reply	[flat|nested] 71+ messages in thread

* Re: Network slowdown due to CFS
  2007-10-01 16:49                             ` David Schwartz
@ 2007-10-01 17:31                               ` Ingo Molnar
  2007-10-01 18:23                                 ` David Schwartz
  2007-10-01 19:53                               ` Network slowdown due to CFS Arjan van de Ven
                                                 ` (2 subsequent siblings)
  3 siblings, 1 reply; 71+ messages in thread
From: Ingo Molnar @ 2007-10-01 17:31 UTC (permalink / raw)
  To: David Schwartz; +Cc: linux-kernel


* David Schwartz <davids@webmaster.com> wrote:

> > > BTW, it looks like risky to criticise sched_yield too much: some 
> > > people can misinterpret such discussions and stop using this at 
> > > all, even where it's right.
> 
> > Really, i have never seen a _single_ mainstream app where the use of 
> > sched_yield() was the right choice.
> 
> It can occasionally be an optimization. You may have a case where you 
> can do something very efficiently if a lock is not held, but you 
> cannot afford to wait for the lock to be released. So you check the 
> lock, if it's held, you yield and then check again. If that fails, you 
> do it the less optimal way (for example, dispatching it to a thread 
> that *can* afford to wait).

These are generic statements, but i'm _really_ interested in the 
specifics. Real, specific code that i can look at. The typical Linux 
distro consists of in excess of 500 million lines of code, in tens 
of thousands of apps, so there really must be some good, valid and 
"right" use of sched_yield() somewhere in there, in some mainstream app, 
right? (because, as you might have guessed it, in the past decade of 
sched_yield() existence i _have_ seen my share of sched_yield() 
utilizing user-space code, and at the moment i'm not really impressed by 
those examples.)
 
Preferably that example should show that the best quality user-space 
lock implementation in a given scenario is best done via sched_yield(). 
Actual code and numbers. (And this isn't _that_ hard. I'm not asking for 
a full RDBMS implementation that must run through SQL99 spec suite. This 
is about a simple locking primitive, or a simple pointer to an existing 
codebase.)

> It is also sometimes used in the implementation of spinlock-type 
> primitives. After spinning fails, yielding is tried.

(user-space spinlocks are broken beyond words for anything but perhaps 
SCHED_FIFO tasks.)

> One example I know of is a defragmenter for a multi-threaded memory 
> allocator, and it has to lock whole pools. When it releases these 
> locks, it calls yield before re-acquiring them to go back to work. The 
> idea is to "go to the back of the line" if any threads are blocking on 
> those mutexes.

at a quick glance this seems broken too - but if you show the specific 
code i might be able to point out the breakage in detail. (One 
underlying problem here appears to be fairness: a quick unlock/lock 
sequence may starve out other threads. yield won't solve that fundamental 
problem either, and it will introduce random latencies into apps using 
this memory allocator.)

> > Fortunately, the sched_yield() API is already one of the most rarely
> > used scheduler functionalities, so it does not really matter. [ In my
> > experience a Linux scheduler is stabilizing pretty well when the
> > discussion shifts to yield behavior, because that shows that everything
> > else is pretty much fine ;-) ]
> 
> Can you explain what the current sched_yield behavior *is* for CFS and 
> what the tunable does to change it?

sure. (and i described that flag on lkml before) The sched_yield flag 
does two things:

 - if 0 ("opportunistic mode"), then the task will reschedule to any
   other task that is in "bigger need for CPU time" than the currently 
   running task, as indicated by CFS's ->wait_runtime metric. (or as 
   indicated by the similar ->vruntime metric in sched-devel.git)

 - if 1 ("aggressive mode"), then the task will be one-time requeued to 
   the right end of the CFS rbtree. This means that for one instance, 
   all other tasks will run before this task will run again - after that 
   this task's natural ordering within the rbtree is restored.
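
For completeness, flipping that flag from a program is just a write to
the sysctl file, assuming it is exported as
/proc/sys/kernel/sched_compat_yield as in the CFS kernels of this
period (root is normally required):

#include <stdio.h>

/* 0 = "opportunistic" yield, 1 = one-time requeue to the end of the
 * rbtree, as described above. */
static int set_compat_yield( int on )
{
    FILE *f = fopen( "/proc/sys/kernel/sched_compat_yield", "w" );
    if ( f == NULL )
        return -1;
    fprintf( f, "%d\n", on ? 1 : 0 );
    return fclose( f );
}

(The same effect can be had from a root shell by echoing 0 or 1 into
that file.)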

> The desired behavior is for the current thread to not be rescheduled 
> until every thread at the same static priority as this thread has had 
> a chance to be scheduled.

do you realize that this "desired behavior" you just described is not 
achieved by the old scheduler, and that this random behavior _is_ the 
main problem here? If yield was well-specified then we could implement 
it in a well-specified way - even if the API was poor.

But fact is that it is _not_ well-specified, and apps grew upon a random 
scheduler implementation details in random ways. (in the lkml discussion 
about this topic, Linus offered a pretty sane theoretical definition for 
yield but it's not simple to implement [and no scheduler implements it 
at the moment] - nor will it map to the old scheduler's yield behavior 
so we'll end up breaking more apps.)

	Ingo

^ permalink raw reply	[flat|nested] 71+ messages in thread

* Re: Network slowdown due to CFS
  2007-10-01 17:09                               ` Ingo Molnar
@ 2007-10-01 17:45                                 ` Chris Friesen
  2007-10-01 19:09                                   ` iperf yield usage Ingo Molnar
  0 siblings, 1 reply; 71+ messages in thread
From: Chris Friesen @ 2007-10-01 17:45 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Jarek Poplawski, Nick Piggin, David Schwartz, linux-kernel,
	Mike Galbraith, Peter Zijlstra, Martin Michlmayr,
	Srivatsa Vaddagiri, Stephen Hemminger

Ingo Molnar wrote:
> * Chris Friesen <cfriesen@nortel.com> wrote:

>>However, there are closed-source and/or frozen-source apps where it's 
>>not practical to rewrite or rebuild the app.  Does it make sense to 
>>break the behaviour of all of these?
> 
> 
> See the background and answers to that in:
> 
>    http://lkml.org/lkml/2007/9/19/357
>    http://lkml.org/lkml/2007/9/19/328
> 
> there's plenty of recourse possible to all possible kinds of apps. Tune 
> the sysctl flag in one direction or another, depending on which behavior 
> the app is expecting.

Yeah, I read those threads.

It seems like the fundamental source of the disconnect is that the tasks 
used to be sorted by priority (thus making it easy to bump a yielding 
task to the end of that priority level) while now they're organized by 
time (making it harder to do anything priority-based).  Do I have that 
right?

Chris

^ permalink raw reply	[flat|nested] 71+ messages in thread

* RE: Network slowdown due to CFS
  2007-10-01 17:31                               ` Ingo Molnar
@ 2007-10-01 18:23                                 ` David Schwartz
  2007-10-02  6:06                                   ` Ingo Molnar
                                                     ` (3 more replies)
  0 siblings, 4 replies; 71+ messages in thread
From: David Schwartz @ 2007-10-01 18:23 UTC (permalink / raw)
  To: mingo; +Cc: linux-kernel


> These are generic statements, but i'm _really_ interested in the
> specifics. Real, specific code that i can look at. The typical Linux
> distro consists of in excess of 500 million lines of code, in tens
> of thousands of apps, so there really must be some good, valid and
> "right" use of sched_yield() somewhere in there, in some mainstream app,
> right? (because, as you might have guessed it, in the past decade of
> sched_yield() existence i _have_ seen my share of sched_yield()
> utilizing user-space code, and at the moment i'm not really impressed by
> those examples.)

Maybe, maybe not. Even if so, it would be very difficult to find. Simply
grepping for sched_yield is not going to help because determining whether a
given use of sched_yield is smart is not going to be easy.

> (user-space spinlocks are broken beyond words for anything but perhaps
> SCHED_FIFO tasks.)

User-space spinlocks are broken so spinlocks can only be implemented in
kernel-space? Even if you use the kernel to schedule/unschedule the tasks,
you still have to spin in user-space.

> > One example I know of is a defragmenter for a multi-threaded memory
> > allocator, and it has to lock whole pools. When it releases these
> > locks, it calls yield before re-acquiring them to go back to work. The
> > idea is to "go to the back of the line" if any threads are blocking on
> > those mutexes.

> at a quick glance this seems broken too - but if you show the specific
> code i might be able to point out the breakage in detail. (One
> underlying problem here appears to be fairness: a quick unlock/lock
> sequence may starve out other threads. yield won't solve that fundamental
> problem either, and it will introduce random latencies into apps using
> this memory allocator.)

You are assuming that random latencies are necessarily bad. Random latencies
may be significantly better than predictable high latency.


> > Can you explain what the current sched_yield behavior *is* for CFS and
> > what the tunable does to change it?

> sure. (and i described that flag on lkml before) The sched_yield flag
> does two things:

>  - if 0 ("opportunistic mode"), then the task will reschedule to any
>    other task that is in "bigger need for CPU time" than the currently
>    running task, as indicated by CFS's ->wait_runtime metric. (or as
>    indicated by the similar ->vruntime metric in sched-devel.git)
>
>  - if 1 ("aggressive mode"), then the task will be one-time requeued to
>    the right end of the CFS rbtree. This means that for one instance,
>    all other tasks will run before this task will run again - after that
>    this task's natural ordering within the rbtree is restored.

Thank you. Unfortunately, neither of these does what sched_yield is really
supposed to do. Opportunistic mode does too little and aggressive mode does
too much.

> > The desired behavior is for the current thread to not be rescheduled
> > until every thread at the same static priority as this thread has had
> > a chance to be scheduled.

> do you realize that this "desired behavior" you just described is not
> achieved by the old scheduler, and that this random behavior _is_ the
> main problem here? If yield was well-specified then we could implement
> it in a well-specified way - even if the API was poor.

> But fact is that it is _not_ well-specified, and apps grew upon a random
> scheduler implementation details in random ways. (in the lkml discussion
> about this topic, Linus offered a pretty sane theoretical definition for
> yield but it's not simple to implement [and no scheduler implements it
> at the moment] - nor will it map to the old scheduler's yield behavior
> so we'll end up breaking more apps.)

I don't have a problem with failing to emulate the old scheduler's behavior
if we can show that the new behavior has saner semantics. Unfortunately, in
this case, I think CFS' semantics are pretty bad. Neither of these is what
sched_yield is supposed to do.

Note that I'm not saying this is a particularly big deal. And I'm not
calling CFS' behavior a regression, since it's not really better or worse
than the old behavior, simply different.

I'm not familiar enough with CFS' internals to help much on the
implementation, but there may be some simple compromise yield that might
work well enough. How about simply acting as if the task used up its
timeslice and scheduling the next one? (Possibly with a slight reduction in
penalty or reward for not really using all the time, if possible?)

DS



^ permalink raw reply	[flat|nested] 71+ messages in thread

* Re: iperf yield usage
  2007-10-01 17:45                                 ` Chris Friesen
@ 2007-10-01 19:09                                   ` Ingo Molnar
  0 siblings, 0 replies; 71+ messages in thread
From: Ingo Molnar @ 2007-10-01 19:09 UTC (permalink / raw)
  To: Chris Friesen
  Cc: Jarek Poplawski, Nick Piggin, David Schwartz, linux-kernel,
	Mike Galbraith, Peter Zijlstra, Martin Michlmayr,
	Srivatsa Vaddagiri, Stephen Hemminger


* Chris Friesen <cfriesen@nortel.com> wrote:

> >See the background and answers to that in:
> >
> >   http://lkml.org/lkml/2007/9/19/357
> >   http://lkml.org/lkml/2007/9/19/328
> >
> >there's plenty of recourse possible to all possible kinds of apps. 
> >Tune the sysctl flag in one direction or another, depending on which 
> >behavior the app is expecting.
> 
> Yeah, I read those threads.
> 
> It seems like the fundamental source of the disconnect is that the 
> tasks used to be sorted by priority (thus making it easy to bump a 
> yielding task to the end of that priority level) while now they're 
> organized by time (making it harder to do anything priority-based).  
> Do I have that right?

not really - the old yield implementation in essence gave the task a 
time hit too, because we rotated through tasks based on timeslices. But 
the old one requeued yield-ing tasks to the 'active array', and the 
decision whether a task is in the active or in the expired array was a 
totally stochastic, load-dependent thing. As a result, certain tasks,
under certain workloads saw a "stronger" yield, other tasks saw a 
"weaker" yield. (The reason for that implementation was simple: yield 
was (and is) unimportant and it was implemented in the most 
straightforward way that caused no overhead anywhere else in the 
scheduler.)

( and to keep perspective it's also important to correct the subject
  line here: it's not about "network slowdown" - nothing in networking
  slowed down in any way - it was that iperf used yield in a horrible
  way. I changed the subject line to reflect that. )

	Ingo

^ permalink raw reply	[flat|nested] 71+ messages in thread

* Re: Network slowdown due to CFS
  2007-10-01 16:49                             ` David Schwartz
  2007-10-01 17:31                               ` Ingo Molnar
@ 2007-10-01 19:53                               ` Arjan van de Ven
  2007-10-01 22:17                                 ` David Schwartz
  2007-10-03 11:31                               ` Helge Hafting
  2007-10-04  0:31                               ` Rusty Russell
  3 siblings, 1 reply; 71+ messages in thread
From: Arjan van de Ven @ 2007-10-01 19:53 UTC (permalink / raw)
  To: davids; +Cc: Ingo Molnar, linux-kernel

On Mon, 1 Oct 2007 09:49:35 -0700
"David Schwartz" <davids@webmaster.com> wrote:

> 
> > * Jarek Poplawski <jarkao2@o2.pl> wrote:
> >
> > > BTW, it looks risky to criticise sched_yield too much: some
> > > people can misinterpret such discussions and stop using this at
> > > all, even where it's right.
> 
> > Really, i have never seen a _single_ mainstream app where the use of
> > sched_yield() was the right choice.
> 
> It can occasionally be an optimization. You may have a case where you
> can do something very efficiently if a lock is not held, but you
> cannot afford to wait for the lock to be released. So you check the
> lock, if it's held, you yield and then check again. If that fails,
> you do it the less optimal way (for example, dispatching it to a
> thread that *can* afford to wait).


at this point it's "use a futex" instead; once you're doing system
calls you might as well use the right one for what you're trying to
achieve.

^ permalink raw reply	[flat|nested] 71+ messages in thread

* RE: Network slowdown due to CFS
  2007-10-01 19:53                               ` Network slowdown due to CFS Arjan van de Ven
@ 2007-10-01 22:17                                 ` David Schwartz
  2007-10-01 22:35                                   ` Arjan van de Ven
  0 siblings, 1 reply; 71+ messages in thread
From: David Schwartz @ 2007-10-01 22:17 UTC (permalink / raw)
  To: Arjan van de Ven; +Cc: Ingo Molnar, linux-kernel


Arjan van de Ven wrote:

> > It can occasionally be an optimization. You may have a case where you
> > can do something very efficiently if a lock is not held, but you
> > cannot afford to wait for the lock to be released. So you check the
> > lock, if it's held, you yield and then check again. If that fails,
> > you do it the less optimal way (for example, dispatching it to a
> > thread that *can* afford to wait).

> at this point it's "use a futex" instead; once you're doing system
> calls you might as well use the right one for what you're trying to
> achieve.

There are two answers to this. One is that you sometimes are writing POSIX
code and Linux-specific optimizations don't change the fact that you still
need a portable implementation.

The other answer is that futexes don't change anything in this case. In
fact, in the last time I hit this, the lock was a futex on Linux.
Nevertheless, that doesn't change the basic issue. The lock is locked, you
cannot afford to wait for it, but not getting the lock is expensive. The
solution is to yield and check the lock again. If it's still held, you
dispatch to another thread, but many times, yielding can avoid that.

A futex doesn't change the fact that sometimes you can't afford to block on
a lock but nevertheless would save significant effort if you were able to
acquire it. Odds are the thread that holds it is about to release it anyway.

That is, you need something in-between "non-blocking trylock, fail easily"
and "blocking lock, do not fail", but you'd rather make forward progress
without the lock than actually block/sleep.
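
A minimal sketch of that pattern - hypothetical pool_lock and helper names,
pthreads assumed - would look something like:

#include <pthread.h>
#include <sched.h>

/* hypothetical pool lock and helpers, only to illustrate the pattern */
extern pthread_mutex_t pool_lock;
extern void do_work_locked(void);              /* fast path, needs the lock       */
extern void hand_off_to_blocking_thread(void); /* slow path, can afford to wait   */

static void do_work(void)
{
	if (pthread_mutex_trylock(&pool_lock) != 0) {
		/* lock is held: give the holder a chance to release it,
		 * then try once more before giving up on the fast path */
		sched_yield();
		if (pthread_mutex_trylock(&pool_lock) != 0) {
			hand_off_to_blocking_thread();
			return;
		}
	}
	do_work_locked();
	pthread_mutex_unlock(&pool_lock);
}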

DS



^ permalink raw reply	[flat|nested] 71+ messages in thread

* Re: Network slowdown due to CFS
  2007-10-01 22:17                                 ` David Schwartz
@ 2007-10-01 22:35                                   ` Arjan van de Ven
  2007-10-01 22:44                                     ` David Schwartz
  0 siblings, 1 reply; 71+ messages in thread
From: Arjan van de Ven @ 2007-10-01 22:35 UTC (permalink / raw)
  To: davids; +Cc: Ingo Molnar, linux-kernel

On Mon, 1 Oct 2007 15:17:52 -0700
"David Schwartz" <davids@webmaster.com> wrote:

> 
> Arjan van de Ven wrote:
> 
> > > It can occasionally be an optimization. You may have a case where
> > > you can do something very efficiently if a lock is not held, but
> > > you cannot afford to wait for the lock to be released. So you
> > > check the lock, if it's held, you yield and then check again. If
> > > that fails, you do it the less optimal way (for example,
> > > dispatching it to a thread that *can* afford to wait).
> 
> > at this point it's "use a futex" instead; once you're doing system
> > calls you might as well use the right one for what you're trying to
> > achieve.
> 
> There are two answers to this. One is that you sometimes are writing
> POSIX code and Linux-specific optimizations don't change the fact
> that you still need a portable implementation.
> 
> The other answer is that futexes don't change anything in this case.
> In fact, in the last time I hit this, the lock was a futex on Linux.
> Nevertheless, that doesn't change the basic issue. The lock is
> locked, you cannot afford to wait for it, but not getting the lock is
> expensive. The solution is to yield and check the lock again. If it's
> still held, you dispatch to another thread, but many times, yielding
> can avoid that.

yielding IS blocking. Just with indeterminate fuzziness added to it....

^ permalink raw reply	[flat|nested] 71+ messages in thread

* RE: Network slowdown due to CFS
  2007-10-01 22:35                                   ` Arjan van de Ven
@ 2007-10-01 22:44                                     ` David Schwartz
  2007-10-01 22:55                                       ` Arjan van de Ven
  0 siblings, 1 reply; 71+ messages in thread
From: David Schwartz @ 2007-10-01 22:44 UTC (permalink / raw)
  To: Ingo Molnar; +Cc: linux-kernel


> yielding IS blocking. Just with indeterminate fuzziness added to it....

Yielding is sort of blocking, but the difference is that yielding will not
idle the CPU while blocking might. Yielding is sometimes preferable to
blocking in a case where the thread knows it can make forward progress even
if it doesn't get the resource. (As in the examples I explained.)

DS



^ permalink raw reply	[flat|nested] 71+ messages in thread

* Re: Network slowdown due to CFS
  2007-10-01 22:44                                     ` David Schwartz
@ 2007-10-01 22:55                                       ` Arjan van de Ven
  2007-10-02 15:37                                         ` David Schwartz
  0 siblings, 1 reply; 71+ messages in thread
From: Arjan van de Ven @ 2007-10-01 22:55 UTC (permalink / raw)
  To: davids; +Cc: Ingo Molnar, linux-kernel

On Mon, 1 Oct 2007 15:44:09 -0700
"David Schwartz" <davids@webmaster.com> wrote:

> 
> > yielding IS blocking. Just with indeterminate fuzziness added to
> > it....
> 
> Yielding is sort of blocking, but the difference is that yielding
> will not idle the CPU while blocking might. 

not really; SOMEONE will make progress, the one holding the lock.
Granted, he can be on some other cpu, but at that point all yielding
gets you is a bunch of cache bounces.

>Yielding is sometimes
> preferable to blocking in a case where the thread knows it can make
> forward progress even if it doesn't get the resource. (As in the
> examples I explained.)

that's also what trylock is for... as well as spinaphores...
(you can argue that futexes should be more intelligent and do
spinaphore stuff etc... and I can buy that, let's improve them in the
kernel by any means. But userspace yield() isn't the answer. A
yield_to() would have been a ton better (which would return immediately
if the thing you want to yield to is running already somewhere), a
blind "yield" isn't, since it doesn't say what you want to yield to.)

Note: The answer to "what to yield to" isn't "everything that might
want to run"; we tried that way back when the 2.6.early scheduler was
designed and that turns out to not be what people calling yield
expected.. (it made their things even slower than they thought). So
they want "yield to" semantics, without telling the kernel what they
want to yield to, and complain if the kernel second-guesses wrongly....


not a good api.

^ permalink raw reply	[flat|nested] 71+ messages in thread

* Re: Network slowdown due to CFS
  2007-10-01 18:23                                 ` David Schwartz
@ 2007-10-02  6:06                                   ` Ingo Molnar
  2007-10-02  6:47                                     ` Andi Kleen
  2007-10-03  8:02                                     ` Jarek Poplawski
  2007-10-02  6:08                                   ` Ingo Molnar
                                                     ` (2 subsequent siblings)
  3 siblings, 2 replies; 71+ messages in thread
From: Ingo Molnar @ 2007-10-02  6:06 UTC (permalink / raw)
  To: David Schwartz; +Cc: linux-kernel


* David Schwartz <davids@webmaster.com> wrote:

> > These are generic statements, but i'm _really_ interested in the 
> > specifics. Real, specific code that i can look at. The typical Linux 
> > distro consists of in excess of 500 million lines of code, in 
> > tens of thousands of apps, so there really must be some good, valid 
> > and "right" use of sched_yield() somewhere in there, in some 
> > mainstream app, right? (because, as you might have guessed it, in 
> > the past decade of sched_yield() existence i _have_ seen my share of 
> > sched_yield() utilizing user-space code, and at the moment i'm not 
> > really impressed by those examples.)
> 
> Maybe, maybe not. Even if so, it would be very difficult to find. 
> [...]

google.com/codesearch is your friend. Really, 

> Note that I'm not saying this is a particularly big deal. And I'm not 
> calling CFS' behavior a regression, since it's not really better or 
> worse than the old behavior, simply different.

yes, and that's the core point.

> I'm not familiar enough with CFS' internals to help much on the 
> implementation, but there may be some simple compromise yield that 
> might work well enough. How about simply acting as if the task used up 
> its timeslice and scheduling the next one? (Possibly with a slight 
> reduction in penalty or reward for not really using all the time, if 
> possible?)

firstly, there's no notion of "timeslices" in CFS. (in CFS tasks "earn" 
a right to the CPU, and that "right" is not sliced in the traditional 
sense) But we tried a conceptually similar thing: to schedule not to the 
end of the tree but into the next position. That too was bad for _some_ 
apps. CFS literally cycled through 5-6 different yield implementations 
in its 22 versions so far. The current flag solution was achieved in 
such an iterative fashion and gives an acceptable solution to all app 
categories that came up so far. [ and this is driven by compatibility 
goals - regardless of how broken we consider yield use. The ideal 
solution is of course to almost never use yield. Fortunately 99%+ of 
Linux apps follow that ideal solution ;-) ]

	Ingo

^ permalink raw reply	[flat|nested] 71+ messages in thread

* Re: Network slowdown due to CFS
  2007-10-01 18:23                                 ` David Schwartz
  2007-10-02  6:06                                   ` Ingo Molnar
@ 2007-10-02  6:08                                   ` Ingo Molnar
  2007-10-02  6:26                                   ` Ingo Molnar
  2007-10-02  6:46                                   ` yield API Ingo Molnar
  3 siblings, 0 replies; 71+ messages in thread
From: Ingo Molnar @ 2007-10-02  6:08 UTC (permalink / raw)
  To: David Schwartz; +Cc: linux-kernel


* David Schwartz <davids@webmaster.com> wrote:

> > (user-space spinlocks are broken beyond words for anything but 
> > perhaps SCHED_FIFO tasks.)
> 
> User-space spinlocks are broken so spinlocks can only be implemented 
> in kernel-space? Even if you use the kernel to schedule/unschedule the 
> tasks, you still have to spin in user-space.

user-space spinlocks (in anything but SCHED_FIFO tasks) are pretty 
broken because they waste CPU time. (not as broken as yield, because 
"wasting CPU time" is a more deterministic act, but still broken) Could 
you cite a single example where user-space spinlocks are technically the 
best solution?

	Ingo

^ permalink raw reply	[flat|nested] 71+ messages in thread

* Re: Network slowdown due to CFS
  2007-10-01 18:23                                 ` David Schwartz
  2007-10-02  6:06                                   ` Ingo Molnar
  2007-10-02  6:08                                   ` Ingo Molnar
@ 2007-10-02  6:26                                   ` Ingo Molnar
  2007-10-02  6:46                                   ` yield API Ingo Molnar
  3 siblings, 0 replies; 71+ messages in thread
From: Ingo Molnar @ 2007-10-02  6:26 UTC (permalink / raw)
  To: David Schwartz; +Cc: linux-kernel


* David Schwartz <davids@webmaster.com> wrote:

> > at a quick glance this seems broken too - but if you show the 
> > specific code i might be able to point out the breakage in detail. 
> > (One underlying problem here appears to be fairness: a quick 
> > unlock/lock sequence may starve out other threads. yield won't solve 
> > that fundamental problem either, and it will introduce random 
> > latencies into apps using this memory allocator.)
> 
> You are assuming that random latencies are necessarily bad. Random 
> latencies may be significantly better than predictable high latency.

i'm not really assuming anything, i gave a vague first impression of the 
vague example you gave (assuming that the yield was done to combat 
fairness problems). This is a case where the human language shows its 
boundaries: statements that are hard to refute with certainty because 
they are too vague. So i'd really suggest you show me some sample/real 
code - that would move this discussion to a much more productive level.

but i'll attempt to weave the chain of argument one step forward (in the 
hope of not distorting your point in any way): _if_ the sched_yield() 
call in that memory allocator is done because it uses a locking 
primitive that is unfair (hence the memory pool lock can be starved), 
then the "guaranteed large latency" is caused by "guaranteed 
unfairness". The solution is not to insert a random latency (via a 
sched_yield() call) that also has a side-effect of fairness to other 
tasks, because this random latency introduces guaranteed unfairness for 
this particular task. The correct solution IMO is to make the locking 
primitive more fair _without_ random delays, and there are a number of 
good techniques for that. (they mostly center around the use of futexes)

one thing that is often missed is that most of the cost of a yield() is 
in the system call and the context-switch - quite similar to the futex 
slowpath. So there's _no_ reason not to use futexes on Linux. (yes, 
there might be historic/compatibility or ease-of-porting arguments but 
those do not really impact the fundamental argument of whether something 
is technically right or not.)
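
(for illustration only, a deliberately bare-bones sketch of such a futex
slowpath - raw syscall, no error handling, and not the glibc
implementation - just to show that the waiter sleeps instead of spinning
or yielding:)

#include <linux/futex.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <stdatomic.h>

static atomic_int lock_word;	/* 0 = free, 1 = locked, 2 = locked + waiters */

static long futex(atomic_int *uaddr, int op, int val)
{
	return syscall(SYS_futex, uaddr, op, val, NULL, NULL, 0);
}

static void lock(void)
{
	int free = 0;

	/* fast path: uncontended acquire, no system call at all */
	if (atomic_compare_exchange_strong(&lock_word, &free, 1))
		return;
	/* slow path: mark the lock contended and sleep until woken */
	while (atomic_exchange(&lock_word, 2) != 0)
		futex(&lock_word, FUTEX_WAIT, 2);
}

static void unlock(void)
{
	/* if somebody may be sleeping, wake exactly one waiter */
	if (atomic_exchange(&lock_word, 0) == 2)
		futex(&lock_word, FUTEX_WAKE, 1);
}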

	Ingo

^ permalink raw reply	[flat|nested] 71+ messages in thread

* Re: yield API
  2007-10-01 18:23                                 ` David Schwartz
                                                     ` (2 preceding siblings ...)
  2007-10-02  6:26                                   ` Ingo Molnar
@ 2007-10-02  6:46                                   ` Ingo Molnar
  2007-10-02 11:50                                     ` linux-os (Dick Johnson)
                                                       ` (2 more replies)
  3 siblings, 3 replies; 71+ messages in thread
From: Ingo Molnar @ 2007-10-02  6:46 UTC (permalink / raw)
  To: David Schwartz; +Cc: linux-kernel


* David Schwartz <davids@webmaster.com> wrote:

> > These are generic statements, but i'm _really_ interested in the 
> > specifics. Real, specific code that i can look at. The typical Linux 
> > distro consists of in excess of 500 million lines of code, in 
> > tens of thousands of apps, so there really must be some good, valid 
> > and "right" use of sched_yield() somewhere in there, in some 
> > mainstream app, right? (because, as you might have guessed it, in 
> > the past decade of sched_yield() existence i _have_ seen my share of 
> > sched_yield() utilizing user-space code, and at the moment i'm not 
> > really impressed by those examples.)
> 
> Maybe, maybe not. Even if so, it would be very difficult to find. 
> Simply grepping for sched_yield is not going to help because 
> determining whether a given use of sched_yield is smart is not going 
> to be easy.

sched_yield() has been around for a decade (about three times longer 
than futexes were around), so if it's useful, it sure should have grown 
some 'crown jewel' app that uses it and shows off its advantages, 
compared to other locking approaches, right?

For example, if you asked me whether pipes are the best thing for 
certain apps, i could immediately show you tons of examples where they 
are. Same for sockets. Or RT priorities. Or nice levels. Or futexes. Or 
just about any other core kernel concept or API. Your notion that 
showing a good example of an API would be "difficult" because it's hard 
to determine "smart" use is not tenable i believe and does not 
adequately refute my pretty plain-meaning "it does not exist" assertion.

This then is one more supporting proof for the fundamental weakness 
of the sched_yield() API. Rarely are we able to so universally condemn 
an API: real-life is usually more varied and even for theoretically 
poorly defined APIs _some_ sort of legitimate use does grow up.

APIs that are not in any real, meaningful use, despite a decade of 
presence are not really interesting to me personally. (especially in 
this case where we know exactly _why_ the API is used so rarely.) Sure 
we'll continue to support it in the best possible way, with the usual 
kernel maintenance policy: without hurting other, more commonly used 
APIs. That was the principle we followed in previous schedulers too. And 
if anyone has a patch to make sched_yield() better than it is today, i'm 
of course interested in it.

	Ingo

^ permalink raw reply	[flat|nested] 71+ messages in thread

* Re: Network slowdown due to CFS
  2007-10-02  6:06                                   ` Ingo Molnar
@ 2007-10-02  6:47                                     ` Andi Kleen
  2007-10-03  8:02                                     ` Jarek Poplawski
  1 sibling, 0 replies; 71+ messages in thread
From: Andi Kleen @ 2007-10-02  6:47 UTC (permalink / raw)
  To: Ingo Molnar; +Cc: David Schwartz, linux-kernel

Ingo Molnar <mingo@elte.hu> writes:

> * David Schwartz <davids@webmaster.com> wrote:
> 
> > > These are generic statements, but i'm _really_ interested in the 
> > > specifics. Real, specific code that i can look at. The typical Linux 
> > > distro consists of in excess of 500 million lines of code, in 
> > > tens of thousands of apps, so there really must be some good, valid 
> > > and "right" use of sched_yield() somewhere in there, in some 
> > > mainstream app, right? (because, as you might have guessed it, in 
> > > the past decade of sched_yield() existence i _have_ seen my share of 
> > > sched_yield() utilizing user-space code, and at the moment i'm not 
> > > really impressed by those examples.)
> > 
> > Maybe, maybe not. Even if so, it would be very difficult to find. 
> > [...]
> 
> google.com/codesearch is your friend. Really, 

http://www.koders.com/ (which has been around for a long time)
actually seems to have more code.

It's also a pity that so much free code is behind passwords
and protected from spiders.

-Andi


^ permalink raw reply	[flat|nested] 71+ messages in thread

* Re: Network slowdown due to CFS
  2007-10-01 16:25                           ` Ingo Molnar
  2007-10-01 16:49                             ` David Schwartz
  2007-10-01 16:55                             ` Chris Friesen
@ 2007-10-02  9:03                             ` Jarek Poplawski
  2007-10-02 13:39                               ` Jarek Poplawski
  2 siblings, 1 reply; 71+ messages in thread
From: Jarek Poplawski @ 2007-10-02  9:03 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Nick Piggin, David Schwartz, linux-kernel, Mike Galbraith,
	Peter Zijlstra, Martin Michlmayr, Srivatsa Vaddagiri,
	Stephen Hemminger

On Mon, Oct 01, 2007 at 06:25:07PM +0200, Ingo Molnar wrote:
> 
> * Jarek Poplawski <jarkao2@o2.pl> wrote:
> 
> > BTW, it looks risky to criticise sched_yield too much: some 
> > people can misinterpret such discussions and stop using this at all, 
> > even where it's right.
> 
> Really, i have never seen a _single_ mainstream app where the use of 
> sched_yield() was the right choice.
> 
> Fortunately, the sched_yield() API is already one of the most rarely 
> used scheduler functionalities, so it does not really matter. [ In my 
> experience a Linux scheduler is stabilizing pretty well when the 
> discussion shifts to yield behavior, because that shows that everything 
> else is pretty much fine ;-) ]
> 
> But, because you assert that it's risky to "criticise sched_yield() 
> too much", you sure must know at least one real example where it's right 
> to use it (and cite the line and code where it's used, with 
> specificity)?

Very clever move! And I see some people have caught this...

Since sched_yield() is a very general-purpose tool, it can easily be
replaced by others, of course, just like probably half of all
system calls. And such things are often done in code during
optimization. But, IMHO, the main value of sched_yield() is that it's
an easy-to-use and very readable way to mark some place - sometimes
even to test whether such an idea is reasonable at all... Otherwise,
many such possibilities could easily stay forgotten forever.

But you are right, the value of this call shouldn't be exaggerated,
and my proposal was overkill. Anyway, it seems something better than
the current ways of doing this could be imagined. They look like two
extremes, and something in between, not too complicated, should
suffice. Currently, I wonder whether simply charging such a task
(with its key recalculated) for all the time it could have used isn't
one such method. It seems functionally analogous to going to the end
of the queue of tasks with the same priority under the old scheduler.

Regards,
Jarek P.

^ permalink raw reply	[flat|nested] 71+ messages in thread

* Re: Network slowdown due to CFS
  2007-10-01  8:43                         ` Jarek Poplawski
  2007-10-01 16:25                           ` Ingo Molnar
@ 2007-10-02  9:26                           ` Jarek Poplawski
  1 sibling, 0 replies; 71+ messages in thread
From: Jarek Poplawski @ 2007-10-02  9:26 UTC (permalink / raw)
  To: Nick Piggin
  Cc: Ingo Molnar, David Schwartz, linux-kernel, Mike Galbraith,
	Peter Zijlstra, Martin Michlmayr, Srivatsa Vaddagiri,
	Stephen Hemminger

On Mon, Oct 01, 2007 at 10:43:56AM +0200, Jarek Poplawski wrote:
...
> etc., if we know (after testing) eg. average expedition time of such

No new theory - it's only my reverse Polish translation. Should be:
"etc., if we know (after testing) eg. average dispatch time of such".

Sorry,
Jarek P.

^ permalink raw reply	[flat|nested] 71+ messages in thread

* Re: yield API
  2007-10-02  6:46                                   ` yield API Ingo Molnar
@ 2007-10-02 11:50                                     ` linux-os (Dick Johnson)
  2007-10-02 15:24                                       ` Douglas McNaught
  2007-10-02 21:57                                     ` Eric St-Laurent
  2007-12-12 22:39                                     ` Jesper Juhl
  2 siblings, 1 reply; 71+ messages in thread
From: linux-os (Dick Johnson) @ 2007-10-02 11:50 UTC (permalink / raw)
  To: Ingo Molnar; +Cc: David Schwartz, linux-kernel


On Tue, 2 Oct 2007, Ingo Molnar wrote:

>
> * David Schwartz <davids@webmaster.com> wrote:
>
>>> These are generic statements, but i'm _really_ interested in the
>>> specifics. Real, specific code that i can look at. The typical Linux
>>> distro consists of in excess of 500 million lines of code, in
>>> tens of thousands of apps, so there really must be some good, valid
>>> and "right" use of sched_yield() somewhere in there, in some
>>> mainstream app, right? (because, as you might have guessed it, in
>>> the past decade of sched_yield() existence i _have_ seen my share of
>>> sched_yield() utilizing user-space code, and at the moment i'm not
>>> really impressed by those examples.)
>>
>> Maybe, maybe not. Even if so, it would be very difficult to find.
>> Simply grepping for sched_yield is not going to help because
>> determining whether a given use of sched_yield is smart is not going
>> to be easy.
>
> sched_yield() has been around for a decade (about three times longer
> than futexes were around), so if it's useful, it sure should have grown
> some 'crown jewel' app that uses it and shows off its advantages,
> compared to other locking approaches, right?
>
> For example, if you asked me whether pipes are the best thing for
> certain apps, i could immediately show you tons of examples where they
> are. Same for sockets. Or RT priorities. Or nice levels. Or futexes. Or
> just about any other core kernel concept or API. Your notion that
> showing a good example of an API would be "difficult" because it's hard
> to determine "smart" use is not tenable i believe and does not
> adequately refute my pretty plain-meaning "it does not exist" assertion.
>
> This then is one more supporting proof for the fundamental weakness
> of the sched_yield() API. Rarely are we able to so universally condemn
> an API: real-life is usually more varied and even for theoretically
> poorly defined APIs _some_ sort of legitimate use does grow up.
>
> APIs that are not in any real, meaningful use, despite a decade of
> presence are not really interesting to me personally. (especially in
> this case where we know exactly _why_ the API is used so rarely.) Sure
> we'll continue to support it in the best possible way, with the usual
> kernel maintenance policy: without hurting other, more commonly used
> APIs. That was the principle we followed in previous schedulers too. And
> if anyone has a patch to make sched_yield() better than it is today, i'm
> of course interested in it.
>
> 	Ingo

But sched_yield() on Linux never did what the majority of
programmers assumed it would do (give up the CPU to some
runnable processes for the rest of the time-slice). Instead,
it just appeared to spin in the kernel. Therefore, those
who needed a sched_yield(), just used usleep().

Whether or not there is a POSIX definition of sched_yield(),
there is a need for something that will give up the CPU
and not busy-wait. There are many control applications
where state-machines are kept in user-mode code. The code
waits for an event. It shouldn't be spinning, wasting
CPU time, when the kernel can be doing file and network
I/O with the wasted CPU cycles.

So, just because sched_yield() doesn't work as expected,
is not the reason to get rid of it.

Cheers,
Dick Johnson
Penguin : Linux version 2.6.16.24 on an i686 machine (5592.59 BogoMips).
My book : http://www.AbominableFirebug.com/
_



^ permalink raw reply	[flat|nested] 71+ messages in thread

* Re: Network slowdown due to CFS
  2007-10-02  9:03                             ` Network slowdown due to CFS Jarek Poplawski
@ 2007-10-02 13:39                               ` Jarek Poplawski
  0 siblings, 0 replies; 71+ messages in thread
From: Jarek Poplawski @ 2007-10-02 13:39 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Nick Piggin, David Schwartz, linux-kernel, Mike Galbraith,
	Peter Zijlstra, Martin Michlmayr, Srivatsa Vaddagiri,
	Stephen Hemminger

On Tue, Oct 02, 2007 at 11:03:46AM +0200, Jarek Poplawski wrote:
...
> should suffice. Currently, I wonder if simply charging (with a key
> recalculated) such a task for all the time it could've used isn't one
> of such methods. It seems, it's functionally analogous with going to
> the end of que of tasks with the same priority according to the old
> sched.

Only now do I see that I'm repeating the idea of David Schwartz (and
probably not only his) from a nearby thread, sorry. But I'm still
trying to find out what was wrong with it?

Jarek P.

^ permalink raw reply	[flat|nested] 71+ messages in thread

* Re: yield API
  2007-10-02 11:50                                     ` linux-os (Dick Johnson)
@ 2007-10-02 15:24                                       ` Douglas McNaught
  0 siblings, 0 replies; 71+ messages in thread
From: Douglas McNaught @ 2007-10-02 15:24 UTC (permalink / raw)
  To: linux-os (Dick Johnson); +Cc: Ingo Molnar, David Schwartz, linux-kernel

"linux-os \(Dick Johnson\)" <linux-os@analogic.com> writes:

> Whether or not there is a POSIX definition of sched_yield(),
> there is a need for something that will give up the CPU
> and not busy-wait. There are many control applications
> where state-machines are kept in user-mode code. The code
> waits for an event. It shouldn't be spinning, wasting
> CPU time, when the kernel can be doing file and network
> I/O with the wasted CPU cycles.

These "control applications" would be real-time processes, for which
(AIUI) sched_yield() behavior is completely well-defined and
implemented as such by Linux.  The question here is how useful the
call is for SCHED_OTHER (non-real-time) processes, for which it has no
well-defined semantics.

-Doug

^ permalink raw reply	[flat|nested] 71+ messages in thread

* RE: Network slowdown due to CFS
  2007-10-01 22:55                                       ` Arjan van de Ven
@ 2007-10-02 15:37                                         ` David Schwartz
  2007-10-03  7:15                                           ` Jarek Poplawski
  0 siblings, 1 reply; 71+ messages in thread
From: David Schwartz @ 2007-10-02 15:37 UTC (permalink / raw)
  To: Arjan van de Ven, Ingo Molnar; +Cc: linux-kernel


This is a combined response to Arjan's:

> that's also what trylock is for... as well as spinaphores...
> (you can argue that futexes should be more intelligent and do
> spinaphore stuff etc... and I can buy that, let's improve them in the
> kernel by any means. But userspace yield() isn't the answer. A
> yield_to() would have been a ton better (which would return immediately
> if the thing you want to yield to is running already somewhere), a
> blind "yield" isn't, since it doesn't say what you want to yield to.)

And Ingo's:

> but i'll attempt to weave the chain of argument one step forward (in the
> hope of not distorting your point in any way): _if_ the sched_yield()
> call in that memory allocator is done because it uses a locking
> primitive that is unfair (hence the memory pool lock can be starved),
> then the "guaranteed large latency" is caused by "guaranteed
> unfairness". The solution is not to insert a random latency (via a
> sched_yield() call) that also has a side-effect of fairness to other
> tasks, because this random latency introduces guaranteed unfairness for
> this particular task. The correct solution IMO is to make the locking
> primitive more fair _without_ random delays, and there are a number of
> good techniques for that. (they mostly center around the use of futexes)

So now I not only have to come up with an example where sched_yield is the
best practical choice, I have to come up with one where sched_yield is the
best conceivable choice? Didn't we start out by agreeing these are very rare
cases? Why are we designing new APIs for them (Arjan) and why do we care
about their performance (Ingo)?

These are *rare* cases. It is a waste of time to optimize them.

In this case, nobody cares about fairness to the service thread. It is a
cleanup task that probably runs every few minutes. It could be delayed for
minutes and nobody would care. What they do care about is the impact of the
service thread on the threads doing real work.

You two challenged me to present any legitimate use case for sched_yield. I
see now that was not a legitimate challenge and you two were determined to
shoot down any response no matter how reasonable on the grounds that there
is some way to do it better, no matter how complex, impractical, or
unjustified by the real-world problem.

I think if a pthread_mutex had a 'yield to others blocking on this mutex'
kind of a 'go to the back of the line' option, that would cover the majority
of cases where sched_yield is your best choice currently. Unfortunately,
POSIX gave us yield.
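
Lacking such an option, the closest approximation is the unlock/yield/relock
sequence from the defragmenter example - roughly, with a hypothetical pool
mutex:

#include <pthread.h>
#include <sched.h>

/* hypothetical mutex guarding the allocator's pools */
extern pthread_mutex_t pool_mutex;

/* "go to the back of the line": let any thread blocked on the pools run
 * before the defragmenter grabs them again */
static void let_waiters_in(void)
{
	pthread_mutex_unlock(&pool_mutex);
	sched_yield();	/* best-effort hint; POSIX offers nothing stronger */
	pthread_mutex_lock(&pool_mutex);
}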

Note that I think we all agree that any program whose performance relies on
quirks of sched_yield (such as the examples that have been cited as CFS
'regressions') are broken horribly. None of the cases I am suggesting use
sched_yield as anything more than a minor optimization.

DS



^ permalink raw reply	[flat|nested] 71+ messages in thread

* Re: yield API
  2007-10-02  6:46                                   ` yield API Ingo Molnar
  2007-10-02 11:50                                     ` linux-os (Dick Johnson)
@ 2007-10-02 21:57                                     ` Eric St-Laurent
  2007-12-12 22:39                                     ` Jesper Juhl
  2 siblings, 0 replies; 71+ messages in thread
From: Eric St-Laurent @ 2007-10-02 21:57 UTC (permalink / raw)
  To: Ingo Molnar; +Cc: David Schwartz, linux-kernel


On Tue, 2007-10-02 at 08:46 +0200, Ingo Molnar wrote:

[...]

> APIs that are not in any real, meaningful use, despite a decade of 
> presence are not really interesting to me personally. (especially in 
> this case where we know exactly _why_ the API is used so rarely.) Sure 
> we'll continue to support it in the best possible way, with the usual 
> kernel maintenance policy: without hurting other, more commonly used 
> APIs. That was the principle we followed in previous schedulers too. And 
> if anyone has a patch to make sched_yield() better than it is today, i'm 
> of course interested in it.

Do you still have intentions to add a directed yield API?  I remember
seeing it in the earlier CFS patches.


- Eric



^ permalink raw reply	[flat|nested] 71+ messages in thread

* Re: Network slowdown due to CFS
  2007-10-02 15:37                                         ` David Schwartz
@ 2007-10-03  7:15                                           ` Jarek Poplawski
  0 siblings, 0 replies; 71+ messages in thread
From: Jarek Poplawski @ 2007-10-03  7:15 UTC (permalink / raw)
  To: davids; +Cc: Arjan van de Ven, Ingo Molnar, linux-kernel

On 02-10-2007 17:37, David Schwartz wrote:
...
> So now I not only have to come up with an example where sched_yield is the
> best practical choice, I have to come up with one where sched_yield is the
> best conceivable choice? Didn't we start out by agreeing these are very rare
> cases? Why are we designing new APIs for them (Arjan) and why do we care
> about their performance (Ingo)?
> 
> These are *rare* cases. It is a waste of time to optimize them.

Probably we'll start to care after the first comparison tests are done
by our rivals. It should be a piece of cake for them to find the "right"
code...

Regards,
Jarek P.

^ permalink raw reply	[flat|nested] 71+ messages in thread

* Re: Network slowdown due to CFS
  2007-10-02  6:06                                   ` Ingo Molnar
  2007-10-02  6:47                                     ` Andi Kleen
@ 2007-10-03  8:02                                     ` Jarek Poplawski
  2007-10-03  8:16                                       ` Ingo Molnar
  1 sibling, 1 reply; 71+ messages in thread
From: Jarek Poplawski @ 2007-10-03  8:02 UTC (permalink / raw)
  To: Ingo Molnar; +Cc: David Schwartz, linux-kernel

On 02-10-2007 08:06, Ingo Molnar wrote:
> * David Schwartz <davids@webmaster.com> wrote:
...
>> I'm not familiar enough with CFS' internals to help much on the 
>> implementation, but there may be some simple compromise yield that 
>> might work well enough. How about simply acting as if the task used up 
>> its timeslice and scheduling the next one? (Possibly with a slight 
>> reduction in penalty or reward for not really using all the time, if 
>> possible?)
> 
> firstly, there's no notion of "timeslices" in CFS. (in CFS tasks "earn" 
> a right to the CPU, and that "right" is not sliced in the traditional 
> sense) But we tried a conceptually similar thing [...]

From kernel/sched_fair.c:

"/*
 * Targeted preemption latency for CPU-bound tasks:
 * (default: 20ms, units: nanoseconds)
 *
 * NOTE: this latency value is not the same as the concept of
 * 'timeslice length' - timeslices in CFS are of variable length.
 * (to see the precise effective timeslice length of your workload,
 *  run vmstat and monitor the context-switches field)
..."

So, no notion of something, which are(!) of variable length, and which
precise effective timeslice length can be seen in nanoseconds? (But
not timeslice!)

Well, I'm starting to think this new scheduler could still be too simple...


> [...] [ and this is driven by compatibility 
> goals - regardless of how broken we consider yield use. The ideal 
> solution is of course to almost never use yield. Fortunately 99%+ of 
> Linux apps follow that ideal solution ;-) ]

Nevertheless, it seems, this 1% is important enough to boast a little:

  "( another detail: due to nanosec accounting and timeline sorting,
     sched_yield() support is very simple under CFS, and in fact under
     CFS sched_yield() behaves much better than under any other
     scheduler i have tested so far. )"
				[Documentation/sched-design-CFS.txt]

Cheers,
Jarek P.

^ permalink raw reply	[flat|nested] 71+ messages in thread

* Re: Network slowdown due to CFS
  2007-10-03  8:02                                     ` Jarek Poplawski
@ 2007-10-03  8:16                                       ` Ingo Molnar
  2007-10-03  8:56                                         ` Jarek Poplawski
  0 siblings, 1 reply; 71+ messages in thread
From: Ingo Molnar @ 2007-10-03  8:16 UTC (permalink / raw)
  To: Jarek Poplawski; +Cc: David Schwartz, linux-kernel


* Jarek Poplawski <jarkao2@o2.pl> wrote:

> > firstly, there's no notion of "timeslices" in CFS. (in CFS tasks 
> > "earn" a right to the CPU, and that "right" is not sliced in the 
> > traditional sense) But we tried a conceptually similar thing [...]
> 
> From kernel/sched_fair.c:
> 
> "/*
>  * Targeted preemption latency for CPU-bound tasks:
>  * (default: 20ms, units: nanoseconds)
>  *
>  * NOTE: this latency value is not the same as the concept of
>  * 'timeslice length' - timeslices in CFS are of variable length.
>  * (to see the precise effective timeslice length of your workload,
>  *  run vmstat and monitor the context-switches field)
> ..."
> 
> So, no notion of something, which are(!) of variable length, and which 
> precise effective timeslice length can be seen in nanoseconds? (But 
> not timeslice!)

You should really read and understand the code you are arguing about :-/

In the 2.6.22 scheduler, there was a p->time_slice per task variable 
that could be manipulated. (Note that in 2.6.22, sched_yield() did not 
manipulate p->time_slice.)

sysctl_sched_latency on the other hand is not something that is per task 
(it is global) so there is no pending timeslice to be "cleared" as it 
has been suggested naively.

	Ingo

^ permalink raw reply	[flat|nested] 71+ messages in thread

* Re: Network slowdown due to CFS
  2007-10-03  8:16                                       ` Ingo Molnar
@ 2007-10-03  8:56                                         ` Jarek Poplawski
  2007-10-03  9:10                                           ` Ingo Molnar
  0 siblings, 1 reply; 71+ messages in thread
From: Jarek Poplawski @ 2007-10-03  8:56 UTC (permalink / raw)
  To: Ingo Molnar; +Cc: David Schwartz, linux-kernel

On Wed, Oct 03, 2007 at 10:16:13AM +0200, Ingo Molnar wrote:
> 
> * Jarek Poplawski <jarkao2@o2.pl> wrote:
> 
> > > firstly, there's no notion of "timeslices" in CFS. (in CFS tasks 
> > > "earn" a right to the CPU, and that "right" is not sliced in the 
> > > traditional sense) But we tried a conceptually similar thing [...]
> > 
> > From kernel/sched_fair.c:
> > 
> > "/*
> >  * Targeted preemption latency for CPU-bound tasks:
> >  * (default: 20ms, units: nanoseconds)
> >  *
> >  * NOTE: this latency value is not the same as the concept of
> >  * 'timeslice length' - timeslices in CFS are of variable length.
> >  * (to see the precise effective timeslice length of your workload,
> >  *  run vmstat and monitor the context-switches field)
> > ..."
> > 
> > So, no notion of something, which are(!) of variable length, and which 
> > precise effective timeslice length can be seen in nanoseconds? (But 
> > not timeslice!)
> 
> You should really read and understand the code you are arguing about :-/

Maybe you could help me with better comments? IMHO, it would be enough
to warn that timeslices now have a different meaning, or to stop using
this term at all. (Btw, in -rc8-mm2 I see a new sched_slice() function
which seems to return... time.)

> 
> In the 2.6.22 scheduler, there was a p->time_slice per task variable 
> that could be manipulated. (Note that in 2.6.22, sched_yield() did not 
> manipulate p->time_slice.)
> 
> sysctl_sched_latency on the other hand is not something that is per task 
> (it is global) so there is no pending timeslice to be "cleared" as it 
> has been suggested naively.

But there is this "something", very similar and very misleading, that
you count e.g. in check_preempt_curr_fair to find out whether the time
is over, and I think this could be similar enough to what David Schwartz
wanted to use in his idea - yet you didn't care to explain why it's so
different?

Jarek P.

^ permalink raw reply	[flat|nested] 71+ messages in thread

* Re: Network slowdown due to CFS
  2007-10-03  8:56                                         ` Jarek Poplawski
@ 2007-10-03  9:10                                           ` Ingo Molnar
  2007-10-03  9:50                                             ` Jarek Poplawski
  2007-10-04  5:33                                             ` Casey Dahlin
  0 siblings, 2 replies; 71+ messages in thread
From: Ingo Molnar @ 2007-10-03  9:10 UTC (permalink / raw)
  To: Jarek Poplawski; +Cc: David Schwartz, linux-kernel


* Jarek Poplawski <jarkao2@o2.pl> wrote:

> On Wed, Oct 03, 2007 at 10:16:13AM +0200, Ingo Molnar wrote:
> > 
> > * Jarek Poplawski <jarkao2@o2.pl> wrote:
> > 
> > > > firstly, there's no notion of "timeslices" in CFS. (in CFS tasks 
> > > > "earn" a right to the CPU, and that "right" is not sliced in the 
> > > > traditional sense) But we tried a conceptually similar thing [...]
> > > 
> > > From kernel/sched_fair.c:
> > > 
> > > "/*
> > >  * Targeted preemption latency for CPU-bound tasks:
> > >  * (default: 20ms, units: nanoseconds)
> > >  *
> > >  * NOTE: this latency value is not the same as the concept of
> > >  * 'timeslice length' - timeslices in CFS are of variable length.
> > >  * (to see the precise effective timeslice length of your workload,
> > >  *  run vmstat and monitor the context-switches field)
> > > ..."
> > > 
> > > So, no notion of something, which are(!) of variable length, and which 
> > > precise effective timeslice length can be seen in nanoseconds? (But 
> > > not timeslice!)
> > 
> > You should really read and understand the code you are arguing about :-/
> 
> Maybe you could help me with better comments? IMHO, it would be enough 
> to warn that timeslices now have a different meaning, or to stop using 
> this term at all. [...]

i'm curious, what better do you need than the very detailed comment 
quoted above? Which bit of "this latency value is not the same as the 
concept of timeslice length" is difficult to understand? The timeslices 
of tasks (i.e. the time they spend on a CPU without scheduling away) are 
_not_ maintained directly in CFS as a per-task variable that can be 
"cleared", it's not the metric that drives scheduling. Yes, of course 
CFS too "slices up CPU time", but those slices are not the per-task 
variables of traditional schedulers and cannot be 'cleared'.

> [...] (Btw, in -rc8-mm2 I see new sched_slice() function which seems 
> to return... time.)

wrong again. That is a function, not a variable to be cleared. (Anyway, 
the noise/signal ratio is getting increasingly high in this thread with 
no progress in sight, so i cannot guarantee any further replies - 
possibly others will pick up the tab and explain/discuss any other 
questions that might come up. Patches are welcome of course.)

	Ingo

^ permalink raw reply	[flat|nested] 71+ messages in thread

* Re: Network slowdown due to CFS
  2007-10-03  9:10                                           ` Ingo Molnar
@ 2007-10-03  9:50                                             ` Jarek Poplawski
  2007-10-03 10:55                                               ` Dmitry Adamushko
  2007-10-07  7:18                                               ` Network slowdown due to CFS Ingo Molnar
  2007-10-04  5:33                                             ` Casey Dahlin
  1 sibling, 2 replies; 71+ messages in thread
From: Jarek Poplawski @ 2007-10-03  9:50 UTC (permalink / raw)
  To: Ingo Molnar; +Cc: David Schwartz, linux-kernel

On Wed, Oct 03, 2007 at 11:10:58AM +0200, Ingo Molnar wrote:
> 
> * Jarek Poplawski <jarkao2@o2.pl> wrote:
> 
> > On Wed, Oct 03, 2007 at 10:16:13AM +0200, Ingo Molnar wrote:
> > > 
> > > * Jarek Poplawski <jarkao2@o2.pl> wrote:
> > > 
> > > > > firstly, there's no notion of "timeslices" in CFS. (in CFS tasks 
> > > > > "earn" a right to the CPU, and that "right" is not sliced in the 
> > > > > traditional sense) But we tried a conceptually similar thing [...]
> > > > 
> > > > From kernel/sched_fair.c:
> > > > 
> > > > "/*
> > > >  * Targeted preemption latency for CPU-bound tasks:
> > > >  * (default: 20ms, units: nanoseconds)
> > > >  *
> > > >  * NOTE: this latency value is not the same as the concept of
> > > >  * 'timeslice length' - timeslices in CFS are of variable length.
> > > >  * (to see the precise effective timeslice length of your workload,
> > > >  *  run vmstat and monitor the context-switches field)
> > > > ..."
> > > > 
> > > > So, no notion of something, which are(!) of variable length, and which 
> > > > precise effective timeslice length can be seen in nanoseconds? (But 
> > > > not timeslice!)
> > > 
> > > You should really read and understand the code you are arguing about :-/
> > 
> > Maybe you could help me with better comments? IMHO, it would be enough 
> > to warn that timeslices now have a different meaning, or to stop using 
> > this term at all. [...]
> 
> i'm curious, what better do you need than the very detailed comment 
> quoted above? Which bit of "this latency value is not the same as the 
> concept of timeslice length" is difficult to understand? The timeslices 
> of tasks (i.e. the time they spend on a CPU without scheduling away) are 
> _not_ maintained directly in CFS as a per-task variable that can be 
> "cleared", it's not the metric that drives scheduling. Yes, of course 
> CFS too "slices up CPU time", but those slices are not the per-task 
> variables of traditional schedulers and cannot be 'cleared'.

It's not about this comment alone, but about this comment plus the "no
notion" comment, which appears in sched-design-CFS.txt too.

> 
> > [...] (Btw, in -rc8-mm2 I see new sched_slice() function which seems 
> > to return... time.)
> 
> wrong again. That is a function, not a variable to be cleared. (Anyway, 
> the noise/signal ratio is getting increasingly high in this thread with 
> no progress in sight, so i cannot guarantee any further replies - 
> possibly others will pick up the tab and explain/discuss any other 
> questions that might come up. Patches are welcome of course.)

I can't see anything about clearing. I think, this was about charging,
which should change the key enough, to move a task to, maybe, a better
place in a queue (tree) than with current ways.

Jarek P.

PS: Don't you think that a nice argument with some celebrity, like Ingo
Molnar himself, is far more interesting than those dull patches?

^ permalink raw reply	[flat|nested] 71+ messages in thread

* Re: Network slowdown due to CFS
  2007-10-03  9:50                                             ` Jarek Poplawski
@ 2007-10-03 10:55                                               ` Dmitry Adamushko
  2007-10-03 10:58                                                 ` Dmitry Adamushko
                                                                   ` (2 more replies)
  2007-10-07  7:18                                               ` Network slowdown due to CFS Ingo Molnar
  1 sibling, 3 replies; 71+ messages in thread
From: Dmitry Adamushko @ 2007-10-03 10:55 UTC (permalink / raw)
  To: Jarek Poplawski; +Cc: Ingo Molnar, David Schwartz, linux-kernel

[-- Attachment #1: Type: text/plain, Size: 1924 bytes --]

On 03/10/2007, Jarek Poplawski <jarkao2@o2.pl> wrote:
> I can't see anything about clearing. I think, this was about charging,
> which should change the key enough, to move a task to, maybe, a better
> place in a queue (tree) than with current ways.

just a quick patch, not tested and I've not evaluated all possible
implications yet.
But someone might give it a try with his/(her -- are even more
welcomed :-) favourite sched_yield() load.

(and white space damaged)

--- sched_fair-old.c    2007-10-03 12:45:17.010306000 +0200
+++ sched_fair.c        2007-10-03 12:44:46.899851000 +0200
@@ -803,7 +803,35 @@ static void yield_task_fair(struct rq *r
                update_curr(cfs_rq);

                return;
+       } else if (sysctl_sched_compat_yield == 2) {
+               unsigned long ideal_runtime, delta_exec,
+                             delta_exec_weighted;
+
+               __update_rq_clock(rq);
+               /*
+                * Update run-time statistics of the 'current'.
+                */
+               update_curr(cfs_rq);
+
+               /*
+                * Emulate (speed up) the effect of us being preempted
+                * by scheduler_tick().
+                */
+               ideal_runtime = sched_slice(cfs_rq, curr);
+               delta_exec = curr->sum_exec_runtime -
curr->prev_sum_exec_runtime;
+
+               if (ideal_runtime > delta_exec) {
+                       delta_exec_weighted = ideal_runtime - delta_exec;
+
+                       if (unlikely(curr->load.weight != NICE_0_LOAD)) {
+                               delta_exec_weighted =
calc_delta_fair(delta_exec_weighted,
+
 &se->load);
+                       }
+                       se->vruntime += delta_exec_weighted;
+               }
+               return;
        }
+
        /*
         * Find the rightmost entry in the rbtree:
         */


>
> Jarek P.
>

-- 
Best regards,
Dmitry Adamushko

[-- Warning: decoded text below may be mangled, UTF-8 assumed --]
[-- Attachment #2: yield.patch --]
[-- Type: text/x-patch; name="yield.patch", Size: 1012 bytes --]

--- sched_fair-old.c	2007-10-03 12:45:17.010306000 +0200
+++ sched_fair.c	2007-10-03 12:44:46.899851000 +0200
@@ -803,7 +803,35 @@ static void yield_task_fair(struct rq *r
 		update_curr(cfs_rq);
 
 		return;
+	} else if (sysctl_sched_compat_yield == 2) {
+		unsigned long ideal_runtime, delta_exec,
+			      delta_exec_weighted;
+
+		__update_rq_clock(rq);
+		/*
+		 * Update run-time statistics of the 'current'.
+		 */
+		update_curr(cfs_rq);
+
+		/*
+		 * Emulate the effect of us being preempted
+		 * by scheduler_tick().
+		 */
+		ideal_runtime = sched_slice(cfs_rq, curr);
+		delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
+
+		if (ideal_runtime > delta_exec) {
+			delta_exec_weighted = ideal_runtime - delta_exec;
+
+			if (unlikely(curr->load.weight != NICE_0_LOAD)) {
+				delta_exec_weighted = calc_delta_fair(delta_exec_weighted,
+									&se->load);
+			}
+			se->vruntime += delta_exec_weighted;
+		}
+		return;
 	}
+
 	/*
 	 * Find the rightmost entry in the rbtree:
 	 */

^ permalink raw reply	[flat|nested] 71+ messages in thread

* Re: Network slowdown due to CFS
  2007-10-03 10:55                                               ` Dmitry Adamushko
@ 2007-10-03 10:58                                                 ` Dmitry Adamushko
  2007-10-03 11:20                                                   ` Jarek Poplawski
  2007-10-03 11:22                                                 ` Ingo Molnar
  2007-10-03 11:40                                                 ` Jarek Poplawski
  2 siblings, 1 reply; 71+ messages in thread
From: Dmitry Adamushko @ 2007-10-03 10:58 UTC (permalink / raw)
  To: Jarek Poplawski; +Cc: Ingo Molnar, David Schwartz, linux-kernel

On 03/10/2007, Dmitry Adamushko <dmitry.adamushko@gmail.com> wrote:
> On 03/10/2007, Jarek Poplawski <jarkao2@o2.pl> wrote:
> > I can't see anything about clearing. I think, this was about charging,
> > which should change the key enough, to move a task to, maybe, a better
> > place in a queue (tree) than with current ways.
>
> just a quick patch, not tested and I've not evaluated all possible
> implications yet.
> But someone might give it a try with his/(her -- are even more
> welcomed :-) favourite sched_yield() load.
>
> (and white space damaged)
>
> --- sched_fair-old.c    2007-10-03 12:45:17.010306000 +0200
> +++ sched_fair.c        2007-10-03 12:44:46.899851000 +0200
> @@ -803,7 +803,35 @@ static void yield_task_fair(struct rq *r
>                 update_curr(cfs_rq);
>
>                 return;
> +       } else if (sysctl_sched_compat_yield == 2) {
> +               unsigned long ideal_runtime, delta_exec,
> +                             delta_exec_weighted;
> +
> +               __update_rq_clock(rq);
> +               /*
> +                * Update run-time statistics of the 'current'.
> +                */
> +               update_curr(cfs_rq);
> +
> +               /*
> +                * Emulate (speed up) the effect of us being preempted
> +                * by scheduler_tick().
> +                */
> +               ideal_runtime = sched_slice(cfs_rq, curr);
> +               delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
> +
> +               if (ideal_runtime > delta_exec) {
> +                       delta_exec_weighted = ideal_runtime - delta_exec;
> +
> +                       if (unlikely(curr->load.weight != NICE_0_LOAD)) {
> +                               delta_exec_weighted = calc_delta_fair(delta_exec_weighted,
> +                                                                     &se->load);
> +                       }


s/curr/se


-- 
Best regards,
Dmitry Adamushko

^ permalink raw reply	[flat|nested] 71+ messages in thread

* Re: Network slowdown due to CFS
  2007-10-03 10:58                                                 ` Dmitry Adamushko
@ 2007-10-03 11:20                                                   ` Jarek Poplawski
  0 siblings, 0 replies; 71+ messages in thread
From: Jarek Poplawski @ 2007-10-03 11:20 UTC (permalink / raw)
  To: Dmitry Adamushko; +Cc: Ingo Molnar, David Schwartz, linux-kernel

On Wed, Oct 03, 2007 at 12:58:26PM +0200, Dmitry Adamushko wrote:
> On 03/10/2007, Dmitry Adamushko <dmitry.adamushko@gmail.com> wrote:
> > On 03/10/2007, Jarek Poplawski <jarkao2@o2.pl> wrote:
> > > I can't see anything about clearing. I think this was about charging,
> > > which should change the key enough to move a task to, maybe, a better
> > > place in the queue (tree) than with the current approach.
> >
> > just a quick patch, not tested and I've not evaluated all possible
> > implications yet.
> > But someone might give it a try with his/(her -- are even more
> > welcomed :-) favourite sched_yield() load.
> >
> > (and white space damaged)
> >
> > --- sched_fair-old.c    2007-10-03 12:45:17.010306000 +0200
> > +++ sched_fair.c        2007-10-03 12:44:46.899851000 +0200
...
> s/curr/se

Thanks very much!

Alas, I'll be able to look at this and try only in the evening.

Best regards,
Jarek P.

^ permalink raw reply	[flat|nested] 71+ messages in thread

* Re: Network slowdown due to CFS
  2007-10-03 10:55                                               ` Dmitry Adamushko
  2007-10-03 10:58                                                 ` Dmitry Adamushko
@ 2007-10-03 11:22                                                 ` Ingo Molnar
  2007-10-03 11:40                                                 ` Jarek Poplawski
  2 siblings, 0 replies; 71+ messages in thread
From: Ingo Molnar @ 2007-10-03 11:22 UTC (permalink / raw)
  To: Dmitry Adamushko; +Cc: Jarek Poplawski, David Schwartz, linux-kernel


* Dmitry Adamushko <dmitry.adamushko@gmail.com> wrote:

> +                       se->vruntime += delta_exec_weighted;

thanks Dmitry.

Btw., this is quite similar to the yield_granularity patch i did 
originally, just less flexible. It turned out that apps want either zero 
granularity or "infinite" granularity; they dont actually want something 
in between. Those are the two extremes that the current sysctl expresses in 
essence.

	Ingo

^ permalink raw reply	[flat|nested] 71+ messages in thread

* Re: Network slowdown due to CFS
  2007-10-01 16:49                             ` David Schwartz
  2007-10-01 17:31                               ` Ingo Molnar
  2007-10-01 19:53                               ` Network slowdown due to CFS Arjan van de Ven
@ 2007-10-03 11:31                               ` Helge Hafting
  2007-10-04  0:31                               ` Rusty Russell
  3 siblings, 0 replies; 71+ messages in thread
From: Helge Hafting @ 2007-10-03 11:31 UTC (permalink / raw)
  To: davids; +Cc: Ingo Molnar, linux-kernel

David Schwartz wrote:
>> * Jarek Poplawski <jarkao2@o2.pl> wrote:
>>
>>     
>>> BTW, it looks like risky to criticise sched_yield too much: some
>>> people can misinterpret such discussions and stop using this at all,
>>> even where it's right.
>>>       
>
>   
>> Really, i have never seen a _single_ mainstream app where the use of
>> sched_yield() was the right choice.
>>     
>
> It can occasionally be an optimization. You may have a case where you can do
> something very efficiently if a lock is not held, but you cannot afford to
> wait for the lock to be released. So you check the lock, if it's held, you
> yield and then check again. If that fails, you do it the less optimal way
> (for example, dispatching it to a thread that *can* afford to wait).
>   
How about:
Check the lock. If it is held, sleep for an interval that is shorter
than the acceptable waiting time. If it is still held, sleep for twice as long.
Loop until you get the lock and do the work, or until you reach the
limit for how much you can wait at this point and dispatch the work
to a thread instead.

This approach should be portable, doesn't wake up too often,
and doesn't waste the CPU.  (And it won't go idle either; whoever
holds the lock will be running.)
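
A minimal user-space sketch of that idea might look like this (assuming
POSIX threads; do_work_here() and dispatch_to_worker() are just
placeholders for the application's own code, and the 100 us starting
interval is arbitrary):

#include <pthread.h>
#include <unistd.h>

extern void do_work_here(void);         /* placeholder */
extern void dispatch_to_worker(void);   /* placeholder: a thread that can block */

/* Sketch only: trylock, back off with doubling sleeps, then hand off. */
static int try_do_work(pthread_mutex_t *lock, useconds_t max_wait_us)
{
        useconds_t slept = 0, step = 100;               /* start with 100 us */

        for (;;) {
                if (pthread_mutex_trylock(lock) == 0) {
                        do_work_here();
                        pthread_mutex_unlock(lock);
                        return 1;
                }
                if (slept >= max_wait_us)
                        break;
                usleep(step);                           /* sleep instead of yielding */
                slept += step;
                step *= 2;                              /* double the interval each time */
        }
        dispatch_to_worker();
        return 0;
}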


Helge Hafting

^ permalink raw reply	[flat|nested] 71+ messages in thread

* Re: Network slowdown due to CFS
  2007-10-03 10:55                                               ` Dmitry Adamushko
  2007-10-03 10:58                                                 ` Dmitry Adamushko
  2007-10-03 11:22                                                 ` Ingo Molnar
@ 2007-10-03 11:40                                                 ` Jarek Poplawski
  2007-10-03 11:56                                                   ` yield Ingo Molnar
  2 siblings, 1 reply; 71+ messages in thread
From: Jarek Poplawski @ 2007-10-03 11:40 UTC (permalink / raw)
  To: Dmitry Adamushko; +Cc: Ingo Molnar, David Schwartz, linux-kernel

On Wed, Oct 03, 2007 at 12:55:34PM +0200, Dmitry Adamushko wrote:
...
> just a quick patch, not tested and I've not evaluated all possible
> implications yet.
> But someone might give it a try with his/(her -- are even more
> welcomed :-) favourite sched_yield() load.

Of course, after some evaluation by yourself and Ingo, the most
interesting thing would be Martin Michlmayr's testing, so I hope you'll
Cc him too?!

Jarek P.

^ permalink raw reply	[flat|nested] 71+ messages in thread

* Re: yield
  2007-10-03 11:40                                                 ` Jarek Poplawski
@ 2007-10-03 11:56                                                   ` Ingo Molnar
  2007-10-03 12:16                                                     ` yield Jarek Poplawski
  0 siblings, 1 reply; 71+ messages in thread
From: Ingo Molnar @ 2007-10-03 11:56 UTC (permalink / raw)
  To: Jarek Poplawski; +Cc: Dmitry Adamushko, David Schwartz, linux-kernel


* Jarek Poplawski <jarkao2@o2.pl> wrote:

> On Wed, Oct 03, 2007 at 12:55:34PM +0200, Dmitry Adamushko wrote:
> ...
> > just a quick patch, not tested and I've not evaluated all possible
> > implications yet.
> > But someone might give it a try with his/(her -- are even more
> > welcomed :-) favourite sched_yield() load.
> 
> Of course, after some evaluation by yourself and Ingo, the most 
> interesting thing would be Martin Michlmayr's testing, so I hope you'll Cc 
> him too?!

My current take on this: queueing the current task right at the next 
position in the tree (which is what this patch achieves in essence) was 
one of the yield implementations we already tried in CFS, but it didnt 
meet the expectations of some apps. So i can only repeat my argument: 
this is not something that can be "solved" in the way you imagine and 
your arguments just reiterate the path that CFS has already taken in the 
past. So please do not expect _us_ to go out and pester people. If 
people feel so inclined, they are of course welcome to test out various 
approaches. (they might as well try the original yield-granularity patch 
which also makes the amount of "delay" tunable, so the ideal amount of 
delay can be figured out. And of course they should also try the 
existing yield flag.)

	Ingo

^ permalink raw reply	[flat|nested] 71+ messages in thread

* Re: yield
  2007-10-03 11:56                                                   ` yield Ingo Molnar
@ 2007-10-03 12:16                                                     ` Jarek Poplawski
  0 siblings, 0 replies; 71+ messages in thread
From: Jarek Poplawski @ 2007-10-03 12:16 UTC (permalink / raw)
  To: Ingo Molnar; +Cc: Dmitry Adamushko, David Schwartz, linux-kernel

On Wed, Oct 03, 2007 at 01:56:46PM +0200, Ingo Molnar wrote:
> 
> * Jarek Poplawski <jarkao2@o2.pl> wrote:
> 
> > On Wed, Oct 03, 2007 at 12:55:34PM +0200, Dmitry Adamushko wrote:
> > ...
> > > just a quick patch, not tested and I've not evaluated all possible
> > > implications yet.
> > > But someone might give it a try with his/(her -- are even more
> > > welcomed :-) favourite sched_yield() load.
> > 
> > Of course, after some evaluation by yourself and Ingo, the most 
> > interesting thing would be Martin Michlmayr's testing, so I hope you'll Cc 
> > him too?!
> 
> My current take on this: queueing the current task right at the next 
> position in the tree (which is what this patch achieves in essence) was 
> one of the yield implementations we already tried in CFS, but it didnt 
> meet the expectations of some apps. So i can only repeat my argument: 
> this is not something that can be "solved" in the way you imagine and 
> your arguments just reiterate the path that CFS has already taken in the 
> past. So please do not expect _us_ to go out and pester people. If 
> people feel so inclined, they are of course welcome to test out various 
> approaches. (they might as well try the original yield-granularity patch 
> which also makes the amount of "delay" tunable, so the ideal amount of 
> delay can be figured out. And of course they should also try the 
> existing yield flag.)

I'm terribly sorry! Of course, the last thing I would like is to
pester anybody. I simply wasn't sure you were talking about the same
idea. And of course, there is no reason to go back to something that
has already been tried before.

Thanks,
Jarek P.

^ permalink raw reply	[flat|nested] 71+ messages in thread

* RE: Network slowdown due to CFS
  2007-10-01 16:49                             ` David Schwartz
                                                 ` (2 preceding siblings ...)
  2007-10-03 11:31                               ` Helge Hafting
@ 2007-10-04  0:31                               ` Rusty Russell
  3 siblings, 0 replies; 71+ messages in thread
From: Rusty Russell @ 2007-10-04  0:31 UTC (permalink / raw)
  To: davids; +Cc: Ingo Molnar, linux-kernel

On Mon, 2007-10-01 at 09:49 -0700, David Schwartz wrote:
> > * Jarek Poplawski <jarkao2@o2.pl> wrote:
> >
> > > BTW, it looks like risky to criticise sched_yield too much: some
> > > people can misinterpret such discussions and stop using this at all,
> > > even where it's right.
> 
> > Really, i have never seen a _single_ mainstream app where the use of
> > sched_yield() was the right choice.
> 
> It can occasionally be an optimization. You may have a case where you can do
> something very efficiently if a lock is not held, but you cannot afford to
> wait for the lock to be released. So you check the lock, if it's held, you
> yield and then check again. If that fails, you do it the less optimal way
> (for example, dispatching it to a thread that *can* afford to wait).

This used to be true, and still is if you want to be portable.  But the
point of futexes was precisely to attack this use case: whereas
sched_yield() says "I'm waiting for something, but I won't tell you
what" the futex ops tells the kernel what you're waiting for.

While the time to do a futex op is slightly slower than sched_yield(),
futexes win in so many cases that we haven't found a benchmark where
yield wins.  Yield-lose cases include:
1) There are other unrelated processes that yield() ends up queueing
   behind.
2) The process you're waiting for doesn't conveniently sleep as soon as
   it releases the lock, so you wait for longer than intended,
3) You race between the yield and the lock being dropped.

In summary: spin N times & futex seems optimal.  The value of N depends
on the number of CPUs in the machine and other factors, but N=1 has
shown itself pretty reasonable.
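
For illustration, a stripped-down version of the "spin once, then futex"
idea, along the lines of the mutex in Ulrich Drepper's "Futexes Are
Tricky" paper (a sketch with no error handling, not code taken from any
real library):

#include <linux/futex.h>
#include <sys/syscall.h>
#include <unistd.h>

static long futex_wait(int *addr, int val)
{
        return syscall(SYS_futex, addr, FUTEX_WAIT, val, NULL, NULL, 0);
}

static long futex_wake(int *addr, int n)
{
        return syscall(SYS_futex, addr, FUTEX_WAKE, n, NULL, NULL, 0);
}

/* lock word: 0 = unlocked, 1 = locked, 2 = locked with waiters */
static void lock(int *f)
{
        int c;

        /* the "spin N times" part, with N = 1: try to grab a free lock */
        if ((c = __sync_val_compare_and_swap(f, 0, 1)) == 0)
                return;
        do {
                /* mark the lock contended, then sleep in the kernel */
                if (c == 2 || __sync_val_compare_and_swap(f, 1, 2) != 0)
                        futex_wait(f, 2);
        } while ((c = __sync_val_compare_and_swap(f, 0, 2)) != 0);
}

static void unlock(int *f)
{
        if (__sync_fetch_and_sub(f, 1) != 1) {          /* there were waiters */
                *f = 0;
                futex_wake(f, 1);                       /* wake exactly one */
        }
}

Note how the waiter never blindly yields: it tells the kernel exactly
which word it is waiting on.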

Hope that helps,
Rusty.


^ permalink raw reply	[flat|nested] 71+ messages in thread

* Re: Network slowdown due to CFS
  2007-10-03  9:10                                           ` Ingo Molnar
  2007-10-03  9:50                                             ` Jarek Poplawski
@ 2007-10-04  5:33                                             ` Casey Dahlin
  1 sibling, 0 replies; 71+ messages in thread
From: Casey Dahlin @ 2007-10-04  5:33 UTC (permalink / raw)
  To: Ingo Molnar; +Cc: Jarek Poplawski, David Schwartz, linux-kernel

Ingo Molnar wrote:
> * Jarek Poplawski <jarkao2@o2.pl> wrote:
>   
>> [...] (Btw, in -rc8-mm2 I see new sched_slice() function which seems 
>> to return... time.)
>>     
>
> wrong again. That is a function, not a variable to be cleared.

It still gives us a target time, so could we not simply have sched_yield 
put the thread completely to sleep for the given amount of time? It 
wholly redefines the operation, and it's far more expensive (now there's 
a whole new timer involved), but it might emulate the expected behavior. 
It's hideous, but so is sched_yield in the first place, so why not?

--CJD

^ permalink raw reply	[flat|nested] 71+ messages in thread

* Re: Network slowdown due to CFS
  2007-10-03  9:50                                             ` Jarek Poplawski
  2007-10-03 10:55                                               ` Dmitry Adamushko
@ 2007-10-07  7:18                                               ` Ingo Molnar
  1 sibling, 0 replies; 71+ messages in thread
From: Ingo Molnar @ 2007-10-07  7:18 UTC (permalink / raw)
  To: Jarek Poplawski; +Cc: David Schwartz, linux-kernel


* Jarek Poplawski <jarkao2@o2.pl> wrote:

> > [...] The timeslices of tasks (i.e. the time they spend on a CPU 
> > without scheduling away) is _not_ maintained directly in CFS as a 
> > per-task variable that can be "cleared", it's not the metric that 
> > drives scheduling. Yes, of course CFS too "slices up CPU time", but 
> > those slices are not the per-task variables of traditional 
> > schedulers and cannot be 'cleared'.
> 
> It's not about this comment alone, but this comment plus the "no notion" 
> comment, which appears in sched-design-CFS.txt too.

ok - i've re-read it and it indeed is somewhat confusing without 
additional context. I'll improve the wording. (sched-design-CFS.txt 
needs an update anyway)

	Ingo

^ permalink raw reply	[flat|nested] 71+ messages in thread

* Re: yield API
  2007-10-02  6:46                                   ` yield API Ingo Molnar
  2007-10-02 11:50                                     ` linux-os (Dick Johnson)
  2007-10-02 21:57                                     ` Eric St-Laurent
@ 2007-12-12 22:39                                     ` Jesper Juhl
  2007-12-13  4:43                                       ` Kyle Moffett
  2 siblings, 1 reply; 71+ messages in thread
From: Jesper Juhl @ 2007-12-12 22:39 UTC (permalink / raw)
  To: Ingo Molnar; +Cc: David Schwartz, linux-kernel

On 02/10/2007, Ingo Molnar <mingo@elte.hu> wrote:
>
> * David Schwartz <davids@webmaster.com> wrote:
>
> > > These are generic statements, but i'm _really_ interested in the
> > > specifics. Real, specific code that i can look at. The typical Linux
> > > distro consists of in excess of 500 million lines of code, in
> > > tens of thousands of apps, so there really must be some good, valid
> > > and "right" use of sched_yield() somewhere in there, in some
> > > mainstream app, right? (because, as you might have guessed it, in
> > > the past decade of sched_yield() existence i _have_ seen my share of
> > > sched_yield() utilizing user-space code, and at the moment i'm not
> > > really impressed by those examples.)
> >
> > Maybe, maybe not. Even if so, it would be very difficult to find.
> > Simply grepping for sched_yield is not going to help because
> > determining whether a given use of sched_yield is smart is not going
> > to be easy.
>
> sched_yield() has been around for a decade (about three times longer
> than futexes were around), so if it's useful, it sure should have grown
> some 'crown jewel' app that uses it and shows off its advantages,
> compared to other locking approaches, right?
>

I have one example of sched_yield() use in a real app. Unfortunately
it's proprietary so I can't show you the source, but I can tell you
how it's used.

The case is this:  Process A forks process B. Process B does some work
that takes approximately between 50 and 1000ms to complete (varies),
then it creates a file and continues to do other work.  Process A
needs to wait for the file B creates before it can continue.
Process A *could* immediately go into some kind of "check for file;
sleep n ms" loop, but instead it starts off by calling sched_yield()
to give process B a chance to run and hopefully get to the point where
it has created the file before process A is again scheduled and starts
to look for it - after the single sched yield call, process A does
indeed go into a "check for file; sleep 250ms;" loop, but most of the
time the initial sched_yield() call actually results in the file being
present without having to loop like that.
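
In rough C, that pattern amounts to something like the following (a
sketch only; the flag file name, the helper functions and the 250 ms
interval are made up to match the description above):

#include <sched.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/wait.h>

#define FLAG_FILE "/tmp/b-is-ready"     /* made-up name for the file B creates */

extern void do_expensive_setup(void);   /* placeholder: the 50-1000 ms of work */
extern void create_flag_file(const char *path);          /* placeholder */
extern void do_other_work(void);        /* placeholder */

int main(void)
{
        pid_t pid = fork();

        if (pid == 0) {                         /* process B */
                do_expensive_setup();
                create_flag_file(FLAG_FILE);
                do_other_work();
                _exit(0);
        }

        /* process A: give B a chance to get the file created first */
        sched_yield();
        while (access(FLAG_FILE, F_OK) != 0)
                usleep(250 * 1000);             /* then fall back to polling */

        /* ... continue with whatever needs the file, then reap B ... */
        waitpid(pid, NULL, 0);
        return 0;
}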

Now is this the best way to handle this situation? No.
Does it work better than just doing the wait loop from the start? Yes.
Is this a good way to use sched_yield()? Maybe, maybe not.  But it
*is* an actual use of the API in a real app.

> For example, if you asked me whether pipes are the best thing for
> certain apps, i could immediately show you tons of examples where they
> are. Same for sockets. Or RT priorities. Or nice levels. Or futexes. Or
> just about any other core kernel concept or API.

True. But in the app I'm talking about above, rewriting the code to
communicate over a pipe, socket or anything else would have been too
large a change to make (released product, can't risk introducing (new)
bugs).

>Your notion that
> showing a good example of an API would be "difficult" because it's hard
> to determine "smart" use is not tenable i believe and does not
> adequately refute my pretty plain-meaning "it does not exist" assertion.
>

I agree that sched_yield() is not a very good API. I also agree that
our use of it is not the best solution to the problem we wanted to
solve, but it actually works pretty well most of the time.

> If so, then this is one more supporting proof of the fundamental weakness
> of the sched_yield() API. Rarely are we able to so universally condemn
> an API: real-life is usually more varied and even for theoretically
> poorly defined APIs _some_ sort of legitimate use does grow up.
>
> APIs that are not in any real, meaningful use, despite a decade of
> presence are not really interesting to me personally. (especially in
> this case where we know exactly _why_ the API is used so rarely.) Sure
> we'll continue to support it in the best possible way, with the usual
> kernel maintenance policy: without hurting other, more commonly used
> APIs. That was the principle we followed in previous schedulers too. And
> if anyone has a patch to make sched_yield() better than it is today, i'm
> of course interested in it.
>
Just for the record; for our use, sched_yield() seems to work just
fine both with older and newer kernels, so from my point of view the
new scheduler is doing fine in this regard.

-- 
Jesper Juhl <jesper.juhl@gmail.com>
Don't top-post  http://www.catb.org/~esr/jargon/html/T/top-post.html
Plain text mails only, please      http://www.expita.com/nomime.html

^ permalink raw reply	[flat|nested] 71+ messages in thread

* Re: yield API
  2007-12-12 22:39                                     ` Jesper Juhl
@ 2007-12-13  4:43                                       ` Kyle Moffett
  2007-12-13 20:10                                         ` David Schwartz
  0 siblings, 1 reply; 71+ messages in thread
From: Kyle Moffett @ 2007-12-13  4:43 UTC (permalink / raw)
  To: Jesper Juhl; +Cc: Ingo Molnar, David Schwartz, linux-kernel

On Dec 12, 2007, at 17:39:15, Jesper Juhl wrote:
> On 02/10/2007, Ingo Molnar <mingo@elte.hu> wrote:
>> sched_yield() has been around for a decade (about three times  
>> longer than futexes were around), so if it's useful, it sure  
>> should have grown some 'crown jewel' app that uses it and shows  
>> off its advantages, compared to other locking approaches, right?
>
> I have one example of sched_yield() use in a real app.  
> Unfortunately it's proprietary so I can't show you the source, but  
> I can tell you how it's used.
>
> The case is this:  Process A forks process B. Process B does some  
> work that takes approximately between 50 and 1000ms to complete  
> (varies), then it creates a file and continues to do other work.   
> Process A needs to wait for the file B creates before it can  
> continue. Process A *could* immediately go into some kind of "check  
> for file; sleep n ms" loop, but instead it starts off by calling  
> sched_yield() to give process B a chance to run and hopefully get  
> to the point where it has created the file before process A is  
> again scheduled and starts to look for it - after the single sched  
> yield call, process A does indeed go into a "check for file; sleep  
> 250ms;" loop, but most of the time the initial sched_yield() call  
> actually results in the file being present without having to loop  
> like that.

That is a *terrible* disgusting way to use yield.  Better options:
   (1) inotify/dnotify
   (2) create a "foo.lock" file and put the mutex in that
   (3) just start with the check-file-and-sleep loop.
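
For what it's worth, a sketch of what option (1) could look like for the
scenario above (Linux-specific; it assumes the directory the file shows
up in is known, and it leaves out error handling):

#include <sys/inotify.h>
#include <limits.h>
#include <string.h>
#include <unistd.h>

/* Sketch: block until "name" shows up in "dir" (no error handling). */
static void wait_for_file(const char *dir, const char *name)
{
        char buf[sizeof(struct inotify_event) + NAME_MAX + 1];
        int fd = inotify_init();
        int done = 0;

        inotify_add_watch(fd, dir, IN_CREATE | IN_MOVED_TO);
        /* a real version would also check here whether the file already
           exists, to avoid racing with its creation */
        while (!done) {
                ssize_t len = read(fd, buf, sizeof(buf)); /* blocks until events arrive */
                char *p;

                if (len <= 0)
                        break;
                for (p = buf; p < buf + len && !done; ) {
                        struct inotify_event *ev = (struct inotify_event *)p;

                        if (ev->len && strcmp(ev->name, name) == 0)
                                done = 1;
                        p += sizeof(*ev) + ev->len;
                }
        }
        close(fd);
}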


> Now is this the best way to handle this situation? No.  Does it  
> work better than just doing the wait loop from the start? Yes.

It works better than doing the wait-loop from the start?  What  
evidence do you provide to support this assertion?  Specifically, in  
the first case you tell the kernel "I'm waiting for something but I  
don't know what it is or how long it will take"; while in the second  
case you tell the kernel "I'm waiting for something that will take  
exactly X milliseconds, even though I don't know what it is".  If you  
really want something similar to the old behavior, then just replace  
the "sched_yield()" call with a proper sleep for the estimated time  
it will take the program to create the file.


> Is this a good way to use sched_yield()? Maybe, maybe not.  But it  
> *is* an actual use of the API in a real app.

We weren't looking for "actual uses", especially not in binary-only  
apps.  What we are looking for is optimal uses of sched_yield(); ones  
where that is the best alternative.  This... certainly isn't.

Cheers,
Kyle Moffett


^ permalink raw reply	[flat|nested] 71+ messages in thread

* RE: yield API
  2007-12-13  4:43                                       ` Kyle Moffett
@ 2007-12-13 20:10                                         ` David Schwartz
  0 siblings, 0 replies; 71+ messages in thread
From: David Schwartz @ 2007-12-13 20:10 UTC (permalink / raw)
  To: mrmacman_g4, Jesper Juhl; +Cc: Ingo Molnar, linux-kernel


Kyle Moffett wrote:

> That is a *terrible* disgusting way to use yield.  Better options:
>    (1) inotify/dnotify

Sure, tie yourself to a Linux-specific mechanism that may or may not work
over things like NFS. That's much worse.

>    (2) create a "foo.lock" file and put the mutex in that

Right, tie yourself to process-shared mutexes which historically weren't
available on Linux. That's much better than an option that's been stable for
a decade.

>    (3) just start with the check-file-and-sleep loop.

How is that better? There is literally no improvement, since the first check
will (almost) always fail.

> > Now is this the best way to handle this situation? No.  Does it
> > work better than just doing the wait loop from the start? Yes.
>
> It works better than doing the wait-loop from the start?  What
> evidence do you provide to support this assertion?

The evidence is that more than half the time, this avoids the sleep. That
means it has zero cost, since the yield is no heavier than a sleep would be,
and has a possible benefit, since the first sleep may be too long.

> Specifically, in
> the first case you tell the kernel "I'm waiting for something but I
> don't know what it is or how long it will take"; while in the second
> case you tell the kernel "I'm waiting for something that will take
> exactly X milliseconds, even though I don't know what it is.  If you
> really want something similar to the old behavior then just replace
> the "sched_yield()" call with a proper sleep for the estimated time
> it will take the program to create the file.

The problem is that if the estimate is too short, pre-emption will result in
a huge performance drop. If the estimate is too long, there will be some
wasted CPU. What was the claimed benefit of doing this again?

> > Is this a good way to use sched_yield()? Maybe, maybe not.  But it
> > *is* an actual use of the API in a real app.

> We weren't looking for "actual uses", especially not in binary-only
> apps.  What we are looking for is optimal uses of sched_yield(); ones
> where that is the best alternative.  This... certainly isn't.

Your standards for "optimal" are totally unrealistic. In his case, it was
optimal. Using platform-specific optimizations would have meant more
development and test time for minimal benefit. Sleeping first would have had
some performance cost and no benefit. In his case, sched_yield was optimal.
Really.

DS



^ permalink raw reply	[flat|nested] 71+ messages in thread

* Re: Network slowdown due to CFS
@ 2007-10-01 22:27 Hubert Tonneau
  0 siblings, 0 replies; 71+ messages in thread
From: Hubert Tonneau @ 2007-10-01 22:27 UTC (permalink / raw)
  To: linux-kernel

Ingo Molnar wrote:
>
> Really, i have never seen a _single_ mainstream app where the use of
> sched_yield() was the right choice.

Pliant's 'FastSem' semaphore implementation (as opposed to 'Sem') uses 'yield':
http://old.fullpliant.org/

Basically, if the resource you are protecting with the semaphore will be held
for a significant time, then a full semaphore might be better, but if the
resource will be held for just a few cycles, then lightweight acquiring might
bring the best result, because the most significant cost is in acquiring/releasing.

So the acquiring algorithm for fast semaphores might be:
try to acquire with a hardware atomic read-and-set instruction, then if it fails,
call yield and retry (at least on a single-processor, single-core system).
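
In C-ish form, that acquire path is roughly the following (a sketch
only; a real implementation would presumably bound the retries and fall
back to a full semaphore):

#include <sched.h>

static volatile int fastsem;    /* 0 = free, non-zero = held */

static void fastsem_acquire(void)
{
        /* hardware atomic test-and-set; if it fails, yield and retry */
        while (__sync_lock_test_and_set(&fastsem, 1) != 0)
                sched_yield();
}

static void fastsem_release(void)
{
        __sync_lock_release(&fastsem);
}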



^ permalink raw reply	[flat|nested] 71+ messages in thread

end of thread, other threads:[~2007-12-13 20:11 UTC | newest]

Thread overview: 71+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2007-09-26  8:52 Network slowdown due to CFS Martin Michlmayr
2007-09-26  9:34 ` Ingo Molnar
2007-09-26  9:47   ` Ingo Molnar
2007-09-26 10:08     ` Martin Michlmayr
2007-09-26 10:18       ` Ingo Molnar
2007-09-26 10:20 ` Mike Galbraith
2007-09-26 10:23 ` Mike Galbraith
2007-09-26 10:48   ` Martin Michlmayr
2007-09-26 11:21     ` Ingo Molnar
2007-09-26 11:29       ` Martin Michlmayr
2007-09-26 12:00         ` David Schwartz
2007-09-26 13:31           ` Ingo Molnar
2007-09-26 15:40             ` Stephen Hemminger
2007-09-26 15:46             ` Stephen Hemminger
2007-09-27  9:30             ` Jarek Poplawski
2007-09-27  9:46               ` Ingo Molnar
2007-09-27 12:27                 ` Jarek Poplawski
2007-09-27 13:31                   ` Ingo Molnar
2007-09-27 14:42                     ` Jarek Poplawski
2007-09-28  6:10                       ` Nick Piggin
2007-10-01  8:43                         ` Jarek Poplawski
2007-10-01 16:25                           ` Ingo Molnar
2007-10-01 16:49                             ` David Schwartz
2007-10-01 17:31                               ` Ingo Molnar
2007-10-01 18:23                                 ` David Schwartz
2007-10-02  6:06                                   ` Ingo Molnar
2007-10-02  6:47                                     ` Andi Kleen
2007-10-03  8:02                                     ` Jarek Poplawski
2007-10-03  8:16                                       ` Ingo Molnar
2007-10-03  8:56                                         ` Jarek Poplawski
2007-10-03  9:10                                           ` Ingo Molnar
2007-10-03  9:50                                             ` Jarek Poplawski
2007-10-03 10:55                                               ` Dmitry Adamushko
2007-10-03 10:58                                                 ` Dmitry Adamushko
2007-10-03 11:20                                                   ` Jarek Poplawski
2007-10-03 11:22                                                 ` Ingo Molnar
2007-10-03 11:40                                                 ` Jarek Poplawski
2007-10-03 11:56                                                   ` yield Ingo Molnar
2007-10-03 12:16                                                     ` yield Jarek Poplawski
2007-10-07  7:18                                               ` Network slowdown due to CFS Ingo Molnar
2007-10-04  5:33                                             ` Casey Dahlin
2007-10-02  6:08                                   ` Ingo Molnar
2007-10-02  6:26                                   ` Ingo Molnar
2007-10-02  6:46                                   ` yield API Ingo Molnar
2007-10-02 11:50                                     ` linux-os (Dick Johnson)
2007-10-02 15:24                                       ` Douglas McNaught
2007-10-02 21:57                                     ` Eric St-Laurent
2007-12-12 22:39                                     ` Jesper Juhl
2007-12-13  4:43                                       ` Kyle Moffett
2007-12-13 20:10                                         ` David Schwartz
2007-10-01 19:53                               ` Network slowdown due to CFS Arjan van de Ven
2007-10-01 22:17                                 ` David Schwartz
2007-10-01 22:35                                   ` Arjan van de Ven
2007-10-01 22:44                                     ` David Schwartz
2007-10-01 22:55                                       ` Arjan van de Ven
2007-10-02 15:37                                         ` David Schwartz
2007-10-03  7:15                                           ` Jarek Poplawski
2007-10-03 11:31                               ` Helge Hafting
2007-10-04  0:31                               ` Rusty Russell
2007-10-01 16:55                             ` Chris Friesen
2007-10-01 17:09                               ` Ingo Molnar
2007-10-01 17:45                                 ` Chris Friesen
2007-10-01 19:09                                   ` iperf yield usage Ingo Molnar
2007-10-02  9:03                             ` Network slowdown due to CFS Jarek Poplawski
2007-10-02 13:39                               ` Jarek Poplawski
2007-10-02  9:26                           ` Jarek Poplawski
2007-09-27  9:49         ` Ingo Molnar
2007-09-27 10:54           ` Martin Michlmayr
2007-09-27 10:56             ` Ingo Molnar
2007-09-27 11:12               ` Martin Michlmayr
2007-10-01 22:27 Hubert Tonneau
