All of lore.kernel.org
 help / color / mirror / Atom feed
* nonblocking UDPv4 recvfrom() taking 4usec @ 3GHz?
@ 2007-02-19 23:14 bert hubert
  2007-02-19 23:56 ` Stephen Hemminger
  2007-02-20 10:50 ` Andi Kleen
  0 siblings, 2 replies; 26+ messages in thread
From: bert hubert @ 2007-02-19 23:14 UTC (permalink / raw)
  To: netdev

Hi people,

I'm trying to save people the cost of buying extra servers by making
PowerDNS (GPL) ever faster, but I've hit a rather fundamental problem.

Linux 2.6.20-rc4 appears to take 4 microseconds on my P4 3GHz for a
non-blocking UDPv4 recvfrom() call, both on loopback and ethernet.

Linux 2.6.18 on my 64 bit Athlon64 3200+ takes a similar amount of time.

This seems like rather a lot for a 50 byte datagram, but perhaps I'm
overestimating your abilities :-)

The program is unthreaded, and I measure like this:

#define RDTSC(qp) \
do { \
  unsigned long lowPart, highPart;					\
  __asm__ __volatile__("rdtsc" : "=a" (lowPart), "=d" (highPart)); \
    qp = (((unsigned long long) highPart) << 32) | lowPart; \
} while (0)

...

uint64_t tsc1, tsc2;
RDTSC(tsc1);      

if((len=recvfrom(fd, data, sizeof(data), 0, (sockaddr *)&fromaddr, &addrlen)) >= 0) { 
    RDTSC(tsc2);      
    printf("%f\n", (tsc2-tsc1)/3000.0);  // 3GHz P4
}

gdb generates the following dump from the actual program,
x=_Z20handleNewUDPQuestioniRN5boost3anyE, I see nothing untoward happening
between the two 'rdtsc' opcodes.

0x08091de0 <x+0>:  push   %ebp
0x08091de1 <x+1>:  mov    %esp,%ebp
0x08091de3 <x+3>:  push   %edi
0x08091de4 <x+4>:  push   %esi
0x08091de5 <x+5>:  push   %ebx
0x08091de6 <x+6>:  sub    $0x78c,%esp
0x08091dec <x+12>: mov    %gs:0x14,%eax
0x08091df2 <x+18>: mov    %eax,0xffffffe4(%ebp)
0x08091df5 <x+21>: xor    %eax,%eax
0x08091df7 <x+23>: movw   $0x2,0xffffffac(%ebp)
0x08091dfd <x+29>: movl   $0x0,0xffffffb0(%ebp)
0x08091e04 <x+36>: movw   $0x0,0xffffffae(%ebp)
0x08091e0a <x+42>: movl   $0x1c,0xfffff8f4(%ebp)
0x08091e14 <x+52>: rdtsc  
0x08091e16 <x+54>: mov    %edx,%ebx
0x08091e18 <x+56>: mov    0x8(%ebp),%edx
0x08091e1b <x+59>: mov    %eax,%esi
0x08091e1d <x+61>: lea    0xfffff8f4(%ebp),%eax
0x08091e23 <x+67>: mov    %eax,0x14(%esp)
0x08091e27 <x+71>: lea    0xffffffac(%ebp),%ecx
0x08091e2a <x+74>: lea    0xfffff950(%ebp),%eax
0x08091e30 <x+80>: mov    %ecx,0x10(%esp)
0x08091e34 <x+84>: movl   $0x0,0xc(%esp)
0x08091e3c <x+92>: movl   $0x5dc,0x8(%esp)
0x08091e44 <x+100>:        mov    %eax,0x4(%esp)
0x08091e48 <x+104>:        mov    %edx,(%esp)
0x08091e4b <x+107>:        call   0x8192110 <recvfrom>
0x08091e50 <x+112>:        test   %eax,%eax
0x08091e52 <x+114>:        mov    %eax,0xfffff8b0(%ebp)
0x08091e58 <x+120>:        js     0x8092168 <x+904>
0x08091e5e <x+126>:        mov    %ebx,%eax
0x08091e60 <x+128>:        xor    %edx,%edx
0x08091e62 <x+130>:        mov    %eax,%edx
0x08091e64 <x+132>:        mov    $0x0,%eax
0x08091e69 <x+137>:        mov    %esi,%ecx
0x08091e6b <x+139>:        mov    %eax,%esi
0x08091e6d <x+141>:        or     %ecx,%esi
0x08091e6f <x+143>:        mov    %edx,%edi
0x08091e71 <x+145>:        rdtsc  
0x08091e73 <x+147>:        mov    %eax,0xfffff8a0(%ebp)
0x08091e79 <x+153>:        mov    0xfffff8a0(%ebp),%eax
0x08091e7f <x+159>:        mov    %edx,%ecx
0x08091e81 <x+161>:        xor    %ebx,%ebx
0x08091e83 <x+163>:        mov    %ecx,%ebx

recvfrom itself is a tad worrisome, x=recvfrom. I didn't ask for the
'libc_enable_asynccancel' stuff. I'm trying to isolate the actual syscall
but it is proving hard work for an assemnly newbie like me - socketcall
doesn't make things easier.

0xb7d62410 <x+0>:        cmpl   $0x0,%gs:0xc
0xb7d62418 <x+8>:        jne    0xb7d62439 <x+41>
0xb7d6241a <x+10>:       mov    %ebx,%edx
0xb7d6241c <x+12>:       mov    $0x66,%eax
0xb7d62421 <x+17>:       mov    $0xc,%ebx
0xb7d62426 <x+22>:       lea    0x4(%esp),%ecx
0xb7d6242a <x+26>:       call   *%gs:0x10
0xb7d62431 <x+33>:       mov    %edx,%ebx
0xb7d62433 <x+35>:       cmp    $0xffffff83,%eax
0xb7d62436 <x+38>:       jae    0xb7d62469 <x+89>
0xb7d62438 <x+40>:       ret    
0xb7d62439 <x+41>:       push   %esi
0xb7d6243a <x+42>:       call   0xb7d6ddd0 <__libc_enable_asynccancel>
0xb7d6243f <x+47>:       mov    %eax,%esi
0xb7d62441 <x+49>:       mov    %ebx,%edx
0xb7d62443 <x+51>:       mov    $0x66,%eax
0xb7d62448 <x+56>:       mov    $0xc,%ebx
0xb7d6244d <x+61>:       lea    0x8(%esp),%ecx
0xb7d62451 <x+65>:       call   *%gs:0x10
0xb7d62458 <x+72>:       mov    %edx,%ebx
0xb7d6245a <x+74>:       xchg   %eax,%esi
0xb7d6245b <x+75>:       call   0xb7d6dd90 <__libc_disable_asynccancel>
0xb7d62460 <x+80>:       mov    %esi,%eax
0xb7d62462 <x+82>:       pop    %esi
0xb7d62463 <x+83>:       cmp    $0xffffff83,%eax
0xb7d62466 <x+86>:       jae    0xb7d62469 <x+89>
0xb7d62468 <x+88>:       ret    
0xb7d62469 <x+89>:       call   0xb7d998f8 <__i686.get_pc_thunk.cx>
0xb7d6246e <x+94>:       add    $0x61b86,%ecx
0xb7d62474 <x+100>:      mov    0xffffff2c(%ecx),%ecx
0xb7d6247a <x+106>:      xor    %edx,%edx
0xb7d6247c <x+108>:      sub    %eax,%edx
0xb7d6247e <x+110>:      mov    %edx,%gs:(%ecx)
0xb7d62481 <x+113>:      or     $0xffffffff,%eax
0xb7d62484 <x+116>:      jmp    0xb7d62438 <x+40>

Any clues?

-- 
http://www.PowerDNS.com      Open source, database driven DNS Software 
http://netherlabs.nl              Open and Closed source services

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: nonblocking UDPv4 recvfrom() taking 4usec @ 3GHz?
  2007-02-19 23:14 nonblocking UDPv4 recvfrom() taking 4usec @ 3GHz? bert hubert
@ 2007-02-19 23:56 ` Stephen Hemminger
  2007-02-20  8:04   ` bert hubert
  2007-02-20 10:50 ` Andi Kleen
  1 sibling, 1 reply; 26+ messages in thread
From: Stephen Hemminger @ 2007-02-19 23:56 UTC (permalink / raw)
  To: bert hubert; +Cc: netdev

On Tue, 20 Feb 2007 00:14:47 +0100
bert hubert <bert.hubert@netherlabs.nl> wrote:

> Hi people,
> 
> I'm trying to save people the cost of buying extra servers by making
> PowerDNS (GPL) ever faster, but I've hit a rather fundamental problem.
> 
> Linux 2.6.20-rc4 appears to take 4 microseconds on my P4 3GHz for a
> non-blocking UDPv4 recvfrom() call, both on loopback and ethernet.
> 
> Linux 2.6.18 on my 64 bit Athlon64 3200+ takes a similar amount of time.
> 
> This seems like rather a lot for a 50 byte datagram, but perhaps I'm
> overestimating your abilities :-)
> 
> The program is unthreaded, and I measure like this:
> 
> #define RDTSC(qp) \
> do { \
>   unsigned long lowPart, highPart;					\
>   __asm__ __volatile__("rdtsc" : "=a" (lowPart), "=d" (highPart)); \
>     qp = (((unsigned long long) highPart) << 32) | lowPart; \
> } while (0)
> 
> ...
> 
> uint64_t tsc1, tsc2;
> RDTSC(tsc1);      
> 
> if((len=recvfrom(fd, data, sizeof(data), 0, (sockaddr *)&fromaddr, &addrlen)) >= 0) { 
>     RDTSC(tsc2);      
>     printf("%f\n", (tsc2-tsc1)/3000.0);  // 3GHz P4
> }
> 
> gdb generates the following dump from the actual program,
> x=_Z20handleNewUDPQuestioniRN5boost3anyE, I see nothing untoward happening
> between the two 'rdtsc' opcodes.
> 
> 0x08091de0 <x+0>:  push   %ebp
> 0x08091de1 <x+1>:  mov    %esp,%ebp
> 0x08091de3 <x+3>:  push   %edi
> 0x08091de4 <x+4>:  push   %esi
> 0x08091de5 <x+5>:  push   %ebx
> 0x08091de6 <x+6>:  sub    $0x78c,%esp
> 0x08091dec <x+12>: mov    %gs:0x14,%eax
> 0x08091df2 <x+18>: mov    %eax,0xffffffe4(%ebp)
> 0x08091df5 <x+21>: xor    %eax,%eax
> 0x08091df7 <x+23>: movw   $0x2,0xffffffac(%ebp)
> 0x08091dfd <x+29>: movl   $0x0,0xffffffb0(%ebp)
> 0x08091e04 <x+36>: movw   $0x0,0xffffffae(%ebp)
> 0x08091e0a <x+42>: movl   $0x1c,0xfffff8f4(%ebp)
> 0x08091e14 <x+52>: rdtsc  
> 0x08091e16 <x+54>: mov    %edx,%ebx
> 0x08091e18 <x+56>: mov    0x8(%ebp),%edx
> 0x08091e1b <x+59>: mov    %eax,%esi
> 0x08091e1d <x+61>: lea    0xfffff8f4(%ebp),%eax
> 0x08091e23 <x+67>: mov    %eax,0x14(%esp)
> 0x08091e27 <x+71>: lea    0xffffffac(%ebp),%ecx
> 0x08091e2a <x+74>: lea    0xfffff950(%ebp),%eax
> 0x08091e30 <x+80>: mov    %ecx,0x10(%esp)
> 0x08091e34 <x+84>: movl   $0x0,0xc(%esp)
> 0x08091e3c <x+92>: movl   $0x5dc,0x8(%esp)
> 0x08091e44 <x+100>:        mov    %eax,0x4(%esp)
> 0x08091e48 <x+104>:        mov    %edx,(%esp)
> 0x08091e4b <x+107>:        call   0x8192110 <recvfrom>
> 0x08091e50 <x+112>:        test   %eax,%eax
> 0x08091e52 <x+114>:        mov    %eax,0xfffff8b0(%ebp)
> 0x08091e58 <x+120>:        js     0x8092168 <x+904>
> 0x08091e5e <x+126>:        mov    %ebx,%eax
> 0x08091e60 <x+128>:        xor    %edx,%edx
> 0x08091e62 <x+130>:        mov    %eax,%edx
> 0x08091e64 <x+132>:        mov    $0x0,%eax
> 0x08091e69 <x+137>:        mov    %esi,%ecx
> 0x08091e6b <x+139>:        mov    %eax,%esi
> 0x08091e6d <x+141>:        or     %ecx,%esi
> 0x08091e6f <x+143>:        mov    %edx,%edi
> 0x08091e71 <x+145>:        rdtsc  
> 0x08091e73 <x+147>:        mov    %eax,0xfffff8a0(%ebp)
> 0x08091e79 <x+153>:        mov    0xfffff8a0(%ebp),%eax
> 0x08091e7f <x+159>:        mov    %edx,%ecx
> 0x08091e81 <x+161>:        xor    %ebx,%ebx
> 0x08091e83 <x+163>:        mov    %ecx,%ebx
> 
> recvfrom itself is a tad worrisome, x=recvfrom. I didn't ask for the
> 'libc_enable_asynccancel' stuff. I'm trying to isolate the actual syscall
> but it is proving hard work for an assemnly newbie like me - socketcall
> doesn't make things easier.
> 
> 0xb7d62410 <x+0>:        cmpl   $0x0,%gs:0xc
> 0xb7d62418 <x+8>:        jne    0xb7d62439 <x+41>
> 0xb7d6241a <x+10>:       mov    %ebx,%edx
> 0xb7d6241c <x+12>:       mov    $0x66,%eax
> 0xb7d62421 <x+17>:       mov    $0xc,%ebx
> 0xb7d62426 <x+22>:       lea    0x4(%esp),%ecx
> 0xb7d6242a <x+26>:       call   *%gs:0x10
> 0xb7d62431 <x+33>:       mov    %edx,%ebx
> 0xb7d62433 <x+35>:       cmp    $0xffffff83,%eax
> 0xb7d62436 <x+38>:       jae    0xb7d62469 <x+89>
> 0xb7d62438 <x+40>:       ret    
> 0xb7d62439 <x+41>:       push   %esi
> 0xb7d6243a <x+42>:       call   0xb7d6ddd0 <__libc_enable_asynccancel>
> 0xb7d6243f <x+47>:       mov    %eax,%esi
> 0xb7d62441 <x+49>:       mov    %ebx,%edx
> 0xb7d62443 <x+51>:       mov    $0x66,%eax
> 0xb7d62448 <x+56>:       mov    $0xc,%ebx
> 0xb7d6244d <x+61>:       lea    0x8(%esp),%ecx
> 0xb7d62451 <x+65>:       call   *%gs:0x10
> 0xb7d62458 <x+72>:       mov    %edx,%ebx
> 0xb7d6245a <x+74>:       xchg   %eax,%esi
> 0xb7d6245b <x+75>:       call   0xb7d6dd90 <__libc_disable_asynccancel>
> 0xb7d62460 <x+80>:       mov    %esi,%eax
> 0xb7d62462 <x+82>:       pop    %esi
> 0xb7d62463 <x+83>:       cmp    $0xffffff83,%eax
> 0xb7d62466 <x+86>:       jae    0xb7d62469 <x+89>
> 0xb7d62468 <x+88>:       ret    
> 0xb7d62469 <x+89>:       call   0xb7d998f8 <__i686.get_pc_thunk.cx>
> 0xb7d6246e <x+94>:       add    $0x61b86,%ecx
> 0xb7d62474 <x+100>:      mov    0xffffff2c(%ecx),%ecx
> 0xb7d6247a <x+106>:      xor    %edx,%edx
> 0xb7d6247c <x+108>:      sub    %eax,%edx
> 0xb7d6247e <x+110>:      mov    %edx,%gs:(%ecx)
> 0xb7d62481 <x+113>:      or     $0xffffffff,%eax
> 0xb7d62484 <x+116>:      jmp    0xb7d62438 <x+40>
> 
> Any clues?
> 

Use oprofile to find the hotspot.

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: nonblocking UDPv4 recvfrom() taking 4usec @ 3GHz?
  2007-02-19 23:56 ` Stephen Hemminger
@ 2007-02-20  8:04   ` bert hubert
  0 siblings, 0 replies; 26+ messages in thread
From: bert hubert @ 2007-02-20  8:04 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: netdev

On Mon, Feb 19, 2007 at 03:56:23PM -0800, Stephen Hemminger wrote:

> > Linux 2.6.20-rc4 appears to take 4 microseconds on my P4 3GHz for a
> > non-blocking UDPv4 recvfrom() call, both on loopback and ethernet.
> > 
> > Linux 2.6.18 on my 64 bit Athlon64 3200+ takes a similar amount of time.

> > recvfrom itself is a tad worrisome, x=recvfrom. I didn't ask for the
> > 'libc_enable_asynccancel' stuff. I'm trying to isolate the actual syscall
> > but it is proving hard work for an assemnly newbie like me - socketcall
> > doesn't make things easier.

Together with Zwane Mwaikambo, we managed to isolate the pure syscall, it
doesn't make a difference, a single recvfrom continues to take around 4
microseconds at 3GHz. Many thanks to Zwane for helping out.

> Use oprofile to find the hotspot.

Will do this next - I need to get me a setup where I can do oprofile *and*
decent query rates, I don't do oprofile on my remote machines I don't have
easy access to.

Thanks.

-- 
http://www.PowerDNS.com      Open source, database driven DNS Software 
http://netherlabs.nl              Open and Closed source services

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: nonblocking UDPv4 recvfrom() taking 4usec @ 3GHz?
  2007-02-19 23:14 nonblocking UDPv4 recvfrom() taking 4usec @ 3GHz? bert hubert
  2007-02-19 23:56 ` Stephen Hemminger
@ 2007-02-20 10:50 ` Andi Kleen
  2007-02-20 16:27   ` all syscalls initially taking 4usec on a P4? " bert hubert
  1 sibling, 1 reply; 26+ messages in thread
From: Andi Kleen @ 2007-02-20 10:50 UTC (permalink / raw)
  To: bert hubert; +Cc: netdev

bert hubert <bert.hubert@netherlabs.nl> writes:

> Hi people,
> 
> I'm trying to save people the cost of buying extra servers by making
> PowerDNS (GPL) ever faster, but I've hit a rather fundamental problem.
> 
> Linux 2.6.20-rc4 appears to take 4 microseconds on my P4 3GHz for a
> non-blocking UDPv4 recvfrom() call, both on loopback and ethernet.

P4s are pretty slow at taking locks (or rather doing atomical operations)
and there are several of them in this path. You could try it with a UP
kernel. Actually hotunplugging the other virtual CPU should be sufficient 
with recent kernels.

Also BTW RDTSC on P4 is not very accurate for small measurements
because it has a quite high overhead by itself, i would suggest
running it in a loop.

-Andi

^ permalink raw reply	[flat|nested] 26+ messages in thread

* all syscalls initially taking 4usec on a P4? Re: nonblocking UDPv4 recvfrom() taking 4usec @ 3GHz?
  2007-02-20 10:50 ` Andi Kleen
@ 2007-02-20 16:27   ` bert hubert
  2007-02-20 16:41     ` Evgeniy Polyakov
  2007-02-20 16:57     ` Eric Dumazet
  0 siblings, 2 replies; 26+ messages in thread
From: bert hubert @ 2007-02-20 16:27 UTC (permalink / raw)
  To: Andi Kleen; +Cc: netdev, linux-kernel

On Tue, Feb 20, 2007 at 11:50:13AM +0100, Andi Kleen wrote:
> P4s are pretty slow at taking locks (or rather doing atomical operations)
> and there are several of them in this path. You could try it with a UP
> kernel. Actually hotunplugging the other virtual CPU should be sufficient 
> with recent kernels.

This is on a UP kernel, on a single CPU. It does have hyperthreading, but
the kernel is uniprocessor, non-preempt. No frequency scaling. Linux
2.6.20-rc4, 2.6.19, 2.6.18, P4, P-M, Athlon 64. Ubunty Edgy Eft on the P4.

> Also BTW RDTSC on P4 is not very accurate for small measurements
> because it has a quite high overhead by itself, i would suggest
> running it in a loop.

I've done so, with some interesting results. Source on
http://ds9a.nl/tmp/recvtimings.c - be careful to adjust the '3000' divider
to your CPU frequency if you care about absolute numbers!

These are two groups, each consisting of 10 consecutive nonblocking UDP
recvfroms, with 10 packets preloaded. Reported is the number of microseconds
per recvfrom call which yielded a packet:

$ ./recvtimings
4.142333
2.237667
1.927333
1.580000
1.770000
1.632333
1.712667
1.685000
1.620000
2.415000
1.347333
1.545000
1.492667
1.902333
1.485000
1.532667
1.460000
1.517667
1.492333
1.580000

This in a nearly quiet P4 - I've removed the first line:
$ vmstat 1
procs -----------memory---------- ---swap-- -----io---- -system-- ----cpu----
 r  b   swpd   free   buff  cache   si   so    bi    bo   in   cs us sy id wa
 0  0      0 290064 307036 296036    0    0     0     0  124   58  0  0 100  0
 0  0      0 289972 307036 296036    0    0     0     4  139   95  0  0 100  0
 0  0      0 289972 307036 296036    0    0     0     0  119   55  0  0 100  0
 1  0      0 289972 307036 296036    0    0     0     0  135   71  0  0 100  0

HZ is clearly 100. If I usleep in between, timings for each recvfrom call
become higher. If I sleep for a full second, I get nearly flat results:
4.250000
5.317667
3.525000
4.147333
3.360000
3.552667
3.087667

Various differing CPUs report more or less the same results. Now I know we
have caching effects, but these effects are HUGE.

Is this supposed to be the case? I'm on an up to date system, glibc 2.4.

	Bert

-- 
http://www.PowerDNS.com      Open source, database driven DNS Software 
http://netherlabs.nl              Open and Closed source services

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: all syscalls initially taking 4usec on a P4? Re: nonblocking UDPv4 recvfrom() taking 4usec @ 3GHz?
  2007-02-20 16:27   ` all syscalls initially taking 4usec on a P4? " bert hubert
@ 2007-02-20 16:41     ` Evgeniy Polyakov
  2007-02-20 17:02       ` bert hubert
                         ` (2 more replies)
  2007-02-20 16:57     ` Eric Dumazet
  1 sibling, 3 replies; 26+ messages in thread
From: Evgeniy Polyakov @ 2007-02-20 16:41 UTC (permalink / raw)
  To: bert hubert; +Cc: Andi Kleen, netdev, linux-kernel

On Tue, Feb 20, 2007 at 05:27:14PM +0100, bert hubert (bert.hubert@netherlabs.nl) wrote:
> I've done so, with some interesting results. Source on
> http://ds9a.nl/tmp/recvtimings.c - be careful to adjust the '3000' divider
> to your CPU frequency if you care about absolute numbers!
> 
> These are two groups, each consisting of 10 consecutive nonblocking UDP
> recvfroms, with 10 packets preloaded. Reported is the number of microseconds
> per recvfrom call which yielded a packet:
> 
> $ ./recvtimings
> 4.142333

It can be recvfrom only problem - syscall overhead on my p4 (core duo,
debian testing) is bout 300 usec - to test I ran 
read('dev/zero', &data, 0)
in a loop.

Could you try to hack recvfrom() for your socket to always copy some
empty buffer and check the results without waiting for packet?

If you are not hurry I can test it myself tomorrow.

-- 
	Evgeniy Polyakov

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: all syscalls initially taking 4usec on a P4? Re: nonblocking UDPv4 recvfrom() taking 4usec @ 3GHz?
  2007-02-20 16:27   ` all syscalls initially taking 4usec on a P4? " bert hubert
  2007-02-20 16:41     ` Evgeniy Polyakov
@ 2007-02-20 16:57     ` Eric Dumazet
  1 sibling, 0 replies; 26+ messages in thread
From: Eric Dumazet @ 2007-02-20 16:57 UTC (permalink / raw)
  To: bert hubert; +Cc: Andi Kleen, netdev, linux-kernel

On Tuesday 20 February 2007 17:27, bert hubert wrote:
> On Tue, Feb 20, 2007 at 11:50:13AM +0100, Andi Kleen wrote:
> > P4s are pretty slow at taking locks (or rather doing atomical operations)
> > and there are several of them in this path. You could try it with a UP
> > kernel. Actually hotunplugging the other virtual CPU should be sufficient
> > with recent kernels.
>
> This is on a UP kernel, on a single CPU. It does have hyperthreading, but
> the kernel is uniprocessor, non-preempt. No frequency scaling. Linux
> 2.6.20-rc4, 2.6.19, 2.6.18, P4, P-M, Athlon 64. Ubunty Edgy Eft on the P4.
>
> > Also BTW RDTSC on P4 is not very accurate for small measurements
> > because it has a quite high overhead by itself, i would suggest
> > running it in a loop.
>
> I've done so, with some interesting results. Source on
> http://ds9a.nl/tmp/recvtimings.c - be careful to adjust the '3000' divider
> to your CPU frequency if you care about absolute numbers!
>
> These are two groups, each consisting of 10 consecutive nonblocking UDP
> recvfroms, with 10 packets preloaded. Reported is the number of
> microseconds per recvfrom call which yielded a packet:
>
> $ ./recvtimings
> 4.142333
> 2.237667
> 1.927333
> 1.580000
> 1.770000
> 1.632333
> 1.712667
> 1.685000
> 1.620000
> 2.415000
> 1.347333
> 1.545000
> 1.492667
> 1.902333
> 1.485000
> 1.532667
> 1.460000
> 1.517667
> 1.492333
> 1.580000
>
> This in a nearly quiet P4 - I've removed the first line:
> $ vmstat 1
> procs -----------memory---------- ---swap-- -----io---- -system--
> ----cpu---- r  b   swpd   free   buff  cache   si   so    bi    bo   in  
> cs us sy id wa 0  0      0 290064 307036 296036    0    0     0     0  124 
>  58  0  0 100  0 0  0      0 289972 307036 296036    0    0     0     4 
> 139   95  0  0 100  0 0  0      0 289972 307036 296036    0    0     0    
> 0  119   55  0  0 100  0 1  0      0 289972 307036 296036    0    0     0  
>   0  135   71  0  0 100  0
>
> HZ is clearly 100. If I usleep in between, timings for each recvfrom call
> become higher. If I sleep for a full second, I get nearly flat results:
> 4.250000
> 5.317667
> 3.525000
> 4.147333
> 3.360000
> 3.552667
> 3.087667
>
> Various differing CPUs report more or less the same results. Now I know we
> have caching effects, but these effects are HUGE.
>
> Is this supposed to be the case? I'm on an up to date system, glibc 2.4.
>
> 	Bert

Here are my results :

(I did change your program, to have the /1600.0 divide)
(my kernel is SMP, but my machine is UP)

$ uname -a
Linux ubuntu 2.6.20 #5 SMP Wed Feb 7 18:32:11 CET 2007 i686 GNU/Linux
$ cat /proc/cpuinfo 
processor       : 0
vendor_id       : GenuineIntel
cpu family      : 6
model           : 13
model name      : Intel(R) Pentium(R) M processor 1.60GHz
stepping        : 8
cpu MHz         : 1596.065
cache size      : 2048 KB
fdiv_bug        : no
hlt_bug         : no
f00f_bug        : no
coma_bug        : no
fpu             : yes
fpu_exception   : yes
cpuid level     : 2
wp              : yes
flags           : fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca 
cmov pat clflush dts acpi mmx fxsr sse sse2 ss tm pbe nx up est tm2
bogomips        : 3195.41
clflush size    : 64

$ ./recvtimings
4.175625
1.398125
1.264375
1.288125
1.102500
1.106875
1.318750
1.276250
1.237500
1.408750
0.971250
1.075625
1.083750
1.075625
1.098125
1.112500
1.109375
1.072500
1.114375
1.080625

For info, getppid() system call gives : 0.156250, and umask(0) gives 0.191875
1.102500 for a much more complex syscall seems OK for me.



^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: all syscalls initially taking 4usec on a P4? Re: nonblocking UDPv4 recvfrom() taking 4usec @ 3GHz?
  2007-02-20 16:41     ` Evgeniy Polyakov
@ 2007-02-20 17:02       ` bert hubert
  2007-02-20 17:11         ` Evgeniy Polyakov
  2007-02-20 18:42       ` Josef Sipek
  2007-02-25 10:41       ` Pavel Machek
  2 siblings, 1 reply; 26+ messages in thread
From: bert hubert @ 2007-02-20 17:02 UTC (permalink / raw)
  To: Evgeniy Polyakov; +Cc: Andi Kleen, netdev, linux-kernel

On Tue, Feb 20, 2007 at 07:41:25PM +0300, Evgeniy Polyakov wrote:

> It can be recvfrom only problem - syscall overhead on my p4 (core duo,
> debian testing) is bout 300 usec - to test I ran read('dev/zero', &data,
> 0) in a loop.

nsec I assume?

The usec numbers for read(fd, &c, 0) where fd is /dev/zero:
1.557667, 0.627667, 0.447333, 0.440000, 0.440000, 0.440000, 0.442333,
0.440000, 0.440000, 0.442333, 0.442333, 0.440000, 0.440000, 0.442333,
0.442667, 0.440000, 0.440000, 0.440000, 0.442333, 0.442667,

In usecs. Notice the same declining figure, but not as pronounced. With a
sleep(1) in between, we get:
1.692667, 1.800000, 0.782667, 1.282667, 0.665000, 0.980000, 0.925000,
0.887667, 0.662667, 0.862667, 1.077333, 1.442333, 0.660000, 1.890000,
0.672333, 0.795000, 0.647667, 0.692333, 0.750000, 0.865000,

This doesn't look all that unhealthy.

> Could you try to hack recvfrom() for your socket to always copy some
> empty buffer and check the results without waiting for packet?

That might be out of my reach before tomorrow :-)

> If you are not hurry I can test it myself tomorrow.

Thanks. My major problem is that in my measurements, I quite often see the
'worst case' 4usec result. It would not be a problem if it happens only
once, of course.

	Bert

-- 
http://www.PowerDNS.com      Open source, database driven DNS Software 
http://netherlabs.nl              Open and Closed source services

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: all syscalls initially taking 4usec on a P4? Re: nonblocking UDPv4 recvfrom() taking 4usec @ 3GHz?
  2007-02-20 17:02       ` bert hubert
@ 2007-02-20 17:11         ` Evgeniy Polyakov
  2007-02-20 17:18           ` Evgeniy Polyakov
  2007-02-21 11:06           ` Evgeniy Polyakov
  0 siblings, 2 replies; 26+ messages in thread
From: Evgeniy Polyakov @ 2007-02-20 17:11 UTC (permalink / raw)
  To: bert hubert; +Cc: Andi Kleen, netdev, linux-kernel

On Tue, Feb 20, 2007 at 06:02:32PM +0100, bert hubert (bert.hubert@netherlabs.nl) wrote:
> On Tue, Feb 20, 2007 at 07:41:25PM +0300, Evgeniy Polyakov wrote:
> 
> > It can be recvfrom only problem - syscall overhead on my p4 (core duo,
> > debian testing) is bout 300 usec - to test I ran read('dev/zero', &data,
> > 0) in a loop.
> 
> nsec I assume?

Yes, sure - mistyped - it is about 300 nanoseconds (280-290 or so).

> The usec numbers for read(fd, &c, 0) where fd is /dev/zero:
> 1.557667, 0.627667, 0.447333, 0.440000, 0.440000, 0.440000, 0.442333,
> 0.440000, 0.440000, 0.442333, 0.442333, 0.440000, 0.440000, 0.442333,
> 0.442667, 0.440000, 0.440000, 0.440000, 0.442333, 0.442667,
> 
> In usecs. Notice the same declining figure, but not as pronounced. With a
> sleep(1) in between, we get:
> 1.692667, 1.800000, 0.782667, 1.282667, 0.665000, 0.980000, 0.925000,
> 0.887667, 0.662667, 0.862667, 1.077333, 1.442333, 0.660000, 1.890000,
> 0.672333, 0.795000, 0.647667, 0.692333, 0.750000, 0.865000,
> 
> This doesn't look all that unhealthy.
> 
> > Could you try to hack recvfrom() for your socket to always copy some
> > empty buffer and check the results without waiting for packet?
> 
> That might be out of my reach before tomorrow :-)

I would try it today - but it is a bit late in Moscow already - and
there are some things to complete yet. So, tomorrow I will create a patch
and run it, but I seriously doubt that there is _that_ high per-recvfrom 
latency.

> > If you are not hurry I can test it myself tomorrow.
> 
> Thanks. My major problem is that in my measurements, I quite often see the
> 'worst case' 4usec result. It would not be a problem if it happens only
> once, of course.

Depending on how frequent is 'quite often' - if it ever changes the
distribution picture, then there is some problem.

> 	Bert

-- 
	Evgeniy Polyakov

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: all syscalls initially taking 4usec on a P4? Re: nonblocking UDPv4 recvfrom() taking 4usec @ 3GHz?
  2007-02-20 17:11         ` Evgeniy Polyakov
@ 2007-02-20 17:18           ` Evgeniy Polyakov
  2007-02-21 11:06           ` Evgeniy Polyakov
  1 sibling, 0 replies; 26+ messages in thread
From: Evgeniy Polyakov @ 2007-02-20 17:18 UTC (permalink / raw)
  To: bert hubert; +Cc: Andi Kleen, netdev, linux-kernel

On Tue, Feb 20, 2007 at 08:11:20PM +0300, Evgeniy Polyakov (johnpol@2ka.mipt.ru) wrote:
> I would try it today - but it is a bit late in Moscow already - and
> there are some things to complete yet. So, tomorrow I will create a patch
> and run it, but I seriously doubt that there is _that_ high per-recvfrom 
> latency.

As of now - syscall which just copies 50 bytes from /dev/zero eats about
400-450 nanosecods per run (core duo 3.7 ghz).

-- 
	Evgeniy Polyakov

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: all syscalls initially taking 4usec on a P4? Re: nonblocking UDPv4 recvfrom() taking 4usec @ 3GHz?
  2007-02-20 16:41     ` Evgeniy Polyakov
  2007-02-20 17:02       ` bert hubert
@ 2007-02-20 18:42       ` Josef Sipek
  2007-02-20 18:48         ` Evgeniy Polyakov
  2007-02-25 10:41       ` Pavel Machek
  2 siblings, 1 reply; 26+ messages in thread
From: Josef Sipek @ 2007-02-20 18:42 UTC (permalink / raw)
  To: Evgeniy Polyakov; +Cc: bert hubert, Andi Kleen, netdev, linux-kernel

On Tue, Feb 20, 2007 at 07:41:25PM +0300, Evgeniy Polyakov wrote:
> On Tue, Feb 20, 2007 at 05:27:14PM +0100, bert hubert (bert.hubert@netherlabs.nl) wrote:
> > I've done so, with some interesting results. Source on
> > http://ds9a.nl/tmp/recvtimings.c - be careful to adjust the '3000' divider
> > to your CPU frequency if you care about absolute numbers!
> > 
> > These are two groups, each consisting of 10 consecutive nonblocking UDP
> > recvfroms, with 10 packets preloaded. Reported is the number of microseconds
> > per recvfrom call which yielded a packet:
> > 
> > $ ./recvtimings
> > 4.142333
> 
> It can be recvfrom only problem - syscall overhead on my p4 (core duo,
> debian testing) is bout 300 usec - to test I ran 
> read('dev/zero', &data, 0)
> in a loop.
 
A better thing would be to use getuid - it turns into just a return with a
memory dereference). I ran it on my 3.06GHz P4 (HT, but only UP kernel),
PREEMPT, HZ=1000...

3.290196 0.470588 0.402614 0.396078 0.393464 0.396078 0.386928 0.386928 0.386928 0.386928
0.386928 0.386928 0.386928 0.386928 0.386928 0.386928 0.386928 0.386928 0.386928 0.386928
0.386928 0.386928 0.386928 0.386928 0.386928 0.386928 0.386928 0.386928 0.386928 0.386928
0.386928 0.386928 0.386928 0.386928 0.386928 0.386928 0.386928 0.386928 0.386928 0.386928

I used the rdtsc instruction to measure the time for the getuid syscall (see
bellow for source to the test app).
 
---8<-------------
#include <stdio.h>
#include <unistd.h>
#include <sys/types.h>

/* CPU speed in MHz */
#define CPUFREQ 3060

#define rdtscll(val) \
      __asm__ __volatile__("rdtsc" : "=A" (val))

int main()
{
	unsigned long long start, end;
	int i;

	for(i=0;i<10000;i++) {
		rdtscll(start);
		getuid();
		rdtscll(end);

		printf("delta for syscall: %llu cycles = %f us\n", end-start, (end-start)/(float)CPUFREQ);
	}

	return 0;
}
---8<-------------

Josef "Jeff" Sipek.

-- 
Research, n.:
  Consider Columbus:
    He didn't know where he was going.
    When he got there he didn't know where he was.
    When he got back he didn't know where he had been.
    And he did it all on someone else's money.

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: all syscalls initially taking 4usec on a P4? Re: nonblocking UDPv4 recvfrom() taking 4usec @ 3GHz?
  2007-02-20 18:42       ` Josef Sipek
@ 2007-02-20 18:48         ` Evgeniy Polyakov
  2007-02-20 19:33           ` bert hubert
  0 siblings, 1 reply; 26+ messages in thread
From: Evgeniy Polyakov @ 2007-02-20 18:48 UTC (permalink / raw)
  To: Josef Sipek; +Cc: bert hubert, Andi Kleen, netdev, linux-kernel

On Tue, Feb 20, 2007 at 01:42:42PM -0500, Josef Sipek (jsipek@fsl.cs.sunysb.edu) wrote:
> A better thing would be to use getuid - it turns into just a return with a
> memory dereference). I ran it on my 3.06GHz P4 (HT, but only UP kernel),
> PREEMPT, HZ=1000...
> 
> 3.290196 0.470588 0.402614 0.396078 0.393464 0.396078 0.386928 0.386928 0.386928 0.386928
> 0.386928 0.386928 0.386928 0.386928 0.386928 0.386928 0.386928 0.386928 0.386928 0.386928
> 0.386928 0.386928 0.386928 0.386928 0.386928 0.386928 0.386928 0.386928 0.386928 0.386928
> 0.386928 0.386928 0.386928 0.386928 0.386928 0.386928 0.386928 0.386928 0.386928 0.386928

I got following.

delta for syscall: 9112 cycles = 2.977778 us
delta for syscall: 1224 cycles = 0.400000 us
delta for syscall: 1071 cycles = 0.350000 us
delta for syscall: 1054 cycles = 0.344444 us
delta for syscall: 1054 cycles = 0.344444 us

Likely first overhead related to cache population or gamma-ray radiation.
If it happens only one (it does in my test), then everything is ok I
think. Bert, how frequently you get that long recvfrom()?

-- 
	Evgeniy Polyakov

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: all syscalls initially taking 4usec on a P4? Re: nonblocking UDPv4 recvfrom() taking 4usec @ 3GHz?
  2007-02-20 18:48         ` Evgeniy Polyakov
@ 2007-02-20 19:33           ` bert hubert
  2007-02-20 19:40             ` Benjamin LaHaise
  2007-02-20 22:02             ` Rick Jones
  0 siblings, 2 replies; 26+ messages in thread
From: bert hubert @ 2007-02-20 19:33 UTC (permalink / raw)
  To: Evgeniy Polyakov; +Cc: Josef Sipek, Andi Kleen, netdev, linux-kernel

On Tue, Feb 20, 2007 at 09:48:59PM +0300, Evgeniy Polyakov wrote:

> Likely first overhead related to cache population or gamma-ray radiation.
> If it happens only one (it does in my test), then everything is ok I
> think. Bert, how frequently you get that long recvfrom()?

I have plotted the average time for a single non-blocking UPDv4 recvfrom
call returning 100 bytes, based on the delay I insert between recvfrom
calls, as measured in cycles spent busywaiting.

In theory, this graph should show some slope, perhaps because of the higher
chance of context switches, cache evictions and purging of any branche-prediction
information the CPU might have kept. I'm no expert.

I measure a huge slope, however. Starting at 1usec for back-to-back system
calls, it rises to 2usec after interleaving calls with a count to 20
million.

4usec is hit after 110 million.

The graph, with semi-scientific error-bars is on
http://ds9a.nl/tmp/recvfrom-usec-vs-wait.png

The code to generate it is on:
http://ds9a.nl/tmp/recvtimings.c

I'm investigating this further for other system calls. It might be that my
measurements are off, but it appears even a slight delay between calls
incurs a large penalty.

	Bert

-- 
http://www.PowerDNS.com      Open source, database driven DNS Software 
http://netherlabs.nl              Open and Closed source services

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: all syscalls initially taking 4usec on a P4? Re: nonblocking UDPv4 recvfrom() taking 4usec @ 3GHz?
  2007-02-20 19:33           ` bert hubert
@ 2007-02-20 19:40             ` Benjamin LaHaise
  2007-02-20 20:45               ` bert hubert
  2007-02-20 22:02             ` Rick Jones
  1 sibling, 1 reply; 26+ messages in thread
From: Benjamin LaHaise @ 2007-02-20 19:40 UTC (permalink / raw)
  To: bert hubert, Evgeniy Polyakov, Josef Sipek, Andi Kleen, netdev,
	linux-kernel

On Tue, Feb 20, 2007 at 08:33:20PM +0100, bert hubert wrote:
> I'm investigating this further for other system calls. It might be that my
> measurements are off, but it appears even a slight delay between calls
> incurs a large penalty.

Make sure your system is idle.  Userspace bloat means that *lots* of idle 
activity occurs in between timer ticks on recent distributions -- all those 
widgets polling the hardware to see if something changed or needs updating 
do a lot of damage to the caches.  Try comparing a run under init=/bin/bash 
with one while logged in to a desktop environment to see just how painful it 
is.

		-ben
-- 
"Time is of no importance, Mr. President, only life is important."
Don't Email: <dont@kvack.org>.

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: all syscalls initially taking 4usec on a P4? Re: nonblocking UDPv4 recvfrom() taking 4usec @ 3GHz?
  2007-02-20 19:40             ` Benjamin LaHaise
@ 2007-02-20 20:45               ` bert hubert
  2007-02-20 21:02                 ` Stephen Hemminger
  0 siblings, 1 reply; 26+ messages in thread
From: bert hubert @ 2007-02-20 20:45 UTC (permalink / raw)
  To: Benjamin LaHaise
  Cc: Evgeniy Polyakov, Josef Sipek, Andi Kleen, netdev, linux-kernel

On Tue, Feb 20, 2007 at 02:40:40PM -0500, Benjamin LaHaise wrote:

> Make sure your system is idle.  Userspace bloat means that *lots* of idle 
> activity occurs in between timer ticks on recent distributions -- all those 

You hit the nail on the head. I had previously measured with X shut down,
but the effect didn't disappear.

With init=/bin/bash, recvfrom suddenly takes from 900nsec to 1.3usec, with
only slight correlation between inter-call delay and cycles spent.

I'm investigating this further as it appears this has a real life effect on
my P4 - a drastic one!

processor       : 0
vendor_id       : GenuineIntel
cpu family      : 15
model           : 4
model name      : Intel(R) Pentium(R) 4 CPU 3.00GHz
stepping        : 1
cpu MHz         : 3000.131
cache size      : 1024 KB
fdiv_bug        : no
hlt_bug         : no
f00f_bug        : no
coma_bug        : no
fpu             : yes
fpu_exception   : yes
cpuid level     : 5
wp              : yes
flags           : fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca
cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht tm pbe nx
constant_tsc pni monitor ds_cpl cid xtpr
bogomips        : 6003.91
clflush size    : 64

Thanks for your help!

-- 
http://www.PowerDNS.com      Open source, database driven DNS Software 
http://netherlabs.nl              Open and Closed source services

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: all syscalls initially taking 4usec on a P4? Re: nonblocking UDPv4 recvfrom() taking 4usec @ 3GHz?
  2007-02-20 20:45               ` bert hubert
@ 2007-02-20 21:02                 ` Stephen Hemminger
  0 siblings, 0 replies; 26+ messages in thread
From: Stephen Hemminger @ 2007-02-20 21:02 UTC (permalink / raw)
  To: bert hubert
  Cc: Benjamin LaHaise, Evgeniy Polyakov, Josef Sipek, Andi Kleen,
	netdev, linux-kernel

On Tue, 20 Feb 2007 21:45:05 +0100
bert hubert <bert.hubert@netherlabs.nl> wrote:

> On Tue, Feb 20, 2007 at 02:40:40PM -0500, Benjamin LaHaise wrote:
> 
> > Make sure your system is idle.  Userspace bloat means that *lots* of idle 
> > activity occurs in between timer ticks on recent distributions -- all those 
> 
> You hit the nail on the head. I had previously measured with X shut down,
> but the effect didn't disappear.
> 
> With init=/bin/bash, recvfrom suddenly takes from 900nsec to 1.3usec, with
> only slight correlation between inter-call delay and cycles spent.
> 
> I'm investigating this further as it appears this has a real life effect on
> my P4 - a drastic one!
> 
> processor       : 0
> vendor_id       : GenuineIntel
> cpu family      : 15
> model           : 4
> model name      : Intel(R) Pentium(R) 4 CPU 3.00GHz
> stepping        : 1
> cpu MHz         : 3000.131
> cache size      : 1024 KB
> fdiv_bug        : no
> hlt_bug         : no
> f00f_bug        : no
> coma_bug        : no
> fpu             : yes
> fpu_exception   : yes
> cpuid level     : 5
> wp              : yes
> flags           : fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca
> cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht tm pbe nx
> constant_tsc pni monitor ds_cpl cid xtpr
> bogomips        : 6003.91
> clflush size    : 64
> 
> Thanks for your help!
> 

What happens with preempt if your process is high priority or SCHED_FIFO/SCHED_RR?

-- 
Stephen Hemminger <shemminger@linux-foundation.org>

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: all syscalls initially taking 4usec on a P4? Re: nonblocking UDPv4 recvfrom() taking 4usec @ 3GHz?
  2007-02-20 19:33           ` bert hubert
  2007-02-20 19:40             ` Benjamin LaHaise
@ 2007-02-20 22:02             ` Rick Jones
  2007-02-20 22:17               ` bert hubert
  1 sibling, 1 reply; 26+ messages in thread
From: Rick Jones @ 2007-02-20 22:02 UTC (permalink / raw)
  To: bert hubert
  Cc: Evgeniy Polyakov, Josef Sipek, Andi Kleen, netdev, linux-kernel

> I measure a huge slope, however. Starting at 1usec for back-to-back system
> calls, it rises to 2usec after interleaving calls with a count to 20
> million.
> 
> 4usec is hit after 110 million.
> 
> The graph, with semi-scientific error-bars is on
> http://ds9a.nl/tmp/recvfrom-usec-vs-wait.png
> 
> The code to generate it is on:
> http://ds9a.nl/tmp/recvtimings.c
> 
> I'm investigating this further for other system calls. It might be that my
> measurements are off, but it appears even a slight delay between calls
> incurs a large penalty.

The slope appears to be flattening-out the farther out to the right it 
goes.  Perhaps that is the length of time it takes to take all the 
requisite cache misses.

Some judicious use of HW perf counters might be in order via say papi or 
pfmon.  Otherwise, you could try a test where you don't delay, but do 
try to blow-out the cache(s) between recvfrom() calls.  If the delay 
there starts to match the delay as you go out to the right on the graph 
it would suggest that it is indeed cache effects.

rick jones

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: all syscalls initially taking 4usec on a P4? Re: nonblocking  UDPv4 recvfrom() taking 4usec @ 3GHz?
  2007-02-20 22:02             ` Rick Jones
@ 2007-02-20 22:17               ` bert hubert
  2007-02-20 22:22                   ` Arjan van de Ven
  2007-02-20 22:46                 ` Ian McDonald
  0 siblings, 2 replies; 26+ messages in thread
From: bert hubert @ 2007-02-20 22:17 UTC (permalink / raw)
  To: Rick Jones
  Cc: Evgeniy Polyakov, Josef Sipek, Andi Kleen, netdev, linux-kernel

On Tue, Feb 20, 2007 at 02:02:00PM -0800, Rick Jones wrote:

> The slope appears to be flattening-out the farther out to the right it 
> goes.  Perhaps that is the length of time it takes to take all the 
> requisite cache misses.

The rate of flattening out appears to correlate with the number of processes
running, even though the system is otherwise >99.5% idle during my
measurements.

With only 'gdm' running, things flatten out slowly, iow, it takes longer
delays to see recvfrom slow down. With only 1 process running (init=bash),
the graph is nearly flat.

>From this, it is probable that even an idle GNOME desktop (Ubunty Edgy Eft)
is under fierce cache pressure, enough to blow away my meagre 1MB in a
matter of milliseconds.

I'm trying to figure out which processes have the most impact, I had already
killed anything non-essential. But that still leaves 140 pids.

	Bert

-- 
http://www.PowerDNS.com      Open source, database driven DNS Software 
http://netherlabs.nl              Open and Closed source services

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: all syscalls initially taking 4usec on a P4? Re: nonblocking  UDPv4 recvfrom() taking 4usec @ 3GHz?
  2007-02-20 22:17               ` bert hubert
@ 2007-02-20 22:22                   ` Arjan van de Ven
  2007-02-20 22:46                 ` Ian McDonald
  1 sibling, 0 replies; 26+ messages in thread
From: Arjan van de Ven @ 2007-02-20 22:22 UTC (permalink / raw)
  To: bert hubert
  Cc: Rick Jones, Evgeniy Polyakov, Josef Sipek, Andi Kleen, netdev,
	linux-kernel


> I'm trying to figure out which processes have the most impact, I had already
> killed anything non-essential. But that still leaves 140 pids.

btw if you have systemtap on your system you can see who is doing evil
with

http://www.fenrus.org/cstop.stp

also.. running "vmstat 3" and looking at the "cs" column is interesting;
it shouldn't be above 50 or so in idle (well not above 10 but our
userland stinks too much for that)



^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: all syscalls initially taking 4usec on a P4? Re: nonblocking UDPv4 recvfrom() taking 4usec @ 3GHz?
@ 2007-02-20 22:22                   ` Arjan van de Ven
  0 siblings, 0 replies; 26+ messages in thread
From: Arjan van de Ven @ 2007-02-20 22:22 UTC (permalink / raw)
  To: bert hubert
  Cc: Rick Jones, Evgeniy Polyakov, Josef Sipek, Andi Kleen, netdev,
	linux-kernel


> I'm trying to figure out which processes have the most impact, I had already
> killed anything non-essential. But that still leaves 140 pids.

btw if you have systemtap on your system you can see who is doing evil
with

http://www.fenrus.org/cstop.stp

also.. running "vmstat 3" and looking at the "cs" column is interesting;
it shouldn't be above 50 or so in idle (well not above 10 but our
userland stinks too much for that)



^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: all syscalls initially taking 4usec on a P4? Re: nonblocking UDPv4 recvfrom() taking 4usec @ 3GHz?
  2007-02-20 22:17               ` bert hubert
  2007-02-20 22:22                   ` Arjan van de Ven
@ 2007-02-20 22:46                 ` Ian McDonald
  1 sibling, 0 replies; 26+ messages in thread
From: Ian McDonald @ 2007-02-20 22:46 UTC (permalink / raw)
  To: bert hubert, Rick Jones, Evgeniy Polyakov, Josef Sipek,
	Andi Kleen, netdev, linux-kernel

On 2/21/07, bert hubert <bert.hubert@netherlabs.nl> wrote:
> I'm trying to figure out which processes have the most impact, I had already
> killed anything non-essential. But that still leaves 140 pids.
>
>         Bert
>
That sounds way too many pids. I run a script to shut down processes
when I do testing as it makes a HUGE difference to my timing of things
which can be quite critical.

Here's my list of 46 and that includes me sshing into a box and
checking for processes:
UnID        PID  PPID CMD
root         1     0 init [2]
root         2     1 [ksoftirqd/0]
root         3     1 [watchdog/0]
root         4     1 [events/0]
root         5     1 [khelper]
root         6     1 [kthread]
root        40     6 [kblockd/0]
root        41     6 [kacpid]
root       110     6 [cqueue/0]
root       111     6 [ata/0]
root       112     6 [ata_aux]
root       113     6 [kseriod]
root       135     6 [rt-test-0]
root       137     6 [rt-test-1]
root       139     6 [rt-test-2]
root       141     6 [rt-test-3]
root       143     6 [rt-test-4]
root       145     6 [rt-test-5]
root       147     6 [rt-test-6]
root       149     6 [rt-test-7]
root       151     6 [pdflush]
root       152     6 [pdflush]
root       153     6 [kswapd0]
root       154     6 [aio/0]
root       838     6 [kedac]
root       843     6 [kjournald]
root      1720     6 [ksuspend_usbd]
root      1721     6 [khubd]
root      1741     6 [kpsmoused]
root      2544     1 /sbin/syslogd
root      2554     1 /sbin/klogd -x
root      2851     1 /usr/sbin/inetd
root      2863     1 /usr/sbin/sshd
ntp       2954     1 /usr/sbin/ntpd -p /var/run/ntpd.pid -u 111:111 -g
root      3061     1 /bin/login --
root      3062     1 /sbin/getty 38400 tty2
root      3063     1 /sbin/getty 38400 tty3
root      3064     1 /sbin/getty 38400 tty4
root      3065     1 /sbin/getty 38400 tty5
root      3066     1 /sbin/getty 38400 tty6
ian       3083  3061 -bash
root     21518  2863 sshd: ian [priv]
ian      21520 21518 sshd: ian@pts/1
ian      21521 21520 -bash
ian      21747 21521 ps -ef

-- 
Web: http://wand.net.nz/~iam4
Blog: http://iansblog.jandi.co.nz
WAND Network Research Group

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: all syscalls initially taking 4usec on a P4? Re: nonblocking UDPv4 recvfrom() taking 4usec @ 3GHz?
  2007-02-20 17:11         ` Evgeniy Polyakov
  2007-02-20 17:18           ` Evgeniy Polyakov
@ 2007-02-21 11:06           ` Evgeniy Polyakov
  2007-02-21 11:34             ` Andi Kleen
  1 sibling, 1 reply; 26+ messages in thread
From: Evgeniy Polyakov @ 2007-02-21 11:06 UTC (permalink / raw)
  To: bert hubert; +Cc: Andi Kleen, netdev, linux-kernel

Here is data for 50 bytes reading for essentially idle machine 
(core duo 2.4 ghz):

delta for syscall: 3326961404-3326969261: 7857 cycles = 3.273750 us
delta for syscall: 3326975687-3326980979: 5292 cycles = 2.205000 us
delta for syscall: 3327199967-3327205583: 5616 cycles = 2.340000 us
delta for syscall: 3327208706-3327214817: 6111 cycles = 2.546250 us
delta for syscall: 3327442427-3327450446: 8019 cycles = 3.341250 us
delta for syscall: 3327453281-3327459104: 5823 cycles = 2.426250 us
delta for syscall: 3327462002-3327466844: 4842 cycles = 2.017500 us
delta for syscall: 3327726809-3327733451: 6642 cycles = 2.767500 us
delta for syscall: 3327736178-3327741947: 5769 cycles = 2.403750 us
delta for syscall: 3327968972-3327974264: 5292 cycles = 2.205000 us
delta for syscall: 3327977981-3327984506: 6525 cycles = 2.718750 us
delta for syscall: 3327987215-3327991958: 4743 cycles = 1.976250 us
delta for syscall: 3328245416-3328252058: 6642 cycles = 2.767500 us
delta for syscall: 3328258646-3328264172: 5526 cycles = 2.302500 us

and sometimes (frequently enough)

delta for syscall: 1937450129-1937469011: 18882 cycles = 7.867500 us
delta for syscall: 1956633332-1956654248: 20916 cycles = 8.715000 us
delta for syscall: 1995042065-1995061703: 19638 cycles = 8.182500 us
delta for syscall: 2014239488-2014257992: 18504 cycles = 7.710000 us
delta for syscall: 2016294674-2016316202: 21528 cycles = 8.970000 us
delta for syscall: 2054518259-2054537447: 19188 cycles = 7.995000 us
delta for syscall: 2071839668-2071857146: 17478 cycles = 7.282500 us

But there is even following sometimes:

delta for syscall: 3271734859-3271872037: 137178 cycles = 57.157501 us

Queue of the non-blocking socket is being kept full by another host
which sends udp with 40k chunks per socket write.

Reading /dev/zero in chunks of 50 bytes ends up with about 190
nanosecond on that machine.

-- 
	Evgeniy Polyakov

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: all syscalls initially taking 4usec on a P4? Re: nonblocking UDPv4 recvfrom() taking 4usec @ 3GHz?
  2007-02-21 11:06           ` Evgeniy Polyakov
@ 2007-02-21 11:34             ` Andi Kleen
  0 siblings, 0 replies; 26+ messages in thread
From: Andi Kleen @ 2007-02-21 11:34 UTC (permalink / raw)
  To: Evgeniy Polyakov; +Cc: bert hubert, Andi Kleen, netdev, linux-kernel

On Wed, Feb 21, 2007 at 02:06:34PM +0300, Evgeniy Polyakov wrote:
> Here is data for 50 bytes reading for essentially idle machine 
> (core duo 2.4 ghz):
> 
> delta for syscall: 3326961404-3326969261: 7857 cycles = 3.273750 us

Can you oprofile it too?

-Andi

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: all syscalls initially taking 4usec on a P4? Re: nonblocking UDPv4 recvfrom() taking 4usec @ 3GHz?
  2007-02-20 22:22                   ` Arjan van de Ven
  (?)
@ 2007-02-21 16:25                   ` Chuck Ebbert
  -1 siblings, 0 replies; 26+ messages in thread
From: Chuck Ebbert @ 2007-02-21 16:25 UTC (permalink / raw)
  To: Arjan van de Ven
  Cc: bert hubert, Rick Jones, Evgeniy Polyakov, Josef Sipek,
	Andi Kleen, netdev, linux-kernel

Arjan van de Ven wrote:
> also.. running "vmstat 3" and looking at the "cs" column is interesting;
> it shouldn't be above 50 or so in idle (well not above 10 but our
> userland stinks too much for that)

I average 6 or so with my normal configuration.


Chuck "kill the daemons" Ebbert

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: all syscalls initially taking 4usec on a P4? Re: nonblocking UDPv4 recvfrom() taking 4usec @ 3GHz?
  2007-02-20 16:41     ` Evgeniy Polyakov
  2007-02-20 17:02       ` bert hubert
  2007-02-20 18:42       ` Josef Sipek
@ 2007-02-25 10:41       ` Pavel Machek
  2007-02-25 17:06         ` Evgeniy Polyakov
  2 siblings, 1 reply; 26+ messages in thread
From: Pavel Machek @ 2007-02-25 10:41 UTC (permalink / raw)
  To: Evgeniy Polyakov; +Cc: bert hubert, Andi Kleen, netdev, linux-kernel

Hi!

> > I've done so, with some interesting results. Source on
> > http://ds9a.nl/tmp/recvtimings.c - be careful to adjust the '3000' divider
> > to your CPU frequency if you care about absolute numbers!
> > 
> > These are two groups, each consisting of 10 consecutive nonblocking UDP
> > recvfroms, with 10 packets preloaded. Reported is the number of microseconds
> > per recvfrom call which yielded a packet:
> > 
> > $ ./recvtimings
> > 4.142333
> 
> It can be recvfrom only problem - syscall overhead on my p4 (core duo,
> debian testing) is bout 300 usec - to test I ran 

core duo is _not_ p4 class cpu; rsulets there will be very different.

									Pavel

-- 
(english) http://www.livejournal.com/~pavelmachek
(cesky, pictures) http://atrey.karlin.mff.cuni.cz/~pavel/picture/horses/blog.html

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: all syscalls initially taking 4usec on a P4? Re: nonblocking UDPv4 recvfrom() taking 4usec @ 3GHz?
  2007-02-25 10:41       ` Pavel Machek
@ 2007-02-25 17:06         ` Evgeniy Polyakov
  0 siblings, 0 replies; 26+ messages in thread
From: Evgeniy Polyakov @ 2007-02-25 17:06 UTC (permalink / raw)
  To: Pavel Machek; +Cc: bert hubert, Andi Kleen, netdev, linux-kernel

On Sun, Feb 25, 2007 at 11:41:54AM +0100, Pavel Machek (pavel@ucw.cz) wrote:
> > > I've done so, with some interesting results. Source on
> > > http://ds9a.nl/tmp/recvtimings.c - be careful to adjust the '3000' divider
> > > to your CPU frequency if you care about absolute numbers!
> > > 
> > > These are two groups, each consisting of 10 consecutive nonblocking UDP
> > > recvfroms, with 10 packets preloaded. Reported is the number of microseconds
> > > per recvfrom call which yielded a packet:
> > > 
> > > $ ./recvtimings
> > > 4.142333
> > 
> > It can be recvfrom only problem - syscall overhead on my p4 (core duo,
> > debian testing) is bout 300 usec - to test I ran 
> 
> core duo is _not_ p4 class cpu; rsulets there will be very different.

Results nevertheless are the same.
Each syscall takes some time first (noticebly more than subsequent
calls), and that was a main problem for Bert.
Given the high load, recvfrom() can even take tens of microseconds
(although I can not provide a profile output yet, but I showed a data).

So, syscall overhead itself is very small no matter which type of the
CPU is used - athlon is about 300 nsec, via epia about 1.4 usec), 
but the whole function can take quite a lot of time.

> 									Pavel
> 
> -- 
> (english) http://www.livejournal.com/~pavelmachek
> (cesky, pictures) http://atrey.karlin.mff.cuni.cz/~pavel/picture/horses/blog.html

-- 
	Evgeniy Polyakov

^ permalink raw reply	[flat|nested] 26+ messages in thread

end of thread, other threads:[~2007-02-25 17:07 UTC | newest]

Thread overview: 26+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2007-02-19 23:14 nonblocking UDPv4 recvfrom() taking 4usec @ 3GHz? bert hubert
2007-02-19 23:56 ` Stephen Hemminger
2007-02-20  8:04   ` bert hubert
2007-02-20 10:50 ` Andi Kleen
2007-02-20 16:27   ` all syscalls initially taking 4usec on a P4? " bert hubert
2007-02-20 16:41     ` Evgeniy Polyakov
2007-02-20 17:02       ` bert hubert
2007-02-20 17:11         ` Evgeniy Polyakov
2007-02-20 17:18           ` Evgeniy Polyakov
2007-02-21 11:06           ` Evgeniy Polyakov
2007-02-21 11:34             ` Andi Kleen
2007-02-20 18:42       ` Josef Sipek
2007-02-20 18:48         ` Evgeniy Polyakov
2007-02-20 19:33           ` bert hubert
2007-02-20 19:40             ` Benjamin LaHaise
2007-02-20 20:45               ` bert hubert
2007-02-20 21:02                 ` Stephen Hemminger
2007-02-20 22:02             ` Rick Jones
2007-02-20 22:17               ` bert hubert
2007-02-20 22:22                 ` Arjan van de Ven
2007-02-20 22:22                   ` Arjan van de Ven
2007-02-21 16:25                   ` Chuck Ebbert
2007-02-20 22:46                 ` Ian McDonald
2007-02-25 10:41       ` Pavel Machek
2007-02-25 17:06         ` Evgeniy Polyakov
2007-02-20 16:57     ` Eric Dumazet

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.