On Mon, Aug 15, 2016 at 06:51:42PM -0700, Linus Torvalds wrote:
> Anyway, including the direct reclaim call paths gets
> __remove_mapping() a bit higher, and _raw_spin_lock_irqsave climbs to
> 0.26%. But perhaps more importantly, looking at what __remove_mapping
> actually *does* (apart from the spinlock) gives us:
>
>  - inside remove_mapping itself (0.11% on its own - flat cost, no
>    child accounting)
>
>       48.50 │       lock   cmpxchg %edx,0x1c(%rbx)
>
>    so that's about 0.05%
>
>  - 0.40% __delete_from_page_cache (0.22% radix_tree_replace_clear_tags,
>    0.13% __radix_tree_lookup)
>
>  - 0.06% workingset_eviction()
>
> so I'm not actually seeing anything *new* expensive in there. The
> __delete_from_page_cache() overhead may have changed a bit with the
> tagged tree changes, but this doesn't look like memcg.
>
> But we clearly have very different situations.
>
> What does your profile show for when you actually dig into
> __remove_mapping() itself?

Looking at your flat profile, I'm assuming you get:

   - 22.26%     0.93%  [kernel]  [k] __remove_mapping
      - 3.86% __remove_mapping
         - 18.35% _raw_spin_lock_irqsave
              __pv_queued_spin_lock_slowpath
           1.32% __delete_from_page_cache
         - 0.92% _raw_spin_unlock_irqrestore
              __raw_callee_save___pv_queued_spin_unlock

And the instruction level profile:

 .....
        │      xor    %ecx,%ecx
        │      mov    %rax,%r15
   0.39 │      mov    $0x2,%eax
        │      lock   cmpxchg %ecx,0x1c(%rbx)
  32.56 │      cmp    $0x2,%eax
        │    ↓ jne    12e
        │      mov    0x20(%rbx),%rax
        │      lea    -0x1(%rax),%rdx
   0.39 │      test   $0x1,%al
        │      cmove  %rbx,%rdx
        │      mov    (%rdx),%rax
   0.39 │      test   $0x10,%al
        │    ↓ jne    127
        │      mov    (%rbx),%rcx
        │      shr    $0xf,%rcx
        │      and    $0x1,%ecx
        │    ↓ jne    14a
        │      mov    0x68(%r14),%rax
  36.03 │      xor    %esi,%esi
        │      test   %r13b,%r13b
        │      mov    0x50(%rax),%rdx
   1.16 │    ↓ jne    e8
   0.96 │ a9:  mov    %rbx,%rdi
 .....

That indicates most of the time is spent on the cmpxchg for the page
ref, followed by grabbing the ->freepage op vector:

	freepage = mapping->a_ops->freepage;

> I come back to wondering whether maybe you're hitting some PV-lock problem.
>
> I know queued_spin_lock_slowpath() is ok. I'm not entirely sure
> __pv_queued_spin_lock_slowpath() is.

It's the same code AFAICT, except the pv version jumps straight to
the "queue" case.

> So I'd love to see you try the non-PV case, but I also think it might
> be interesting to see what the instruction profile for
> __pv_queued_spin_lock_slowpath() itself is. They share a lot of code
> (there's some interesting #include games going on to make
> queued_spin_lock_slowpath() actually *be*
> __pv_queued_spin_lock_slowpath() with some magic hooks), but there
> might be issues.
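Roughly, the sharing works by kernel/locking/qspinlock.c #including
itself a second time with _GEN_PV_LOCK_SLOWPATH defined, which renames
queued_spin_lock_slowpath() to __pv_queued_spin_lock_slowpath() and
enables the pv_wait()/pv_kick() hooks. As a minimal user-space sketch
of the shape of that code - NOT the kernel implementation, with the
real per-CPU MCS queueing replaced by a simple test-and-set wait loop
for brevity - it comes down to this:

	/*
	 * User-space sketch only. Keeps just the structure: a cmpxchg
	 * fast path, a bounded pause loop, and a pv_wait() fallback
	 * once the spin budget is exhausted.
	 */
	#include <stdatomic.h>
	#include <emmintrin.h>			/* _mm_pause() */

	#define SPIN_THRESHOLD	(1 << 15)	/* the 0x8000 in the listings */

	struct qspinlock {
		atomic_uint val;		/* 0 == unlocked, 1 == locked */
	};

	static void pv_wait(struct qspinlock *lock)
	{
		/* Kernel: a hypercall that halts the vCPU until the
		 * unlocker kicks it. Sketch: just keep waiting. */
		while (atomic_load_explicit(&lock->val, memory_order_relaxed))
			_mm_pause();
	}

	static void pv_queued_spin_lock_slowpath(struct qspinlock *lock)
	{
		for (;;) {
			int loop = SPIN_THRESHOLD;
			unsigned int old;

			/* The bounded spin - this is the pause loop
			 * that dominates the annotated profile below. */
			while (loop--) {
				old = 0;
				if (atomic_compare_exchange_weak(&lock->val,
								 &old, 1))
					return;
				_mm_pause();
			}

			/* Spin budget gone: the PV variant gives the
			 * CPU back to the hypervisor here. */
			pv_wait(lock);
		}
	}

	static inline void queued_spin_lock(struct qspinlock *lock)
	{
		unsigned int old = 0;

		/* Uncontended fast path: a single "lock cmpxchg". */
		if (atomic_compare_exchange_strong(&lock->val, &old, 1))
			return;
		pv_queued_spin_lock_slowpath(lock);
	}

The point is that both variants burn their spin budget in a pause
loop before anything else happens; the PV variant then sleeps the
vCPU rather than spinning forever. With that context, here's the full
annotated profile of __pv_queued_spin_lock_slowpath():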
   0.03 │      data16 data16 data16 xchg %ax,%ax
        │      push   %rbp
   0.00 │      mov    %rsp,%rbp
   0.01 │      push   %r15
        │      push   %r14
        │      push   %r13
   0.01 │      push   %r12
        │      mov    $0x18740,%r12
        │      push   %rbx
        │      mov    %rdi,%rbx
        │      sub    $0x10,%rsp
        │      add    %gs:0x7ef0d0e0(%rip),%r12
        │      movslq 0xc(%r12),%rax
   0.02 │      mov    %gs:0x7ef0d0db(%rip),%r15d
        │      add    $0x1,%r15d
        │      shl    $0x12,%r15d
        │      lea    0x1(%rax),%edx
   0.01 │      mov    %edx,0xc(%r12)
        │      mov    %eax,%edx
        │      shl    $0x4,%rax
        │      add    %rax,%r12
        │      shl    $0x10,%edx
        │      movq   $0x0,(%r12)
   0.02 │      or     %edx,%r15d
        │      mov    %gs:0x7ef0d0ad(%rip),%eax
   0.00 │      movl   $0x0,0x8(%r12)
   0.01 │      mov    %eax,0x40(%r12)
        │      movb   $0x0,0x44(%r12)
        │      mov    (%rdi),%eax
   0.88 │      test   %ax,%ax
        │    ↓ jne    8f
   0.02 │      mov    $0x1,%edx
        │      lock   cmpxchg %dl,(%rdi)
   0.38 │      test   %al,%al
        │    ↓ je     14a
   0.02 │ 8f:  mov    %r15d,%eax
        │      shr    $0x10,%eax
        │      xchg   %ax,0x2(%rbx)
   2.07 │      shl    $0x10,%eax
        │      test   %eax,%eax
        │    ↓ jne    171
        │      movq   $0x0,-0x30(%rbp)
   0.02 │ ac:  movzbl 0x44(%r12),%eax
   0.97 │      mov    $0x1,%r13d
        │      mov    $0x100,%r14d
        │      cmp    $0x2,%al
        │      sete   %al
        │      movzbl %al,%eax
        │      mov    %rax,-0x38(%rbp)
   0.00 │ ca:  movb   $0x0,0x44(%r12)
   0.00 │      mov    $0x8000,%edx
        │      movb   $0x1,0x1(%rbx)
        │    ↓ jmp    e6
   0.04 │ db:  pause
   8.04 │      sub    $0x1,%edx
        │    ↓ je     229
        │ e6:  movzbl (%rbx),%eax
   7.54 │      test   %al,%al
        │    ↑ jne    db
   0.10 │      mov    %r14d,%eax
   0.06 │      lock   cmpxchg %r13w,(%rbx)
   2.93 │      cmp    $0x100,%ax
        │    ↑ jne    db
        │ fc:  mov    (%rbx),%edx
   0.37 │      mov    $0x1,%ecx
        │      or     $0x1,%edx
        │    ↓ jmp    114
   0.01 │108:  mov    %edx,%eax
        │      lock   cmpxchg %ecx,(%rbx)
   0.26 │      cmp    %edx,%eax
        │    ↓ je     14a
        │      mov    %eax,%edx
        │114:  mov    %edx,%eax
   0.00 │      xor    %ax,%ax
        │      cmp    %r15d,%eax
        │    ↑ je     108
   0.01 │      cmpq   $0x0,-0x30(%rbp)
        │      movb   $0x1,(%rbx)
        │    ↓ je     251
        │12c:  mov    -0x30(%rbp),%rsi
   0.01 │      mov    $0x1,%eax
        │      mov    $0x2,%edx
        │      movl   $0x1,0x8(%rsi)
   0.11 │      lock   cmpxchg %dl,0x44(%rsi)
   2.34 │      cmp    $0x1,%al
        │    ↓ je     160
        │14a:  decl   %gs:0x7ef1b5bb(%rip)
   0.02 │      add    $0x10,%rsp
        │      pop    %rbx
        │      pop    %r12
   0.00 │      pop    %r13
        │      pop    %r14
        │      pop    %r15
        │      pop    %rbp
        │      retq
        │160:  mov    -0x30(%rbp),%rsi
        │      movb   $0x3,(%rbx)
        │      mov    %rbx,%rdi
        │      callq  0xffffffff810fcf90
        │    ↑ jmp    14a
        │171:  lea    0x44(%r12),%r14
        │      mov    %rax,%r13
        │      shr    $0x12,%eax
        │      shr    $0xc,%r13
        │      sub    $0x1,%eax
        │      and    $0x30,%r13d
        │      cltq
        │      add    $0x18740,%r13
        │      add    -0x7d8164c0(,%rax,8),%r13
   0.03 │      mov    %r12,0x0(%r13)
   0.38 │19c:  mov    $0x8000,%eax
        │    ↓ jmp    1b7
   0.04 │1a3:  test   %al,%al
        │    ↓ jne    1b0
        │      movzbl 0x44(%r13),%edx
   1.66 │      test   %dl,%dl
        │    ↓ jne    1f1
   1.75 │1b0:  pause
  64.57 │      sub    $0x1
   0.04 │      mov    0x8(%r12),%eax
   0.03 │      test   %eax,%eax
        │    ↓ jne    1d4
        │1c9:  pause
        │      mov    0x8(%r12),%eax
        │      test   %eax,%eax
        │    ↑ je     1c9
        │1d4:  mov    (%r12),%rax
        │      test   %rax,%rax
        │      mov    %rax,-0x30(%rbp)
   0.05 │    ↑ je     ac
        │      mov    -0x30(%rbp),%rax
        │      prefet (%rax)
   0.25 │    ↑ jmpq   ac
        │1f1:  mov    $0x1,%eax
        │      xchg   %al,0x44(%r12)
        │      mov    0x8(%r12),%eax
        │      test   %eax,%eax
        │    ↓ jne    213
        │      mov    %r14,%rdi
        │      mov    $0x1,%esi
        │      callq  0xffffffff8109f7a0
        │      xchg   %ax,%ax
        │213:  mov    $0x1,%eax
        │      xor    %edi,%edi
        │      lock   cmpxchg %dil,(%r14)
        │      mov    0x8(%r12),%eax
        │    ↑ jmpq   19c
        │229:  cmpq   $0x0,-0x38(%rbp)
        │      movb   $0x0,0x1(%rbx)
        │    ↓ je     276
        │234:  movb   $0x1,0x44(%r12)
        │      mov    $0x3,%esi
        │      mov    %rbx,%rdi
        │      callq  0xffffffff8109f7a0
        │      xchg   %ax,%ax
        │      movzbl (%rbx),%eax
        │    ↑ jmpq   ca
        │251:  mov    (%r12),%rax
   0.14 │      test   %rax,%rax
        │      mov    %rax,-0x30(%rbp)
        │    ↑ jne    12c
        │262:  pause
   0.31 │      mov    (%r12),%rax
        │      test   %rax,%rax
        │    ↑ je     262
        │      mov    %rax,-0x30(%rbp)
        │    ↑ jmpq   12c
        │276:  mov    %r12,%rsi
        │      mov    %rbx,%rdi
        │      callq  0xffffffff810fcf90
        │      mov    %rax,-0x38(%rbp)
        │      mov    $0x3,%eax
        │      xchg   %al,(%rbx)
        │      test   %al,%al
        │    ↑ jne    234
        │      mov    -0x38(%rbp),%rax
        │      movb   $0x1,(%rbx)
        │      movq   $0x0,(%rax)
        │    ↑ jmpq   fc

> For example, if you run a virtual 16-core system on a physical machine
> that then doesn't consistently give 16 cores to the virtual machine,
> you'll get no end of hiccups.

I learnt that lesson 6-7 years ago, when I first started doing
baseline benchmarking to compare bare metal to virtualised IO
performance.

-Dave.
-- 
Dave Chinner
david@fromorbit.com