From mboxrd@z Thu Jan  1 00:00:00 1970
Return-Path: <linux-kernel-owner@vger.kernel.org>
Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand
	id S1753859AbcHRMTE (ORCPT <rfc822;w@1wt.eu>);
	Thu, 18 Aug 2016 08:19:04 -0400
Received: from mx1.redhat.com ([209.132.183.28]:25394 "EHLO mx1.redhat.com"
	rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP
	id S1752904AbcHRMTD (ORCPT <rfc822;linux-kernel@vger.kernel.org>);
	Thu, 18 Aug 2016 08:19:03 -0400
Subject: Re: RFC: Petition Intel/AMD to add POPF_IF insn
To: Linus Torvalds <torvalds@linux-foundation.org>
References: <f2d39812-8e2b-ed32-398f-8bb32bd20c7f@redhat.com>
 <CA+55aFy0zq5h7zXcL7KNZuhvVHUZvA__U4ztJvP_WTCN+dub2g@mail.gmail.com>
 <8de26c33-5597-bf00-2ac5-d265ba01d0d4@redhat.com>
 <CA+55aFwmxwQBkyDjombS4cy1q=a_buhmDjDnaa8rdC8ZDaDYEA@mail.gmail.com>
 <CALCETrXOuomfta==NmAu3WEsgj69t+-Vi0pwtM0_MHRjn=v_tw@mail.gmail.com>
 <d1c78779-b885-8e3e-a87a-a586b18423fc@redhat.com>
 <CA+55aFyrwdH1ege_rcHp9JDAcwGoqDcCrYHGTBU2HFjXgSJN5w@mail.gmail.com>
 <CA+55aFwfEzVCdeVdK6hUO4s_yy-wisPmQhtYbNUr4hFuRm8NkQ@mail.gmail.com>
 <CA+55aFw=nt9+Jk2O2cuYaTJ7qLoO3Gjv=tUH6bwjFFTg=sNHfQ@mail.gmail.com>
 <CA+55aFyf9UAhQp48YCtqNnvSs_tC3W51uCb00nMND7ZG8KQexA@mail.gmail.com>
 <1ae46402-4cd0-6afc-3dd3-e155ec59f64e@redhat.com>
Cc: Andy Lutomirski <luto@amacapital.net>, Sara Sharon <sara.sharon@intel.com>,
        Dan Williams <dan.j.williams@intel.com>,
        =?UTF-8?Q?Christian_K=c3=b6nig?= <christian.koenig@amd.com>,
        Vinod Koul <vinod.koul@intel.com>,
        Alex Deucher <alexander.deucher@amd.com>,
        Johannes Berg <johannes.berg@intel.com>,
        "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>,
        Andy Lutomirski <luto@kernel.org>,
        the arch/x86 maintainers <x86@kernel.org>,
        Ingo Molnar <mingo@kernel.org>, LKML <linux-kernel@vger.kernel.org>,
        Adrian Hunter <adrian.hunter@intel.com>
From: Denys Vlasenko <dvlasenk@redhat.com>
Message-ID: <5174a3b9-7f48-49f4-c59f-0fa234b68b17@redhat.com>
Date: Thu, 18 Aug 2016 14:18:42 +0200
User-Agent: Mozilla/5.0 (X11; Linux x86_64; rv:45.0) Gecko/20100101
 Thunderbird/45.2.0
MIME-Version: 1.0
In-Reply-To: <1ae46402-4cd0-6afc-3dd3-e155ec59f64e@redhat.com>
Content-Type: multipart/mixed;
 boundary="------------EE820730804703E300A78285"
X-Greylist: Sender IP whitelisted, not delayed by milter-greylist-4.5.16 (mx1.redhat.com [10.5.110.30]); Thu, 18 Aug 2016 12:18:47 +0000 (UTC)
Sender: linux-kernel-owner@vger.kernel.org
List-ID: <linux-kernel.vger.kernel.org>
X-Mailing-List: linux-kernel@vger.kernel.org

This is a multi-part message in MIME format.
--------------EE820730804703E300A78285
Content-Type: text/plain; charset=utf-8; format=flowed
Content-Transfer-Encoding: 7bit

On 08/18/2016 11:21 AM, Denys Vlasenko wrote:
>>> Of course, if somebody uses native_restore_fl() to actually *disable*
>>> interrupts (when they weren't already disabled), then this untested
>>> patch will just not work. But why would you do somethign so stupid?
>>> Famous last words...
>>
>> Looking around, the vsmp code actually uses "native_restore_fl()" to
>> not just set the interrupt flag, but AC as well.
>>
>> And the PV spinlock case has that "push;popf" sequence encoded in an alternate.
>>
>> So that trivial patch may (or may not - still not tested) work for
>> some quick testing, but needs more effort for any *real* use.
>
> I'm going to test the attached patch.
...
>
> +# CONFIG_UPROBES is not set
> +# CONFIG_SCHED_OMIT_FRAME_POINTER is not set
> +# CONFIG_HYPERVISOR_GUEST is not set
> +# CONFIG_SYS_HYPERVISOR is not set
> +# CONFIG_FRAME_POINTER is not set
> +# CONFIG_KMEMCHECK is not set
> +# CONFIG_DEBUG_LOCK_ALLOC is not set
> +# CONFIG_PROVE_LOCKING is not set
> +# CONFIG_LOCK_STAT is not set
> +# CONFIG_PROVE_RCU is not set
> +# CONFIG_LATENCYTOP is not set
> +# CONFIG_FTRACE is not set
> +# CONFIG_BINARY_PRINTF is not set

Need also !CONFIG_DEBUG_SPINLOCK, then unpatched irqrestore is:

ffffffff817115a0 <_raw_spin_unlock_irqrestore>:
ffffffff817115a0:       c6 07 00                movb   $0x0,(%rdi)
ffffffff817115a3:       56                      push   %rsi
ffffffff817115a4:       9d                      popfq
ffffffff817115a5:       65 ff 0d e4 ad 8f 7e    decl   %gs:__preempt_count
ffffffff817115ac:       c3                      retq
ffffffff817115ad:       0f 1f 00                nopl   (%rax)

patched one is

ffffffff81711660 <_raw_spin_unlock_irqrestore>:
ffffffff81711660:       f7 c6 00 02 00 00       test   $0x200,%esi
ffffffff81711666:       c6 07 00                movb   $0x0,(%rdi)
ffffffff81711669:       74 01                   je     ffffffff8171166c <_raw_spin_unlock_irqrestore+0xc>
ffffffff8171166b:       fb                      sti
ffffffff8171166c:       65 ff 0d 1d ad 8f 7e    decl   %gs:__preempt_count
ffffffff81711673:       c3                      retq
ffffffff81711674:       66 90                   xchg   %ax,%ax
ffffffff81711676:       66 2e 0f 1f 84 00 00 00 00 00   nopw   %cs:0x0(%rax,%rax,1)


Ran the following twice on a quiesced machine:

taskset 1 perf stat -r60 perf bench sched messaging
taskset 1 perf stat -r60 perf bench sched pipe

Out of these four runs, both "perf bench sched pipe" runs show improvements:

-       2648.279829      task-clock (msec)         #    1.000 CPUs utilized            ( +-  0.24% )
+       2483.143469      task-clock (msec)         #    0.998 CPUs utilized            ( +-  0.31% )
-         2,000,002      context-switches          #    0.755 M/sec                    ( +-  0.00% )
+         2,000,013      context-switches          #    0.805 M/sec                    ( +-  0.00% )
-               547      page-faults               #    0.206 K/sec                    ( +-  0.04% )
+               546      page-faults               #    0.220 K/sec                    ( +-  0.04% )
-     8,723,284,926      cycles                    #    3.294 GHz                      ( +-  0.06% )
+     8,157,949,449      cycles                    #    3.285 GHz                      ( +-  0.07% )
-    12,286,937,344      instructions              #    1.41  insn per cycle           ( +-  0.03% )
+    12,255,616,405      instructions              #    1.50  insn per cycle           ( +-  0.05% )
-     2,588,839,023      branches                  #  977.555 M/sec                    ( +-  0.02% )
+     2,599,319,615      branches                  # 1046.786 M/sec                    ( +-  0.04% )
-         3,620,273      branch-misses             #    0.14% of all branches          ( +-  0.67% )
+         3,577,771      branch-misses             #    0.14% of all branches          ( +-  0.69% )
-       2.648799072 seconds time elapsed                                          ( +-  0.24% )
+       2.487452268 seconds time elapsed                                          ( +-  0.31% )

Good, we run more insns/cycle, as expected. However, a bit more branches.

But of two "perf bench sched messaging" run, one was slower on a patched kernel,
and perf shows why: more branches, and also branch miss percentage is larger:

-        690.008697      task-clock (msec)         #    0.996 CPUs utilized            ( +-  0.45% )
+        699.526509      task-clock (msec)         #    0.994 CPUs utilized            ( +-  0.28% )
-            26,796      context-switches          #    0.039 M/sec                    ( +-  8.31% )
+            29,088      context-switches          #    0.042 M/sec                    ( +-  6.62% )
-            35,477      page-faults               #    0.051 M/sec                    ( +-  0.11% )
+            35,504      page-faults               #    0.051 M/sec                    ( +-  0.14% )
-     2,157,701,609      cycles                    #    3.127 GHz                      ( +-  0.35% )
+     2,143,781,407      cycles                    #    3.065 GHz                      ( +-  0.25% )
-     3,115,212,672      instructions              #    1.44  insn per cycle           ( +-  0.28% )
+     3,253,499,549      instructions              #    1.52  insn per cycle           ( +-  0.19% )
-       661,888,593      branches                  #  959.247 M/sec                    ( +-  0.36% )
+       707,862,655      branches                  # 1011.917 M/sec                    ( +-  0.20% )
-         2,793,203      branch-misses             #    0.42% of all branches          ( +-  1.04% )
+         3,453,397      branch-misses             #    0.49% of all branches          ( +-  0.32% )
-       0.693004918 seconds time elapsed                                          ( +-  0.45% )
+       0.703630988 seconds time elapsed                                          ( +-  0.27% )

This tipped the scales, and despite higher insns/cycle, run time is worse.

The other "perf bench sched messaging" run was more lucky:

-        706.944245      task-clock (msec)         #    0.995 CPUs utilized            ( +-  0.32% )
+        687.038856      task-clock (msec)         #    0.993 CPUs utilized            ( +-  0.31% )
-            23,489      context-switches          #    0.033 M/sec                    ( +-  7.02% )
+            26,644      context-switches          #    0.039 M/sec                    ( +-  7.46% )
-            35,360      page-faults               #    0.050 M/sec                    ( +-  0.12% )
+            35,417      page-faults               #    0.052 M/sec                    ( +-  0.13% )
-     2,183,639,816      cycles                    #    3.089 GHz                      ( +-  0.35% )
+     2,123,086,753      cycles                    #    3.090 GHz                      ( +-  0.27% )
-     3,131,362,238      instructions              #    1.43  insn per cycle           ( +-  0.19% )
+     3,236,613,433      instructions              #    1.52  insn per cycle           ( +-  0.19% )
-       667,874,319      branches                  #  944.734 M/sec                    ( +-  0.21% )
+       703,677,908      branches                  # 1024.219 M/sec                    ( +-  0.20% )
-         2,859,521      branch-misses             #    0.43% of all branches          ( +-  0.56% )
+         3,462,063      branch-misses             #    0.49% of all branches          ( +-  0.33% )
-       0.710738536 seconds time elapsed                                          ( +-  0.32% )
+       0.691908533 seconds time elapsed                                          ( +-  0.31% )

However, it still had more branches (~5% more), and worse branch miss percentage.

The patch seems to work. It also does not bloat the kernel:

    text	   data	    bss	    dec	    hex	filename
8199556	5026784	2924544	16150884 f67164	vmlinux
8199897	5026784	2924544	16151225 f672b9	vmlinux.patched

However, a "conditional CLI/STI from r/m" insn could be better still.

The patch is attached.

--------------EE820730804703E300A78285
Content-Type: text/x-patch;
 name="z.diff"
Content-Transfer-Encoding: 7bit
Content-Disposition: attachment;
 filename="z.diff"

diff -urp linux-4.7.1.org/arch/x86/include/asm/irqflags.h linux-4.7.1.obj/arch/x86/include/asm/irqflags.h
--- linux-4.7.1.org/arch/x86/include/asm/irqflags.h	2016-08-16 09:35:15.000000000 +0200
+++ linux-4.7.1.obj/arch/x86/include/asm/irqflags.h	2016-08-18 10:01:09.514219644 +0200
@@ -44,6 +44,16 @@ static inline void native_irq_enable(voi
 	asm volatile("sti": : :"memory");
 }
 
+/*
+ * This produces a "test; jz; sti" insn sequence.
+ * It's faster than "push reg; popf". If sti is skipped, it's much faster.
+ */
+static inline void native_cond_irq_enable(unsigned long flags)
+{
+	if (flags & X86_EFLAGS_IF)
+		native_irq_enable();
+}
+
 static inline void native_safe_halt(void)
 {
 	asm volatile("sti; hlt": : :"memory");
@@ -69,7 +79,8 @@ static inline notrace unsigned long arch
 
 static inline notrace void arch_local_irq_restore(unsigned long flags)
 {
-	native_restore_fl(flags);
+	if (flags & X86_EFLAGS_IF)
+		native_irq_enable();
 }
 
 static inline notrace void arch_local_irq_disable(void)
diff -urp linux-4.7.1.org/arch/x86/kernel/paravirt.c linux-4.7.1.obj/arch/x86/kernel/paravirt.c
--- linux-4.7.1.org/arch/x86/kernel/paravirt.c	2016-08-16 09:35:15.000000000 +0200
+++ linux-4.7.1.obj/arch/x86/kernel/paravirt.c	2016-08-18 10:01:24.797155109 +0200
@@ -313,7 +313,7 @@ struct pv_time_ops pv_time_ops = {
 
 __visible struct pv_irq_ops pv_irq_ops = {
 	.save_fl = __PV_IS_CALLEE_SAVE(native_save_fl),
-	.restore_fl = __PV_IS_CALLEE_SAVE(native_restore_fl),
+	.restore_fl = __PV_IS_CALLEE_SAVE(native_cond_irq_enable),
 	.irq_disable = __PV_IS_CALLEE_SAVE(native_irq_disable),
 	.irq_enable = __PV_IS_CALLEE_SAVE(native_irq_enable),
 	.safe_halt = native_safe_halt,
diff -urp linux-4.7.1.org/arch/x86/kernel/paravirt_patch_32.c linux-4.7.1.obj/arch/x86/kernel/paravirt_patch_32.c
--- linux-4.7.1.org/arch/x86/kernel/paravirt_patch_32.c	2016-08-16 09:35:15.000000000 +0200
+++ linux-4.7.1.obj/arch/x86/kernel/paravirt_patch_32.c	2016-08-18 04:43:19.388102755 +0200
@@ -2,7 +2,7 @@
 
 DEF_NATIVE(pv_irq_ops, irq_disable, "cli");
 DEF_NATIVE(pv_irq_ops, irq_enable, "sti");
-DEF_NATIVE(pv_irq_ops, restore_fl, "push %eax; popf");
+DEF_NATIVE(pv_irq_ops, restore_fl, "testb $((1<<9)>>8),%ah; jz 1f; sti; 1: ;");
 DEF_NATIVE(pv_irq_ops, save_fl, "pushf; pop %eax");
 DEF_NATIVE(pv_cpu_ops, iret, "iret");
 DEF_NATIVE(pv_mmu_ops, read_cr2, "mov %cr2, %eax");
diff -urp linux-4.7.1.org/arch/x86/kernel/paravirt_patch_64.c linux-4.7.1.obj/arch/x86/kernel/paravirt_patch_64.c
--- linux-4.7.1.org/arch/x86/kernel/paravirt_patch_64.c	2016-08-16 09:35:15.000000000 +0200
+++ linux-4.7.1.obj/arch/x86/kernel/paravirt_patch_64.c	2016-08-18 04:42:56.364545509 +0200
@@ -4,7 +4,7 @@
 
 DEF_NATIVE(pv_irq_ops, irq_disable, "cli");
 DEF_NATIVE(pv_irq_ops, irq_enable, "sti");
-DEF_NATIVE(pv_irq_ops, restore_fl, "pushq %rdi; popfq");
+DEF_NATIVE(pv_irq_ops, restore_fl, "testw $(1<<9),%di; jz 1f; sti; 1: ;");
 DEF_NATIVE(pv_irq_ops, save_fl, "pushfq; popq %rax");
 DEF_NATIVE(pv_mmu_ops, read_cr2, "movq %cr2, %rax");
 DEF_NATIVE(pv_mmu_ops, read_cr3, "movq %cr3, %rax");

--------------EE820730804703E300A78285--