All of lore.kernel.org
 help / color / mirror / Atom feed
From: Chris Wilson <chris@chris-wilson.co.uk>
To: "H. Peter Anvin" <hpa@zytor.com>
Cc: Andy Lutomirski <luto@amacapital.net>,
	"linux-kernel@vger.kernel.org" <linux-kernel@vger.kernel.org>,
	Ross Zwisler <ross.zwisler@linux.intel.com>,
	"H . Peter Anvin" <hpa@linux.intel.com>,
	Borislav Petkov <bp@alien8.de>, Brian Gerst <brgerst@gmail.com>,
	Denys Vlasenko <dvlasenk@redhat.com>,
	Linus Torvalds <torvalds@linux-foundation.org>,
	Thomas Gleixner <tglx@linutronix.de>,
	Imre Deak <imre.deak@intel.com>,
	Daniel Vetter <daniel.vetter@ffwll.ch>,
	DRI <dri-devel@lists.freedesktop.org>
Subject: Re: [PATCH] x86: Add an explicit barrier() to clflushopt()
Date: Thu, 7 Jan 2016 21:54:01 +0000	[thread overview]
Message-ID: <20160107215401.GB25144@nuc-i3427.alporthouse.com> (raw)
In-Reply-To: <568ED31F.1090004@zytor.com>

On Thu, Jan 07, 2016 at 01:05:35PM -0800, H. Peter Anvin wrote:
> On 01/07/16 11:44, Chris Wilson wrote:
> > 
> > Now I feel silly. Looking at the .s, there is no difference with the
> > addition of the barrier to clflush_cache_range(). And sure enough
> > letting the test run for longer, we see a failure. I fell for a placebo.
> > 
> > The failing assertion is always on the last cacheline and is always one
> > value behind. Oh well, back to wondering where we miss the flush.
> > -Chris
> > 
> 
> Could you include the assembly here?

Sure, here you go:

# Compiler-generated x86-64 listing (AT&T syntax) of clflush_cache_range(),
# whose C prototype in the patch quoted further down this message is
#     void clflush_cache_range(void *vaddr, unsigned int size)
# Register use as seen in this listing (arguments presumably arrive per the
# SysV AMD64 convention: %rdi = vaddr, %esi = size):
#     %rdi  running pointer p, rounded down to a cache-line boundary
#     %rsi  vend = vaddr + size
#     %rax  scratch: the 16-bit value loaded from boot_cpu_data+198, which the
#           message text identifies as boot_cpu_data.x86_clflush_size
# Shape: mfence; flush one line per iteration of the .L217 loop; mfence.
.LHOTB18:
	.p2align 4,,15
	.globl	clflush_cache_range
	.type	clflush_cache_range, @function
clflush_cache_range:
.LFB2505:
	.loc 1 131 0
	.cfi_startproc
.LVL194:
# Call to __fentry__ emitted at function entry (presumably the ftrace hook
# -- confirm against the kernel build configuration).
1:	call	__fentry__
	.loc 1 132 0
# Zero-extending 16-bit load of the cache-line size into %eax.
	movzwl	boot_cpu_data+198(%rip), %eax
	.loc 1 131 0
# Frame-pointer prologue: the save of %rbp here is paired with the
# "movq %rsp, %rbp" a few instructions below.
	pushq	%rbp
	.cfi_def_cfa_offset 16
	.cfi_offset 6, -16
	.loc 1 133 0
# A 32-bit register write clears the upper half, so this zero-extends the
# unsigned int size into %rsi before the 64-bit add.
	movl	%esi, %esi
.LVL195:
# %rsi = vaddr + size, i.e. vend.
	addq	%rdi, %rsi
.LVL196:
	.loc 1 131 0
	movq	%rsp, %rbp
	.cfi_def_cfa_register 6
	.loc 1 132 0
# %rax = line size - 1 (clflush_mask), widened from 32 to 64 bits by cltq.
	subl	$1, %eax
	cltq
.LVL197:
	.loc 1 136 0
# Inline asm from pageattr.c line 136: the first mfence, issued before any
# line is flushed (the mb() preceding the loop in the C source).
#APP
# 136 "arch/x86/mm/pageattr.c" 1
	mfence
# 0 "" 2
	.loc 1 138 0
#NO_APP
# p = vaddr & ~clflush_mask: round the start address down to a line boundary.
	notq	%rax
.LVL198:
	andq	%rax, %rdi
.LVL199:
# Unsigned compare: if vend <= p there is nothing to flush, so skip the loop.
	cmpq	%rdi, %rsi
	jbe	.L216
# Loop head: one cache line is flushed per iteration.
.L217:
.LBB1741:
.LBB1742:
	.loc 8 198 0
# Inline asm from special_insns.h line 198: a runtime-patchable instruction.
# Labels 661..662 bracket the instruction assembled in place, a prefixed
# clflush of (%rdi). Labels 662..663 bracket the padding: .skip emits 0x90
# bytes when the replacement (6641..6651) is longer than the original.
#APP
# 198 "./arch/x86/include/asm/special_insns.h" 1
	661:
	.byte 0x3e; clflush (%rdi)
662:
.skip -(((6651f-6641f)-(662b-661b)) > 0) * ((6651f-6641f)-(662b-661b)),0x90
663:
# Patch-table record in .altinstructions: relative offset of the original
# site, relative offset of the replacement, a feature word (9*32+23,
# presumably X86_FEATURE_CLFLUSHOPT -- confirm against cpufeature.h), then
# three length bytes: padded original, replacement, and padding.
.pushsection .altinstructions,"a"
 .long 661b - .
 .long 6641f - .
 .word ( 9*32+23)
 .byte 663b-661b
 .byte 6651f-6641f
 .byte 663b-662b
.popsection
# Replacement bytes, stored out of line: the same clflush with a 0x66
# prefix instead, which is the encoding of clflushopt (%rdi).
.pushsection .altinstr_replacement, "ax"
6641:
	.byte 0x66; clflush (%rdi)
6651:
	.popsection
# 0 "" 2
#NO_APP
.LBE1742:
.LBE1741:
	.loc 1 141 0
	.loc 1 139 0
# p += line size. The size is re-read from boot_cpu_data on every iteration
# rather than kept in a register; this is the reload the message points out.
	movzwl	boot_cpu_data+198(%rip), %eax
	addq	%rax, %rdi
	.loc 1 138 0
# Unsigned compare: keep looping while vend > p.
	cmpq	%rdi, %rsi
	ja	.L217
.L216:
	.loc 1 144 0
# Inline asm from pageattr.c line 144: the second mfence, issued after the
# last flush (the mb() following the loop in the C source).
#APP
# 144 "arch/x86/mm/pageattr.c" 1
	mfence
# 0 "" 2
	.loc 1 145 0
#NO_APP
# Epilogue: restore the caller's %rbp and return.
	popq	%rbp
	.cfi_restore 6
	.cfi_def_cfa 7, 8
	ret
	.cfi_endproc
.LFE2505:
	.size	clflush_cache_range, .-clflush_cache_range
	.section	.text.unlikely


Whilst you are looking at this asm, note that we reload
boot_cpu_data.x86_clflush_size every time around the loop. That's a small
but noticeable extra cost (especially when we are only flushing a single
cacheline).

diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index a3137a4..2cd2b4b 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -129,14 +129,13 @@ within(unsigned long addr, unsigned long start, unsigned long end)
  */
 void clflush_cache_range(void *vaddr, unsigned int size)
 {
-       unsigned long clflush_mask = boot_cpu_data.x86_clflush_size - 1;
+       unsigned long clflush_size = boot_cpu_data.x86_clflush_size;
        void *vend = vaddr + size;
-       void *p;
+       void *p = (void *)((unsigned long)vaddr & ~(clflush_size - 1));
 
        mb();
 
-       for (p = (void *)((unsigned long)vaddr & ~clflush_mask);
-            p < vend; p += boot_cpu_data.x86_clflush_size)
+       for (; p < vend; p += clflush_size)
                clflushopt(p);
 
        mb();

-Chris

-- 
Chris Wilson, Intel Open Source Technology Centre

WARNING: multiple messages have this Message-ID (diff)
From: Chris Wilson <chris@chris-wilson.co.uk>
To: "H. Peter Anvin" <hpa@zytor.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>,
	Brian Gerst <brgerst@gmail.com>,
	"linux-kernel@vger.kernel.org" <linux-kernel@vger.kernel.org>,
	DRI <dri-devel@lists.freedesktop.org>,
	Andy Lutomirski <luto@amacapital.net>,
	Borislav Petkov <bp@alien8.de>,
	Daniel Vetter <daniel.vetter@ffwll.ch>,
	Ross Zwisler <ross.zwisler@linux.intel.com>,
	"H . Peter Anvin" <hpa@linux.intel.com>,
	Linus Torvalds <torvalds@linux-foundation.org>,
	Thomas Gleixner <tglx@linutronix.de>
Subject: Re: [PATCH] x86: Add an explicit barrier() to clflushopt()
Date: Thu, 7 Jan 2016 21:54:01 +0000	[thread overview]
Message-ID: <20160107215401.GB25144@nuc-i3427.alporthouse.com> (raw)
In-Reply-To: <568ED31F.1090004@zytor.com>

On Thu, Jan 07, 2016 at 01:05:35PM -0800, H. Peter Anvin wrote:
> On 01/07/16 11:44, Chris Wilson wrote:
> > 
> > Now I feel silly. Looking at the .s, there is no difference with the
> > addition of the barrier to clflush_cache_range(). And sure enough
> > letting the test run for longer, we see a failure. I fell for a placebo.
> > 
> > The failing assertion is always on the last cacheline and is always one
> > value behind. Oh well, back to wondering where we miss the flush.
> > -Chris
> > 
> 
> Could you include the assembly here?

Sure, here you go:

# Compiler-generated x86-64 listing (AT&T syntax) of clflush_cache_range(),
# whose C prototype in the patch quoted further down this message is
#     void clflush_cache_range(void *vaddr, unsigned int size)
# Register use as seen in this listing (arguments presumably arrive per the
# SysV AMD64 convention: %rdi = vaddr, %esi = size):
#     %rdi  running pointer p, rounded down to a cache-line boundary
#     %rsi  vend = vaddr + size
#     %rax  scratch: the 16-bit value loaded from boot_cpu_data+198, which the
#           message text identifies as boot_cpu_data.x86_clflush_size
# Shape: mfence; flush one line per iteration of the .L217 loop; mfence.
.LHOTB18:
	.p2align 4,,15
	.globl	clflush_cache_range
	.type	clflush_cache_range, @function
clflush_cache_range:
.LFB2505:
	.loc 1 131 0
	.cfi_startproc
.LVL194:
# Call to __fentry__ emitted at function entry (presumably the ftrace hook
# -- confirm against the kernel build configuration).
1:	call	__fentry__
	.loc 1 132 0
# Zero-extending 16-bit load of the cache-line size into %eax.
	movzwl	boot_cpu_data+198(%rip), %eax
	.loc 1 131 0
# Frame-pointer prologue: the save of %rbp here is paired with the
# "movq %rsp, %rbp" a few instructions below.
	pushq	%rbp
	.cfi_def_cfa_offset 16
	.cfi_offset 6, -16
	.loc 1 133 0
# A 32-bit register write clears the upper half, so this zero-extends the
# unsigned int size into %rsi before the 64-bit add.
	movl	%esi, %esi
.LVL195:
# %rsi = vaddr + size, i.e. vend.
	addq	%rdi, %rsi
.LVL196:
	.loc 1 131 0
	movq	%rsp, %rbp
	.cfi_def_cfa_register 6
	.loc 1 132 0
# %rax = line size - 1 (clflush_mask), widened from 32 to 64 bits by cltq.
	subl	$1, %eax
	cltq
.LVL197:
	.loc 1 136 0
# Inline asm from pageattr.c line 136: the first mfence, issued before any
# line is flushed (the mb() preceding the loop in the C source).
#APP
# 136 "arch/x86/mm/pageattr.c" 1
	mfence
# 0 "" 2
	.loc 1 138 0
#NO_APP
# p = vaddr & ~clflush_mask: round the start address down to a line boundary.
	notq	%rax
.LVL198:
	andq	%rax, %rdi
.LVL199:
# Unsigned compare: if vend <= p there is nothing to flush, so skip the loop.
	cmpq	%rdi, %rsi
	jbe	.L216
# Loop head: one cache line is flushed per iteration.
.L217:
.LBB1741:
.LBB1742:
	.loc 8 198 0
# Inline asm from special_insns.h line 198: a runtime-patchable instruction.
# Labels 661..662 bracket the instruction assembled in place, a prefixed
# clflush of (%rdi). Labels 662..663 bracket the padding: .skip emits 0x90
# bytes when the replacement (6641..6651) is longer than the original.
#APP
# 198 "./arch/x86/include/asm/special_insns.h" 1
	661:
	.byte 0x3e; clflush (%rdi)
662:
.skip -(((6651f-6641f)-(662b-661b)) > 0) * ((6651f-6641f)-(662b-661b)),0x90
663:
# Patch-table record in .altinstructions: relative offset of the original
# site, relative offset of the replacement, a feature word (9*32+23,
# presumably X86_FEATURE_CLFLUSHOPT -- confirm against cpufeature.h), then
# three length bytes: padded original, replacement, and padding.
.pushsection .altinstructions,"a"
 .long 661b - .
 .long 6641f - .
 .word ( 9*32+23)
 .byte 663b-661b
 .byte 6651f-6641f
 .byte 663b-662b
.popsection
# Replacement bytes, stored out of line: the same clflush with a 0x66
# prefix instead, which is the encoding of clflushopt (%rdi).
.pushsection .altinstr_replacement, "ax"
6641:
	.byte 0x66; clflush (%rdi)
6651:
	.popsection
# 0 "" 2
#NO_APP
.LBE1742:
.LBE1741:
	.loc 1 141 0
	.loc 1 139 0
# p += line size. The size is re-read from boot_cpu_data on every iteration
# rather than kept in a register; this is the reload the message points out.
	movzwl	boot_cpu_data+198(%rip), %eax
	addq	%rax, %rdi
	.loc 1 138 0
# Unsigned compare: keep looping while vend > p.
	cmpq	%rdi, %rsi
	ja	.L217
.L216:
	.loc 1 144 0
# Inline asm from pageattr.c line 144: the second mfence, issued after the
# last flush (the mb() following the loop in the C source).
#APP
# 144 "arch/x86/mm/pageattr.c" 1
	mfence
# 0 "" 2
	.loc 1 145 0
#NO_APP
# Epilogue: restore the caller's %rbp and return.
	popq	%rbp
	.cfi_restore 6
	.cfi_def_cfa 7, 8
	ret
	.cfi_endproc
.LFE2505:
	.size	clflush_cache_range, .-clflush_cache_range
	.section	.text.unlikely


Whilst you are looking at this asm, note that we reload
boot_cpu_data.x86_clflush_size every time around the loop. That's a small
but noticeable extra cost (especially when we are only flushing a single
cacheline).

diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index a3137a4..2cd2b4b 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -129,14 +129,13 @@ within(unsigned long addr, unsigned long start, unsigned long end)
  */
 void clflush_cache_range(void *vaddr, unsigned int size)
 {
-       unsigned long clflush_mask = boot_cpu_data.x86_clflush_size - 1;
+       unsigned long clflush_size = boot_cpu_data.x86_clflush_size;
        void *vend = vaddr + size;
-       void *p;
+       void *p = (void *)((unsigned long)vaddr & ~(clflush_size - 1));
 
        mb();
 
-       for (p = (void *)((unsigned long)vaddr & ~clflush_mask);
-            p < vend; p += boot_cpu_data.x86_clflush_size)
+       for (; p < vend; p += clflush_size)
                clflushopt(p);
 
        mb();

-Chris

-- 
Chris Wilson, Intel Open Source Technology Centre
_______________________________________________
dri-devel mailing list
dri-devel@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/dri-devel

  reply	other threads:[~2016-01-07 21:54 UTC|newest]

Thread overview: 55+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2015-10-19  9:58 [PATCH] x86: Add an explicit barrier() to clflushopt() Chris Wilson
2015-10-19  9:58 ` Chris Wilson
2015-10-19 10:16 ` Borislav Petkov
2015-10-19 10:16   ` Borislav Petkov
2015-10-19 11:05   ` Chris Wilson
2015-10-19 11:05     ` Chris Wilson
2015-10-19 11:25     ` Borislav Petkov
2015-10-19 11:25       ` Borislav Petkov
2015-10-19 18:29 ` Ross Zwisler
2016-01-07 10:16 ` Chris Wilson
2016-01-07 10:16   ` Chris Wilson
2016-01-07 17:55   ` Andy Lutomirski
2016-01-07 17:55     ` Andy Lutomirski
2016-01-07 19:44     ` Chris Wilson
2016-01-07 19:44       ` Chris Wilson
2016-01-07 21:05       ` H. Peter Anvin
2016-01-07 21:54         ` Chris Wilson [this message]
2016-01-07 21:54           ` Chris Wilson
2016-01-07 22:29           ` H. Peter Anvin
2016-01-07 22:29             ` H. Peter Anvin
2016-01-07 22:32             ` H. Peter Anvin
2016-01-07 22:32               ` H. Peter Anvin
2016-01-09  5:55               ` H. Peter Anvin
2016-01-09  8:01               ` Chris Wilson
2016-01-09  8:01                 ` Chris Wilson
2016-01-09 22:36                 ` Andy Lutomirski
2016-01-09 22:36                   ` Andy Lutomirski
2016-01-11 11:28                   ` Chris Wilson
2016-01-11 11:28                     ` Chris Wilson
2016-01-11 20:11                     ` Linus Torvalds
2016-01-11 20:11                       ` Linus Torvalds
2016-01-11 21:05                       ` Chris Wilson
2016-01-11 21:05                         ` Chris Wilson
2016-01-12 16:37                         ` Chris Wilson
2016-01-12 16:37                           ` Chris Wilson
2016-01-12 17:05                           ` Linus Torvalds
2016-01-12 17:05                             ` Linus Torvalds
2016-01-12 21:13                             ` Chris Wilson
2016-01-12 21:13                               ` Chris Wilson
2016-01-12 22:07                               ` Linus Torvalds
2016-01-12 22:07                                 ` Linus Torvalds
2016-01-13  0:55                                 ` Chris Wilson
2016-01-13  0:55                                   ` Chris Wilson
2016-01-13  2:06                                   ` Linus Torvalds
2016-01-13  2:06                                     ` Linus Torvalds
2016-01-13  2:42                                     ` Andy Lutomirski
2016-01-13  2:42                                       ` Andy Lutomirski
2016-01-13  4:39                                       ` Linus Torvalds
2016-01-13  4:39                                         ` Linus Torvalds
2016-01-13 12:34                                     ` Chris Wilson
2016-01-13 12:34                                       ` Chris Wilson
2016-01-13 18:45                                       ` Linus Torvalds
2016-01-13 18:45                                         ` Linus Torvalds
2016-01-12 17:17                     ` H. Peter Anvin
2016-01-12 17:17                       ` H. Peter Anvin

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20160107215401.GB25144@nuc-i3427.alporthouse.com \
    --to=chris@chris-wilson.co.uk \
    --cc=bp@alien8.de \
    --cc=brgerst@gmail.com \
    --cc=daniel.vetter@ffwll.ch \
    --cc=dri-devel@lists.freedesktop.org \
    --cc=dvlasenk@redhat.com \
    --cc=hpa@linux.intel.com \
    --cc=hpa@zytor.com \
    --cc=imre.deak@intel.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=luto@amacapital.net \
    --cc=ross.zwisler@linux.intel.com \
    --cc=tglx@linutronix.de \
    --cc=torvalds@linux-foundation.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.