All of lore.kernel.org
 help / color / mirror / Atom feed
* Re: clang asm goto issue small reproducer
       [not found] <157245708.33559.1639605989199.JavaMail.zimbra@efficios.com>
@ 2021-12-15 23:50 ` Nick Desaulniers
  2021-12-16  0:18   ` Nick Desaulniers
  0 siblings, 1 reply; 6+ messages in thread
From: Nick Desaulniers @ 2021-12-15 23:50 UTC (permalink / raw)
  To: Mathieu Desnoyers
  Cc: Peter Zijlstra, llvm, Bill Wendling, James Y Knight, Eli Friedman

[-- Attachment #1: Type: text/plain, Size: 2674 bytes --]

On Wed, Dec 15, 2021 at 2:06 PM Mathieu Desnoyers
<mathieu.desnoyers@efficios.com> wrote:
>
> Hi Nick,
>
> I have a small reproducer for the asm goto issue we discussed over irc. I have
> axed away all code that is not the problematic function by hand. Hopefully this
> won't change the overall optimization context too much.

If I cut the compilation pipeline off at after "middle-end"
optimizations, I can see that `this_cpu_list_pop` returns `NULL` for
test.c while in test-asm.c, it's a phi node of either NULL or head
(which looks correct; for the test-asm.c case).

"GVNPass" seems to be converting

return:                                           ; preds =
%cleanup.thread19, %cleanup.thread
  %retval.118 = phi %struct.percpu_list_node* [ %9, %cleanup.thread ],
[ null, %cleanup.thread19 ]
  ret %struct.percpu_list_node* %retval.118

into:

return:                                           ; preds =
%cleanup.thread19, %cleanup.thread
  ret %struct.percpu_list_node* null

Which drops the
  %9 = load %struct.percpu_list_node*, %struct.percpu_list_node**
%head, align 8, !tbaa !9

I think GVN may be eliminating a phi that it should not be.
Specifically, it seems that GVN thinks the default/fallthrough
successor is unreachable, which seems incorrect to me. The predecessor
lists all look correct to me, so GVN may be stumbling on callbr
itself.

$ clang -O2 test.c -emit-llvm -S -o - -Xclang -disable-llvm-passes > test.ll
$ opt -O2 -print-before=gvn -print-after=gvn
-filter-print-funcs=this_cpu_list_pop test.ll -o /dev/null 2>
test.gvn.ll

Then using this reproducer, I was able to narrow it down a little further:

$ opt -O2 -print-before=gvn -print-module-scope
--filter-print-funcs=this_cpu_list_pop test.ll -S 2>test.pre-gvn.ll
$ llvm-extract --func=this_cpu_list_pop test.pre-gvn.ll -o
test.pre-gvn.this_cpu_list_pop.bc --recursive
$ llvm-dis test.pre-gvn.this_cpu_list_pop.bc
$ cat repro.sh
#!/usr/bin/env sh
set -e
grep -q "%retval.118 = phi %struct.percpu_list_node\* \[ %9,
%cleanup.thread \], \[ null, %cleanup.thread19 \]" $1
grep -q "ret %struct.percpu_list_node\* %retval.118" $1
OUT=$(opt -passes=gvn $1 -S -o -)
echo $OUT | grep -q "ret %struct.percpu_list_node\* null"
$ llvm-reduce --test=./repro.sh test.pre-gvn.this_cpu_list_pop.ll

Then pared down further manually, see reduced.ll

I've filed https://github.com/llvm/llvm-project/issues/52735, let's
discuss more there?

>
> on x86-64:
>
> Buggy code:
> clang-13 -O2 -o test.o test.c
>
> OK code (added a dummy asm("")):
> clang-13 -O2 -o test-asm.o test-asm.c
>
> Thanks,
>
> Mathieu
>
> --
> Mathieu Desnoyers
> EfficiOS Inc.
> http://www.efficios.com



-- 
Thanks,
~Nick Desaulniers

[-- Attachment #2: test.gvn.ll --]
[-- Type: application/octet-stream, Size: 6629 bytes --]

*** IR Dump Before GVNPass on this_cpu_list_pop ***
; Function Attrs: nounwind uwtable
define dso_local %struct.percpu_list_node* @this_cpu_list_pop(%struct.percpu_list* %list, i32* writeonly %_cpu) local_unnamed_addr #0 {
entry:
  %head = alloca %struct.percpu_list_node*, align 8
  %0 = bitcast %struct.percpu_list_node** %head to i8*
  %1 = tail call i8* asm "mov %fs:0, $0", "=r,~{dirflag},~{fpsr},~{flags}"() #3, !srcloc !3
  %2 = load i32, i32* @rseq_offset, align 4, !tbaa !4
  %idx.ext.i.i11 = sext i32 %2 to i64
  %add.ptr.i.i12 = getelementptr i8, i8* %1, i64 %idx.ext.i.i11
  %cpu_id_start.i = bitcast i8* %add.ptr.i.i12 to i32*
  %3 = bitcast %struct.percpu_list_node** %head to i64*
  %4 = bitcast i8* %add.ptr.i.i12 to %struct.rseq*
  br label %for.cond

for.cond:                                         ; preds = %cleanup, %entry
  call void @llvm.lifetime.start.p0i8(i64 8, i8* nonnull %0) #4
  %5 = load volatile i32, i32* %cpu_id_start.i, align 32, !tbaa !4
  %idxprom = sext i32 %5 to i64
  %head1 = getelementptr inbounds %struct.percpu_list, %struct.percpu_list* %list, i64 0, i32 0, i64 %idxprom, i32 0
  %6 = bitcast %struct.percpu_list_node** %head1 to i64*
  callbr void asm sideeffect ".pushsection __rseq_cs, \22aw\22\0A\09.balign 32\0A\093:\0A\09.long 0x0, 0x0\0A\09.quad 1f, (2f - 1f), 4f\0A\09.popsection\0A\09.pushsection __rseq_cs_ptr_array, \22aw\22\0A\09.quad 3b\0A\09.popsection\0A\09.pushsection __rseq_exit_point_array, \22aw\22\0A\09.quad 1f, ${7:l}\0A\09.popsection\0A\09leaq 3b(%rip), %rax\0A\09movq %rax, 8($1)\0A\091:\0A\09cmpl $0, 4($1)\0A\09jnz 4f\0A\09movq $2, %rbx\0A\09cmpq %rbx, $3\0A\09je ${7:l}\0A\09movq %rbx, $5\0A\09addq $4, %rbx\0A\09movq (%rbx), %rbx\0A\09ud2\0A\09movq %rbx, $2\0A\092:\0A\09.pushsection __rseq_failure, \22ax\22\0A\09.byte 0x0f, 0xb9, 0x3d\0A\09.long 0x53053053\0A\094:\0A\09jmp ${6:l}\0A\09.popsection\0A\09", "r,r,*m,r,er,*m,i,i,~{memory},~{cc},~{rax},~{rbx},~{dirflag},~{fpsr},~{flags}"(i32 %5, %struct.rseq* %4, i64* %6, i64 0, i64 8, i64* nonnull %3, i8* blockaddress(@this_cpu_list_pop, %cleanup), i8* blockaddress(@this_cpu_list_pop, %cleanup.thread19)) #4
          to label %if.then [label %cleanup, label %cleanup.thread19], !srcloc !8

cleanup.thread19:                                 ; preds = %for.cond
  %7 = bitcast %struct.percpu_list_node** %head to i8*
  call void @llvm.lifetime.end.p0i8(i64 8, i8* nonnull %7) #4
  br label %return

if.then:                                          ; preds = %for.cond
  %.lcssa = phi i32 [ %5, %for.cond ]
  %8 = bitcast %struct.percpu_list_node** %head to i8*
  %tobool6.not = icmp eq i32* %_cpu, null
  br i1 %tobool6.not, label %cleanup.thread, label %if.then7

if.then7:                                         ; preds = %if.then
  store i32 %.lcssa, i32* %_cpu, align 4, !tbaa !4
  br label %cleanup.thread

cleanup.thread:                                   ; preds = %if.then, %if.then7
  %9 = load %struct.percpu_list_node*, %struct.percpu_list_node** %head, align 8, !tbaa !9
  call void @llvm.lifetime.end.p0i8(i64 8, i8* nonnull %8) #4
  br label %return

cleanup:                                          ; preds = %for.cond
  call void @llvm.lifetime.end.p0i8(i64 8, i8* nonnull %0) #4
  br label %for.cond

return:                                           ; preds = %cleanup.thread19, %cleanup.thread
  %retval.118 = phi %struct.percpu_list_node* [ %9, %cleanup.thread ], [ null, %cleanup.thread19 ]
  ret %struct.percpu_list_node* %retval.118
}
*** IR Dump After GVNPass on this_cpu_list_pop ***
; Function Attrs: nounwind uwtable
define dso_local %struct.percpu_list_node* @this_cpu_list_pop(%struct.percpu_list* %list, i32* writeonly %_cpu) local_unnamed_addr #0 {
entry:
  %head = alloca %struct.percpu_list_node*, align 8
  %0 = bitcast %struct.percpu_list_node** %head to i8*
  %1 = tail call i8* asm "mov %fs:0, $0", "=r,~{dirflag},~{fpsr},~{flags}"() #3, !srcloc !3
  %2 = load i32, i32* @rseq_offset, align 4, !tbaa !4
  %idx.ext.i.i11 = sext i32 %2 to i64
  %add.ptr.i.i12 = getelementptr i8, i8* %1, i64 %idx.ext.i.i11
  %cpu_id_start.i = bitcast i8* %add.ptr.i.i12 to i32*
  %3 = bitcast %struct.percpu_list_node** %head to i64*
  %4 = bitcast i8* %add.ptr.i.i12 to %struct.rseq*
  br label %for.cond

for.cond:                                         ; preds = %cleanup, %entry
  call void @llvm.lifetime.start.p0i8(i64 8, i8* nonnull %0) #4
  %5 = load volatile i32, i32* %cpu_id_start.i, align 32, !tbaa !4
  %idxprom = sext i32 %5 to i64
  %head1 = getelementptr inbounds %struct.percpu_list, %struct.percpu_list* %list, i64 0, i32 0, i64 %idxprom, i32 0
  %6 = bitcast %struct.percpu_list_node** %head1 to i64*
  callbr void asm sideeffect ".pushsection __rseq_cs, \22aw\22\0A\09.balign 32\0A\093:\0A\09.long 0x0, 0x0\0A\09.quad 1f, (2f - 1f), 4f\0A\09.popsection\0A\09.pushsection __rseq_cs_ptr_array, \22aw\22\0A\09.quad 3b\0A\09.popsection\0A\09.pushsection __rseq_exit_point_array, \22aw\22\0A\09.quad 1f, ${7:l}\0A\09.popsection\0A\09leaq 3b(%rip), %rax\0A\09movq %rax, 8($1)\0A\091:\0A\09cmpl $0, 4($1)\0A\09jnz 4f\0A\09movq $2, %rbx\0A\09cmpq %rbx, $3\0A\09je ${7:l}\0A\09movq %rbx, $5\0A\09addq $4, %rbx\0A\09movq (%rbx), %rbx\0A\09ud2\0A\09movq %rbx, $2\0A\092:\0A\09.pushsection __rseq_failure, \22ax\22\0A\09.byte 0x0f, 0xb9, 0x3d\0A\09.long 0x53053053\0A\094:\0A\09jmp ${6:l}\0A\09.popsection\0A\09", "r,r,*m,r,er,*m,i,i,~{memory},~{cc},~{rax},~{rbx},~{dirflag},~{fpsr},~{flags}"(i32 %5, %struct.rseq* %4, i64* %6, i64 0, i64 8, i64* nonnull %3, i8* blockaddress(@this_cpu_list_pop, %cleanup), i8* blockaddress(@this_cpu_list_pop, %cleanup.thread19)) #4
          to label %if.then [label %cleanup, label %cleanup.thread19], !srcloc !8

cleanup.thread19:                                 ; preds = %for.cond
  call void @llvm.lifetime.end.p0i8(i64 8, i8* nonnull %0) #4
  br label %return

if.then:                                          ; preds = %for.cond
  %tobool6.not = icmp eq i32* %_cpu, null
  br i1 %tobool6.not, label %cleanup.thread, label %if.then7

if.then7:                                         ; preds = %if.then
  store i32 %5, i32* %_cpu, align 4, !tbaa !4
  br label %cleanup.thread

cleanup.thread:                                   ; preds = %if.then, %if.then7
  call void @llvm.lifetime.end.p0i8(i64 8, i8* nonnull %0) #4
  br label %return

cleanup:                                          ; preds = %for.cond
  call void @llvm.lifetime.end.p0i8(i64 8, i8* nonnull %0) #4
  br label %for.cond

return:                                           ; preds = %cleanup.thread19, %cleanup.thread
  ret %struct.percpu_list_node* null
}

[-- Attachment #3: reduced.ll --]
[-- Type: application/octet-stream, Size: 863 bytes --]

%struct.percpu_list_node = type { i64, %struct.percpu_list_node* }

define %struct.percpu_list_node* @this_cpu_list_pop() {
  %head = alloca %struct.percpu_list_node*, i32 0, align 8
  br label %for.cond

for.cond:
  callbr void asm sideeffect "", "i,i"(i8* blockaddress(@this_cpu_list_pop, %cleanup), i8* blockaddress(@this_cpu_list_pop, %cleanup.thread19))
          to label %cleanup.thread [label %cleanup, label %cleanup.thread19]

cleanup.thread:
  %x = load %struct.percpu_list_node*, %struct.percpu_list_node** %head, align 8
  br label %return

cleanup:
  br label %for.cond

cleanup.thread19:
  br label %return

return:                                           ; preds = %cleanup.thread, %cleanup.thread19
  %retval.118 = phi %struct.percpu_list_node* [ %x, %cleanup.thread ], [ null, %cleanup.thread19 ]
  ret %struct.percpu_list_node* %retval.118
}

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: clang asm goto issue small reproducer
  2021-12-15 23:50 ` clang asm goto issue small reproducer Nick Desaulniers
@ 2021-12-16  0:18   ` Nick Desaulniers
  2021-12-16  0:25     ` Nick Desaulniers
  2021-12-16  0:26     ` Mathieu Desnoyers
  0 siblings, 2 replies; 6+ messages in thread
From: Nick Desaulniers @ 2021-12-16  0:18 UTC (permalink / raw)
  To: Mathieu Desnoyers
  Cc: Peter Zijlstra, llvm, Bill Wendling, James Y Knight, Eli Friedman

On Wed, Dec 15, 2021 at 3:50 PM Nick Desaulniers
<ndesaulniers@google.com> wrote:
>
> On Wed, Dec 15, 2021 at 2:06 PM Mathieu Desnoyers
> <mathieu.desnoyers@efficios.com> wrote:
> >
> > Hi Nick,
> >
> > I have a small reproducer for the asm goto issue we discussed over irc. I have
> > axed away all code that is not the problematic function by hand. Hopefully this
> > won't change the overall optimization context too much.
>
> If I cut the compilation pipeline off at after "middle-end"
> optimizations, I can see that `this_cpu_list_pop` returns `NULL` for
> test.c while in test-asm.c, it's a phi node of either NULL or head
> (which looks correct; for the test-asm.c case).
>
> "GVNPass" seems to be converting
>
> return:                                           ; preds =
> %cleanup.thread19, %cleanup.thread
>   %retval.118 = phi %struct.percpu_list_node* [ %9, %cleanup.thread ],
> [ null, %cleanup.thread19 ]
>   ret %struct.percpu_list_node* %retval.118
>
> into:
>
> return:                                           ; preds =
> %cleanup.thread19, %cleanup.thread
>   ret %struct.percpu_list_node* null
>
> Which drops the
>   %9 = load %struct.percpu_list_node*, %struct.percpu_list_node**
> %head, align 8, !tbaa !9

Ah, Eli pointed out that I'm chasing the wrong squirrel. From test.c we have:

struct percpu_list_node *head;
load = (intptr_t *)&head;
ret = rseq_cmpnev_storeoffp_load(targetptr, expectnot,
     offset, load, cpu);

then in rseq_cmpnev_storeoffp_load():
__asm__ __volatile__ goto (
  :
  :
[load] "m" (*load));

Did we ever initialize head?  Isn't this equivalent to:

int* foo (void) {
  int *head;
  int **load;
  load = &head;
  asm(""::"m"(*load):"memory");
  return head;
}

Isn't that equivalent to:

int* foo (void) {
  int *head;
  asm(""::"m"(head):"memory");
  return head;
}

For which neither compiler warns for -Wuninitialized? They would if
head was an argument to a function call...

>
> I think GVN may be eliminating a phi that it should not be.
> Specifically, it seems that GVN thinks the default/fallthrough
> successor is unreachable, which seems incorrect to me. The predecessor
> lists all look correct to me, so GVN may be stumbling on callbr
> itself.
>
> $ clang -O2 test.c -emit-llvm -S -o - -Xclang -disable-llvm-passes > test.ll
> $ opt -O2 -print-before=gvn -print-after=gvn
> -filter-print-funcs=this_cpu_list_pop test.ll -o /dev/null 2>
> test.gvn.ll
>
> Then using this reproducer, I was able to narrow it down a little further:
>
> $ opt -O2 -print-before=gvn -print-module-scope
> --filter-print-funcs=this_cpu_list_pop test.ll -S 2>test.pre-gvn.ll
> $ llvm-extract --func=this_cpu_list_pop test.pre-gvn.ll -o
> test.pre-gvn.this_cpu_list_pop.bc --recursive
> $ llvm-dis test.pre-gvn.this_cpu_list_pop.bc
> $ cat repro.sh
> #!/usr/bin/env sh
> set -e
> grep -q "%retval.118 = phi %struct.percpu_list_node\* \[ %9,
> %cleanup.thread \], \[ null, %cleanup.thread19 \]" $1
> grep -q "ret %struct.percpu_list_node\* %retval.118" $1
> OUT=$(opt -passes=gvn $1 -S -o -)
> echo $OUT | grep -q "ret %struct.percpu_list_node\* null"
> $ llvm-reduce --test=./repro.sh test.pre-gvn.this_cpu_list_pop.ll
>
> Then pared down further manually, see reduced.ll
>
> I've filed https://github.com/llvm/llvm-project/issues/52735, let's
> discuss more there?
>
> >
> > on x86-64:
> >
> > Buggy code:
> > clang-13 -O2 -o test.o test.c
> >
> > OK code (added a dummy asm("")):
> > clang-13 -O2 -o test-asm.o test-asm.c
> >
> > Thanks,
> >
> > Mathieu
> >
> > --
> > Mathieu Desnoyers
> > EfficiOS Inc.
> > http://www.efficios.com
>
>
>
> --
> Thanks,
> ~Nick Desaulniers



--
Thanks,
~Nick Desaulniers

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: clang asm goto issue small reproducer
  2021-12-16  0:18   ` Nick Desaulniers
@ 2021-12-16  0:25     ` Nick Desaulniers
  2021-12-16  0:29       ` Mathieu Desnoyers
  2021-12-16 16:39       ` Mathieu Desnoyers
  2021-12-16  0:26     ` Mathieu Desnoyers
  1 sibling, 2 replies; 6+ messages in thread
From: Nick Desaulniers @ 2021-12-16  0:25 UTC (permalink / raw)
  To: Mathieu Desnoyers
  Cc: Peter Zijlstra, llvm, Bill Wendling, James Y Knight, Eli Friedman

On Wed, Dec 15, 2021 at 4:18 PM Nick Desaulniers
<ndesaulniers@google.com> wrote:
>
> On Wed, Dec 15, 2021 at 3:50 PM Nick Desaulniers
> <ndesaulniers@google.com> wrote:
> >
> > On Wed, Dec 15, 2021 at 2:06 PM Mathieu Desnoyers
> > <mathieu.desnoyers@efficios.com> wrote:
> > >
> > > Hi Nick,
> > >
> > > I have a small reproducer for the asm goto issue we discussed over irc. I have
> > > axed away all code that is not the problematic function by hand. Hopefully this
> > > won't change the overall optimization context too much.
> >
> > If I cut the compilation pipeline off at after "middle-end"
> > optimizations, I can see that `this_cpu_list_pop` returns `NULL` for
> > test.c while in test-asm.c, it's a phi node of either NULL or head
> > (which looks correct; for the test-asm.c case).
> >
> > "GVNPass" seems to be converting
> >
> > return:                                           ; preds =
> > %cleanup.thread19, %cleanup.thread
> >   %retval.118 = phi %struct.percpu_list_node* [ %9, %cleanup.thread ],
> > [ null, %cleanup.thread19 ]
> >   ret %struct.percpu_list_node* %retval.118
> >
> > into:
> >
> > return:                                           ; preds =
> > %cleanup.thread19, %cleanup.thread
> >   ret %struct.percpu_list_node* null
> >
> > Which drops the
> >   %9 = load %struct.percpu_list_node*, %struct.percpu_list_node**
> > %head, align 8, !tbaa !9
>
> Ah, Eli pointed out that I'm chasing the wrong squirrel. From test.c we have:
>
> struct percpu_list_node *head;
> load = (intptr_t *)&head;
> ret = rseq_cmpnev_storeoffp_load(targetptr, expectnot,
>      offset, load, cpu);
>
> then in rseq_cmpnev_storeoffp_load():
> __asm__ __volatile__ goto (
>   :
>   :
> [load] "m" (*load));
>
> Did we ever initialize head?  Isn't this equivalent to:
>
> int* foo (void) {
>   int *head;
>   int **load;
>   load = &head;
>   asm(""::"m"(*load):"memory");
>   return head;
> }
>
> Isn't that equivalent to:
>
> int* foo (void) {
>   int *head;
>   asm(""::"m"(head):"memory");
>   return head;
> }
>
> For which neither compiler warns for -Wuninitialized? They would if
> head was an argument to a function call...

What's going on here...

$ cat y.c
void x (void) {
  int y;
  asm(""::"r"(y));
}
void z (void) {
  int y;
  asm(""::"m"(y));
}
$ clang -Wuninitialized y.c -c
y.c:3:15: warning: variable 'y' is uninitialized when used here
[-Wuninitialized]
  asm(""::"r"(y));
              ^
y.c:2:8: note: initialize the variable 'y' to silence this warning
  int y;
       ^
        = 0
1 warning generated.
$ gcc -Wuninitialized y.c -c
y.c: In function ‘x’:
y.c:3:3: warning: ‘y’ is used uninitialized [-Wuninitialized]
    3 |   asm(""::"r"(y));
      |   ^~~

Only 1 warning each? I expected 2.

Is there something special about "m" constraints and -Wuninitialized?

-- 
Thanks,
~Nick Desaulniers

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: clang asm goto issue small reproducer
  2021-12-16  0:18   ` Nick Desaulniers
  2021-12-16  0:25     ` Nick Desaulniers
@ 2021-12-16  0:26     ` Mathieu Desnoyers
  1 sibling, 0 replies; 6+ messages in thread
From: Mathieu Desnoyers @ 2021-12-16  0:26 UTC (permalink / raw)
  To: ndesaulniers
  Cc: Peter Zijlstra, llvm, Bill Wendling, James Y Knight, Eli Friedman

----- On Dec 15, 2021, at 7:18 PM, ndesaulniers ndesaulniers@google.com wrote:

> On Wed, Dec 15, 2021 at 3:50 PM Nick Desaulniers
> <ndesaulniers@google.com> wrote:
>>
>> On Wed, Dec 15, 2021 at 2:06 PM Mathieu Desnoyers
>> <mathieu.desnoyers@efficios.com> wrote:
>> >
>> > Hi Nick,
>> >
>> > I have a small reproducer for the asm goto issue we discussed over irc. I have
>> > axed away all code that is not the problematic function by hand. Hopefully this
>> > won't change the overall optimization context too much.
>>
>> If I cut the compilation pipeline off at after "middle-end"
>> optimizations, I can see that `this_cpu_list_pop` returns `NULL` for
>> test.c while in test-asm.c, it's a phi node of either NULL or head
>> (which looks correct; for the test-asm.c case).
>>
>> "GVNPass" seems to be converting
>>
>> return:                                           ; preds =
>> %cleanup.thread19, %cleanup.thread
>>   %retval.118 = phi %struct.percpu_list_node* [ %9, %cleanup.thread ],
>> [ null, %cleanup.thread19 ]
>>   ret %struct.percpu_list_node* %retval.118
>>
>> into:
>>
>> return:                                           ; preds =
>> %cleanup.thread19, %cleanup.thread
>>   ret %struct.percpu_list_node* null
>>
>> Which drops the
>>   %9 = load %struct.percpu_list_node*, %struct.percpu_list_node**
>> %head, align 8, !tbaa !9
> 
> Ah, Eli pointed out that I'm chasing the wrong squirrel. From test.c we have:
> 
> struct percpu_list_node *head;
> load = (intptr_t *)&head;
> ret = rseq_cmpnev_storeoffp_load(targetptr, expectnot,
>     offset, load, cpu);
> 
> then in rseq_cmpnev_storeoffp_load():
> __asm__ __volatile__ goto (
>  :
>  :
> [load] "m" (*load));
> 
> Did we ever initialize head?  Isn't this equivalent to:
> 
> int* foo (void) {
>  int *head;
>  int **load;
>  load = &head;
>  asm(""::"m"(*load):"memory");
>  return head;
> }
> 
> Isn't that equivalent to:
> 
> int* foo (void) {
>  int *head;
>  asm(""::"m"(head):"memory");
>  return head;
> }
> 
> For which neither compiler warns for -Wuninitialized? They would if
> head was an argument to a function call...

This is a situation where I had to work around the fact that older
gcc did not support output operands. I expected that having both
a "m" input operand and a "memory" clobber would allow the asm
to write to the memory location pointed to by the "m" input operand,
and the "memory" clobber was expected to prevent the compiler
from presuming anything about memory state across the asm.

But I suspect I was expecting too much here. What would be the
best course of action to fix these asm statements ? Do I have
any way to do this without requiring a dependency on gcc/clang
that support asm goto with output operands ?

Thanks,

Mathieu


> 
>>
>> I think GVN may be eliminating a phi that it should not be.
>> Specifically, it seems that GVN thinks the default/fallthrough
>> successor is unreachable, which seems incorrect to me. The predecessor
>> lists all look correct to me, so GVN may be stumbling on callbr
>> itself.
>>
>> $ clang -O2 test.c -emit-llvm -S -o - -Xclang -disable-llvm-passes > test.ll
>> $ opt -O2 -print-before=gvn -print-after=gvn
>> -filter-print-funcs=this_cpu_list_pop test.ll -o /dev/null 2>
>> test.gvn.ll
>>
>> Then using this reproducer, I was able to narrow it down a little further:
>>
>> $ opt -O2 -print-before=gvn -print-module-scope
>> --filter-print-funcs=this_cpu_list_pop test.ll -S 2>test.pre-gvn.ll
>> $ llvm-extract --func=this_cpu_list_pop test.pre-gvn.ll -o
>> test.pre-gvn.this_cpu_list_pop.bc --recursive
>> $ llvm-dis test.pre-gvn.this_cpu_list_pop.bc
>> $ cat repro.sh
>> #!/usr/bin/env sh
>> set -e
>> grep -q "%retval.118 = phi %struct.percpu_list_node\* \[ %9,
>> %cleanup.thread \], \[ null, %cleanup.thread19 \]" $1
>> grep -q "ret %struct.percpu_list_node\* %retval.118" $1
>> OUT=$(opt -passes=gvn $1 -S -o -)
>> echo $OUT | grep -q "ret %struct.percpu_list_node\* null"
>> $ llvm-reduce --test=./repro.sh test.pre-gvn.this_cpu_list_pop.ll
>>
>> Then pared down further manually, see reduced.ll
>>
>> I've filed https://github.com/llvm/llvm-project/issues/52735, let's
>> discuss more there?
>>
>> >
>> > on x86-64:
>> >
>> > Buggy code:
>> > clang-13 -O2 -o test.o test.c
>> >
>> > OK code (added a dummy asm("")):
>> > clang-13 -O2 -o test-asm.o test-asm.c
>> >
>> > Thanks,
>> >
>> > Mathieu
>> >
>> > --
>> > Mathieu Desnoyers
>> > EfficiOS Inc.
>> > http://www.efficios.com
>>
>>
>>
>> --
>> Thanks,
>> ~Nick Desaulniers
> 
> 
> 
> --
> Thanks,
> ~Nick Desaulniers

-- 
Mathieu Desnoyers
EfficiOS Inc.
http://www.efficios.com

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: clang asm goto issue small reproducer
  2021-12-16  0:25     ` Nick Desaulniers
@ 2021-12-16  0:29       ` Mathieu Desnoyers
  2021-12-16 16:39       ` Mathieu Desnoyers
  1 sibling, 0 replies; 6+ messages in thread
From: Mathieu Desnoyers @ 2021-12-16  0:29 UTC (permalink / raw)
  To: ndesaulniers
  Cc: Peter Zijlstra, llvm, Bill Wendling, James Y Knight, Eli Friedman

----- On Dec 15, 2021, at 7:25 PM, ndesaulniers ndesaulniers@google.com wrote:

> On Wed, Dec 15, 2021 at 4:18 PM Nick Desaulniers
> <ndesaulniers@google.com> wrote:
>>
>> On Wed, Dec 15, 2021 at 3:50 PM Nick Desaulniers
>> <ndesaulniers@google.com> wrote:
>> >
>> > On Wed, Dec 15, 2021 at 2:06 PM Mathieu Desnoyers
>> > <mathieu.desnoyers@efficios.com> wrote:
>> > >
>> > > Hi Nick,
>> > >
>> > > I have a small reproducer for the asm goto issue we discussed over irc. I have
>> > > axed away all code that is not the problematic function by hand. Hopefully this
>> > > won't change the overall optimization context too much.
>> >
>> > If I cut the compilation pipeline off at after "middle-end"
>> > optimizations, I can see that `this_cpu_list_pop` returns `NULL` for
>> > test.c while in test-asm.c, it's a phi node of either NULL or head
>> > (which looks correct; for the test-asm.c case).
>> >
>> > "GVNPass" seems to be converting
>> >
>> > return:                                           ; preds =
>> > %cleanup.thread19, %cleanup.thread
>> >   %retval.118 = phi %struct.percpu_list_node* [ %9, %cleanup.thread ],
>> > [ null, %cleanup.thread19 ]
>> >   ret %struct.percpu_list_node* %retval.118
>> >
>> > into:
>> >
>> > return:                                           ; preds =
>> > %cleanup.thread19, %cleanup.thread
>> >   ret %struct.percpu_list_node* null
>> >
>> > Which drops the
>> >   %9 = load %struct.percpu_list_node*, %struct.percpu_list_node**
>> > %head, align 8, !tbaa !9
>>
>> Ah, Eli pointed out that I'm chasing the wrong squirrel. From test.c we have:
>>
>> struct percpu_list_node *head;
>> load = (intptr_t *)&head;
>> ret = rseq_cmpnev_storeoffp_load(targetptr, expectnot,
>>      offset, load, cpu);
>>
>> then in rseq_cmpnev_storeoffp_load():
>> __asm__ __volatile__ goto (
>>   :
>>   :
>> [load] "m" (*load));
>>
>> Did we ever initialize head?  Isn't this equivalent to:
>>
>> int* foo (void) {
>>   int *head;
>>   int **load;
>>   load = &head;
>>   asm(""::"m"(*load):"memory");
>>   return head;
>> }
>>
>> Isn't that equivalent to:
>>
>> int* foo (void) {
>>   int *head;
>>   asm(""::"m"(head):"memory");
>>   return head;
>> }
>>
>> For which neither compiler warns for -Wuninitialized? They would if
>> head was an argument to a function call...
> 
> What's going on here...
> 
> $ cat y.c
> void x (void) {
>  int y;
>  asm(""::"r"(y));
> }
> void z (void) {
>  int y;
>  asm(""::"m"(y));
> }
> $ clang -Wuninitialized y.c -c
> y.c:3:15: warning: variable 'y' is uninitialized when used here
> [-Wuninitialized]
>  asm(""::"r"(y));
>              ^
> y.c:2:8: note: initialize the variable 'y' to silence this warning
>  int y;
>       ^
>        = 0
> 1 warning generated.
> $ gcc -Wuninitialized y.c -c
> y.c: In function ‘x’:
> y.c:3:3: warning: ‘y’ is used uninitialized [-Wuninitialized]
>    3 |   asm(""::"r"(y));
>      |   ^~~
> 
> Only 1 warning each? I expected 2.
> 
> Is there something special about "m" constraints and -Wuninitialized?

In my work-arounds for the lack of output operands with asm goto,
I've used either of those tricks across various architectures:

- "m" (y) input operand with "memory" clobber, or
- "r" (&y) input operand with "memory" clobber

to achieve writing to a memory area from assembler without having
output operands available.

It may be stretching the definitions, but those were my (perhaps
ill advised) expectations.

Thanks,

Mathieu

> 
> --
> Thanks,
> ~Nick Desaulniers

-- 
Mathieu Desnoyers
EfficiOS Inc.
http://www.efficios.com

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: clang asm goto issue small reproducer
  2021-12-16  0:25     ` Nick Desaulniers
  2021-12-16  0:29       ` Mathieu Desnoyers
@ 2021-12-16 16:39       ` Mathieu Desnoyers
  1 sibling, 0 replies; 6+ messages in thread
From: Mathieu Desnoyers @ 2021-12-16 16:39 UTC (permalink / raw)
  To: ndesaulniers
  Cc: Peter Zijlstra, llvm, Bill Wendling, James Y Knight, Eli Friedman

[-- Attachment #1: Type: text/plain, Size: 3628 bytes --]

Hi Nick,

Here is the result of:

opt-13 -passes='gvn,gvn-hoist,gvn-sink,print<memoryssa>' -filter-print-funcs=this_cpu_list_pop test-asm.ll -o /dev/null 2> test-asm.gvn.ll
opt-13 -passes='gvn,gvn-hoist,gvn-sink,print<memoryssa>' -filter-print-funcs=this_cpu_list_pop test.ll -o /dev/null 2> test.gvn.ll

I notice a missing MemoryPhi in the case without the extra asm("").

Thanks,

Mathieu

----- On Dec 15, 2021, at 7:25 PM, ndesaulniers ndesaulniers@google.com wrote:

> On Wed, Dec 15, 2021 at 4:18 PM Nick Desaulniers
> <ndesaulniers@google.com> wrote:
>>
>> On Wed, Dec 15, 2021 at 3:50 PM Nick Desaulniers
>> <ndesaulniers@google.com> wrote:
>> >
>> > On Wed, Dec 15, 2021 at 2:06 PM Mathieu Desnoyers
>> > <mathieu.desnoyers@efficios.com> wrote:
>> > >
>> > > Hi Nick,
>> > >
>> > > I have a small reproducer for the asm goto issue we discussed over irc. I have
>> > > axed away all code that is not the problematic function by hand. Hopefully this
>> > > won't change the overall optimization context too much.
>> >
>> > If I cut the compilation pipeline off at after "middle-end"
>> > optimizations, I can see that `this_cpu_list_pop` returns `NULL` for
>> > test.c while in test-asm.c, it's a phi node of either NULL or head
>> > (which looks correct; for the test-asm.c case).
>> >
>> > "GVNPass" seems to be converting
>> >
>> > return:                                           ; preds =
>> > %cleanup.thread19, %cleanup.thread
>> >   %retval.118 = phi %struct.percpu_list_node* [ %9, %cleanup.thread ],
>> > [ null, %cleanup.thread19 ]
>> >   ret %struct.percpu_list_node* %retval.118
>> >
>> > into:
>> >
>> > return:                                           ; preds =
>> > %cleanup.thread19, %cleanup.thread
>> >   ret %struct.percpu_list_node* null
>> >
>> > Which drops the
>> >   %9 = load %struct.percpu_list_node*, %struct.percpu_list_node**
>> > %head, align 8, !tbaa !9
>>
>> Ah, Eli pointed out that I'm chasing the wrong squirrel. From test.c we have:
>>
>> struct percpu_list_node *head;
>> load = (intptr_t *)&head;
>> ret = rseq_cmpnev_storeoffp_load(targetptr, expectnot,
>>      offset, load, cpu);
>>
>> then in rseq_cmpnev_storeoffp_load():
>> __asm__ __volatile__ goto (
>>   :
>>   :
>> [load] "m" (*load));
>>
>> Did we ever initialize head?  Isn't this equivalent to:
>>
>> int* foo (void) {
>>   int *head;
>>   int **load;
>>   load = &head;
>>   asm(""::"m"(*load):"memory");
>>   return head;
>> }
>>
>> Isn't that equivalent to:
>>
>> int* foo (void) {
>>   int *head;
>>   asm(""::"m"(head):"memory");
>>   return head;
>> }
>>
>> For which neither compiler warns for -Wuninitialized? They would if
>> head was an argument to a function call...
> 
> What's going on here...
> 
> $ cat y.c
> void x (void) {
>  int y;
>  asm(""::"r"(y));
> }
> void z (void) {
>  int y;
>  asm(""::"m"(y));
> }
> $ clang -Wuninitialized y.c -c
> y.c:3:15: warning: variable 'y' is uninitialized when used here
> [-Wuninitialized]
>  asm(""::"r"(y));
>              ^
> y.c:2:8: note: initialize the variable 'y' to silence this warning
>  int y;
>       ^
>        = 0
> 1 warning generated.
> $ gcc -Wuninitialized y.c -c
> y.c: In function ‘x’:
> y.c:3:3: warning: ‘y’ is used uninitialized [-Wuninitialized]
>    3 |   asm(""::"r"(y));
>      |   ^~~
> 
> Only 1 warning each? I expected 2.
> 
> Is there something special about "m" constraints and -Wuninitialized?
> 
> --
> Thanks,
> ~Nick Desaulniers

-- 
Mathieu Desnoyers
EfficiOS Inc.
http://www.efficios.com

[-- Attachment #2: test-asm.gvn.ll --]
[-- Type: application/octet-stream, Size: 8960 bytes --]

MemorySSA for function: this_cpu_list_pop
; Function Attrs: nounwind uwtable
define dso_local %struct.percpu_list_node* @this_cpu_list_pop(%struct.percpu_list* %0, i32* %1) #0 {
  %3 = alloca %struct.percpu_list_node*, align 8
  %4 = alloca %struct.percpu_list*, align 8
  %5 = alloca i32*, align 8
  %6 = alloca %struct.percpu_list_node*, align 8
  %7 = alloca i64*, align 8
  %8 = alloca i64, align 8
  %9 = alloca i64*, align 8
  %10 = alloca i64, align 8
  %11 = alloca i32, align 4
  %12 = alloca i32, align 4
  %13 = alloca i32, align 4
; 1 = MemoryDef(liveOnEntry)
  store %struct.percpu_list* %0, %struct.percpu_list** %4, align 8, !tbaa !3
; 2 = MemoryDef(1)
  store i32* %1, i32** %5, align 8, !tbaa !3
  br label %14

14:                                               ; preds = %52, %2
; 31 = MemoryPhi({%2,2},{%52,28})
  %15 = phi %struct.percpu_list_node* [ %49, %52 ], [ undef, %2 ]
  %16 = phi i32* [ %51, %52 ], [ %1, %2 ]
  %17 = bitcast %struct.percpu_list_node** %6 to i8*
; 3 = MemoryDef(31)
  call void @llvm.lifetime.start.p0i8(i64 8, i8* %17) #5
  %18 = bitcast i64** %7 to i8*
; 4 = MemoryDef(3)
  call void @llvm.lifetime.start.p0i8(i64 8, i8* %18) #5
  %19 = bitcast i64* %8 to i8*
; 5 = MemoryDef(4)
  call void @llvm.lifetime.start.p0i8(i64 8, i8* %19) #5
  %20 = bitcast i64** %9 to i8*
; 6 = MemoryDef(5)
  call void @llvm.lifetime.start.p0i8(i64 8, i8* %20) #5
  %21 = bitcast i64* %10 to i8*
; 7 = MemoryDef(6)
  call void @llvm.lifetime.start.p0i8(i64 8, i8* %21) #5
  %22 = bitcast i32* %11 to i8*
; 8 = MemoryDef(7)
  call void @llvm.lifetime.start.p0i8(i64 4, i8* %22) #5
  %23 = bitcast i32* %12 to i8*
; 9 = MemoryDef(8)
  call void @llvm.lifetime.start.p0i8(i64 4, i8* %23) #5
; 10 = MemoryDef(9)
  %24 = call i32 @rseq_cpu_start()
; 11 = MemoryDef(10)
  store i32 %24, i32* %12, align 4, !tbaa !7
  %25 = getelementptr inbounds %struct.percpu_list, %struct.percpu_list* %0, i32 0, i32 0
  %26 = sext i32 %24 to i64
  %27 = getelementptr inbounds [1024 x %struct.percpu_list_entry], [1024 x %struct.percpu_list_entry]* %25, i64 0, i64 %26
  %28 = getelementptr inbounds %struct.percpu_list_entry, %struct.percpu_list_entry* %27, i32 0, i32 0
  %29 = bitcast %struct.percpu_list_node** %28 to i64*
; 12 = MemoryDef(11)
  store i64* %29, i64** %7, align 8, !tbaa !3
; 13 = MemoryDef(12)
  store i64 0, i64* %8, align 8, !tbaa !9
; 14 = MemoryDef(13)
  store i64 8, i64* %10, align 8, !tbaa !9
  %30 = bitcast %struct.percpu_list_node** %6 to i64*
; 15 = MemoryDef(14)
  store i64* %30, i64** %9, align 8, !tbaa !3
; 16 = MemoryDef(15)
  %31 = call i32 @rseq_cmpnev_storeoffp_load(i64* %29, i64 0, i64 8, i64* %30, i32 %24)
; 17 = MemoryDef(16)
  store i32 %31, i32* %11, align 4, !tbaa !7
  %32 = icmp ne i32 %31, 0
  %33 = xor i1 %32, true
  %34 = zext i1 %33 to i32
  %35 = sext i32 %34 to i64
  %36 = call i64 @llvm.expect.i64(i64 %35, i64 1)
  %37 = icmp ne i64 %36, 0
  br i1 %37, label %38, label %44

38:                                               ; preds = %14
  %39 = icmp ne i32* %16, null
  br i1 %39, label %40, label %41

40:                                               ; preds = %38
; 18 = MemoryDef(17)
  store i32 %24, i32* %1, align 4, !tbaa !7
  br label %41

41:                                               ; preds = %40, %38
; 30 = MemoryPhi({%38,17},{%40,18})
  %42 = phi i32* [ %1, %40 ], [ null, %38 ]
; MemoryUse(16) MayAlias
  %43 = load %struct.percpu_list_node*, %struct.percpu_list_node** %6, align 8, !tbaa !3
; 19 = MemoryDef(30)
  store %struct.percpu_list_node* %43, %struct.percpu_list_node** %3, align 8
  br label %48

44:                                               ; preds = %14
  %45 = icmp sgt i32 %31, 0
  br i1 %45, label %46, label %47

46:                                               ; preds = %44
; 20 = MemoryDef(17)
  store %struct.percpu_list_node* null, %struct.percpu_list_node** %3, align 8
  br label %48

47:                                               ; preds = %44
  br label %48

48:                                               ; preds = %47, %46, %41
; 29 = MemoryPhi({%41,19},{%46,20},{%47,17})
  %.sink = phi i32 [ 1, %41 ], [ 1, %46 ], [ 0, %47 ]
  %49 = phi %struct.percpu_list_node* [ %15, %47 ], [ null, %46 ], [ %43, %41 ]
  %50 = phi i32 [ 0, %47 ], [ 1, %46 ], [ 1, %41 ]
  %51 = phi i32* [ %16, %47 ], [ %16, %46 ], [ %42, %41 ]
; 21 = MemoryDef(29)
  store i32 %.sink, i32* %13, align 4
; 22 = MemoryDef(21)
  call void @llvm.lifetime.end.p0i8(i64 4, i8* %23) #5
; 23 = MemoryDef(22)
  call void @llvm.lifetime.end.p0i8(i64 4, i8* %22) #5
; 24 = MemoryDef(23)
  call void @llvm.lifetime.end.p0i8(i64 8, i8* %21) #5
; 25 = MemoryDef(24)
  call void @llvm.lifetime.end.p0i8(i64 8, i8* %20) #5
; 26 = MemoryDef(25)
  call void @llvm.lifetime.end.p0i8(i64 8, i8* %19) #5
; 27 = MemoryDef(26)
  call void @llvm.lifetime.end.p0i8(i64 8, i8* %18) #5
; 28 = MemoryDef(27)
  call void @llvm.lifetime.end.p0i8(i64 8, i8* %17) #5
  switch i32 %50, label %54 [
    i32 0, label %52
    i32 1, label %53
  ]

52:                                               ; preds = %48
  br label %14

53:                                               ; preds = %48
  ret %struct.percpu_list_node* %49

54:                                               ; preds = %48
  unreachable
}
MemorySSA for function: rseq_cpu_start
; Function Attrs: inlinehint nounwind uwtable
define internal i32 @rseq_cpu_start() #2 {
; 1 = MemoryDef(liveOnEntry)
  %1 = call %struct.rseq* @rseq_get_abi()
  %2 = getelementptr inbounds %struct.rseq, %struct.rseq* %1, i32 0, i32 0
; 2 = MemoryDef(1)
  %3 = load volatile i32, i32* %2, align 32, !tbaa !3
  ret i32 %3
}
MemorySSA for function: rseq_cmpnev_storeoffp_load
; Function Attrs: alwaysinline nounwind uwtable
define internal i32 @rseq_cmpnev_storeoffp_load(i64* %0, i64 %1, i64 %2, i64* %3, i32 %4) #3 {
  %6 = alloca i32, align 4
  %7 = alloca i64*, align 8
  %8 = alloca i64, align 8
  %9 = alloca i64, align 8
  %10 = alloca i64*, align 8
  %11 = alloca i32, align 4
; 1 = MemoryDef(liveOnEntry)
  store i64* %0, i64** %7, align 8, !tbaa !3
; 2 = MemoryDef(1)
  store i64 %1, i64* %8, align 8, !tbaa !7
; 3 = MemoryDef(2)
  store i64 %2, i64* %9, align 8, !tbaa !7
; 4 = MemoryDef(3)
  store i64* %3, i64** %10, align 8, !tbaa !3
; 5 = MemoryDef(4)
  store i32 %4, i32* %11, align 4, !tbaa !9
; 6 = MemoryDef(5)
  %12 = call %struct.rseq* @rseq_get_abi()
; 7 = MemoryDef(6)
  callbr void asm sideeffect ".pushsection __rseq_cs, \22aw\22\0A\09.balign 32\0A\093:\0A\09.long 0x0, 0x0\0A\09.quad 1f, (2f - 1f), 4f\0A\09.popsection\0A\09.pushsection __rseq_cs_ptr_array, \22aw\22\0A\09.quad 3b\0A\09.popsection\0A\09.pushsection __rseq_exit_point_array, \22aw\22\0A\09.quad 1f, ${7:l}\0A\09.popsection\0A\09leaq 3b(%rip), %rax\0A\09movq %rax, 8($1)\0A\091:\0A\09cmpl $0, 4($1)\0A\09jnz 4f\0A\09movq $2, %rbx\0A\09cmpq %rbx, $3\0A\09je ${7:l}\0A\09movq %rbx, $5\0A\09addq $4, %rbx\0A\09movq (%rbx), %rbx\0A\09ud2\0A\09movq %rbx, $2\0A\092:\0A\09.pushsection __rseq_failure, \22ax\22\0A\09.byte 0x0f, 0xb9, 0x3d\0A\09.long 0x53053053\0A\094:\0A\09jmp ${6:l}\0A\09.popsection\0A\09", "r,r,*m,r,er,*m,X,X,~{memory},~{cc},~{rax},~{rbx},~{dirflag},~{fpsr},~{flags}"(i32 %4, %struct.rseq* %12, i64* %0, i64 %1, i64 %2, i64* %3, i8* blockaddress(@rseq_cmpnev_storeoffp_load, %14), i8* blockaddress(@rseq_cmpnev_storeoffp_load, %15)) #5
          to label %13 [label %14, label %15], !srcloc !11

13:                                               ; preds = %5
; 8 = MemoryDef(7)
  call void asm sideeffect "", "~{dirflag},~{fpsr},~{flags}"() #5, !srcloc !12
  br label %16

14:                                               ; preds = %5
  br label %16

15:                                               ; preds = %5
  br label %16

16:                                               ; preds = %15, %14, %13
; 10 = MemoryPhi({%13,8},{%14,7},{%15,7})
  %.sink = phi i32 [ -1, %14 ], [ 1, %15 ], [ 0, %13 ]
  %17 = phi i32 [ 1, %15 ], [ -1, %14 ], [ 0, %13 ]
; 9 = MemoryDef(10)
  store i32 %.sink, i32* %6, align 4
  ret i32 %17
}
MemorySSA for function: rseq_get_abi
; Function Attrs: inlinehint nounwind uwtable
define internal %struct.rseq* @rseq_get_abi() #2 {
; 1 = MemoryDef(liveOnEntry)
  %1 = call i8* @rseq_thread_pointer()
; MemoryUse(1) MayAlias
  %2 = load i32, i32* @rseq_offset, align 4, !tbaa !3
  %3 = sext i32 %2 to i64
  %4 = getelementptr i8, i8* %1, i64 %3
  %5 = bitcast i8* %4 to %struct.rseq*
  ret %struct.rseq* %5
}
MemorySSA for function: rseq_thread_pointer
; Function Attrs: inlinehint nounwind uwtable
define internal i8* @rseq_thread_pointer() #2 {
  %1 = alloca i8*, align 8
  %2 = bitcast i8** %1 to i8*
; 1 = MemoryDef(liveOnEntry)
  call void @llvm.lifetime.start.p0i8(i64 8, i8* %2) #5
  %3 = call i8* asm "mov %fs:0, $0", "=r,~{dirflag},~{fpsr},~{flags}"() #6
; 2 = MemoryDef(1)
  store i8* %3, i8** %1, align 8, !tbaa !3
; 3 = MemoryDef(2)
  call void @llvm.lifetime.end.p0i8(i64 8, i8* %2) #5
  ret i8* %3
}

[-- Attachment #3: test.gvn.ll --]
[-- Type: application/octet-stream, Size: 8819 bytes --]

MemorySSA for function: this_cpu_list_pop
; Function Attrs: nounwind uwtable
define dso_local %struct.percpu_list_node* @this_cpu_list_pop(%struct.percpu_list* %0, i32* %1) #0 {
  %3 = alloca %struct.percpu_list_node*, align 8
  %4 = alloca %struct.percpu_list*, align 8
  %5 = alloca i32*, align 8
  %6 = alloca %struct.percpu_list_node*, align 8
  %7 = alloca i64*, align 8
  %8 = alloca i64, align 8
  %9 = alloca i64*, align 8
  %10 = alloca i64, align 8
  %11 = alloca i32, align 4
  %12 = alloca i32, align 4
  %13 = alloca i32, align 4
; 1 = MemoryDef(liveOnEntry)
  store %struct.percpu_list* %0, %struct.percpu_list** %4, align 8, !tbaa !3
; 2 = MemoryDef(1)
  store i32* %1, i32** %5, align 8, !tbaa !3
  br label %14

14:                                               ; preds = %52, %2
; 31 = MemoryPhi({%2,2},{%52,28})
  %15 = phi %struct.percpu_list_node* [ %49, %52 ], [ undef, %2 ]
  %16 = phi i32* [ %51, %52 ], [ %1, %2 ]
  %17 = bitcast %struct.percpu_list_node** %6 to i8*
; 3 = MemoryDef(31)
  call void @llvm.lifetime.start.p0i8(i64 8, i8* %17) #5
  %18 = bitcast i64** %7 to i8*
; 4 = MemoryDef(3)
  call void @llvm.lifetime.start.p0i8(i64 8, i8* %18) #5
  %19 = bitcast i64* %8 to i8*
; 5 = MemoryDef(4)
  call void @llvm.lifetime.start.p0i8(i64 8, i8* %19) #5
  %20 = bitcast i64** %9 to i8*
; 6 = MemoryDef(5)
  call void @llvm.lifetime.start.p0i8(i64 8, i8* %20) #5
  %21 = bitcast i64* %10 to i8*
; 7 = MemoryDef(6)
  call void @llvm.lifetime.start.p0i8(i64 8, i8* %21) #5
  %22 = bitcast i32* %11 to i8*
; 8 = MemoryDef(7)
  call void @llvm.lifetime.start.p0i8(i64 4, i8* %22) #5
  %23 = bitcast i32* %12 to i8*
; 9 = MemoryDef(8)
  call void @llvm.lifetime.start.p0i8(i64 4, i8* %23) #5
; 10 = MemoryDef(9)
  %24 = call i32 @rseq_cpu_start()
; 11 = MemoryDef(10)
  store i32 %24, i32* %12, align 4, !tbaa !7
  %25 = getelementptr inbounds %struct.percpu_list, %struct.percpu_list* %0, i32 0, i32 0
  %26 = sext i32 %24 to i64
  %27 = getelementptr inbounds [1024 x %struct.percpu_list_entry], [1024 x %struct.percpu_list_entry]* %25, i64 0, i64 %26
  %28 = getelementptr inbounds %struct.percpu_list_entry, %struct.percpu_list_entry* %27, i32 0, i32 0
  %29 = bitcast %struct.percpu_list_node** %28 to i64*
; 12 = MemoryDef(11)
  store i64* %29, i64** %7, align 8, !tbaa !3
; 13 = MemoryDef(12)
  store i64 0, i64* %8, align 8, !tbaa !9
; 14 = MemoryDef(13)
  store i64 8, i64* %10, align 8, !tbaa !9
  %30 = bitcast %struct.percpu_list_node** %6 to i64*
; 15 = MemoryDef(14)
  store i64* %30, i64** %9, align 8, !tbaa !3
; 16 = MemoryDef(15)
  %31 = call i32 @rseq_cmpnev_storeoffp_load(i64* %29, i64 0, i64 8, i64* %30, i32 %24)
; 17 = MemoryDef(16)
  store i32 %31, i32* %11, align 4, !tbaa !7
  %32 = icmp ne i32 %31, 0
  %33 = xor i1 %32, true
  %34 = zext i1 %33 to i32
  %35 = sext i32 %34 to i64
  %36 = call i64 @llvm.expect.i64(i64 %35, i64 1)
  %37 = icmp ne i64 %36, 0
  br i1 %37, label %38, label %44

38:                                               ; preds = %14
  %39 = icmp ne i32* %16, null
  br i1 %39, label %40, label %41

40:                                               ; preds = %38
; 18 = MemoryDef(17)
  store i32 %24, i32* %1, align 4, !tbaa !7
  br label %41

41:                                               ; preds = %40, %38
; 30 = MemoryPhi({%38,17},{%40,18})
  %42 = phi i32* [ %1, %40 ], [ null, %38 ]
; MemoryUse(16) MayAlias
  %43 = load %struct.percpu_list_node*, %struct.percpu_list_node** %6, align 8, !tbaa !3
; 19 = MemoryDef(30)
  store %struct.percpu_list_node* %43, %struct.percpu_list_node** %3, align 8
  br label %48

44:                                               ; preds = %14
  %45 = icmp sgt i32 %31, 0
  br i1 %45, label %46, label %47

46:                                               ; preds = %44
; 20 = MemoryDef(17)
  store %struct.percpu_list_node* null, %struct.percpu_list_node** %3, align 8
  br label %48

47:                                               ; preds = %44
  br label %48

48:                                               ; preds = %47, %46, %41
; 29 = MemoryPhi({%41,19},{%46,20},{%47,17})
  %.sink = phi i32 [ 1, %41 ], [ 1, %46 ], [ 0, %47 ]
  %49 = phi %struct.percpu_list_node* [ %15, %47 ], [ null, %46 ], [ %43, %41 ]
  %50 = phi i32 [ 0, %47 ], [ 1, %46 ], [ 1, %41 ]
  %51 = phi i32* [ %16, %47 ], [ %16, %46 ], [ %42, %41 ]
; 21 = MemoryDef(29)
  store i32 %.sink, i32* %13, align 4
; 22 = MemoryDef(21)
  call void @llvm.lifetime.end.p0i8(i64 4, i8* %23) #5
; 23 = MemoryDef(22)
  call void @llvm.lifetime.end.p0i8(i64 4, i8* %22) #5
; 24 = MemoryDef(23)
  call void @llvm.lifetime.end.p0i8(i64 8, i8* %21) #5
; 25 = MemoryDef(24)
  call void @llvm.lifetime.end.p0i8(i64 8, i8* %20) #5
; 26 = MemoryDef(25)
  call void @llvm.lifetime.end.p0i8(i64 8, i8* %19) #5
; 27 = MemoryDef(26)
  call void @llvm.lifetime.end.p0i8(i64 8, i8* %18) #5
; 28 = MemoryDef(27)
  call void @llvm.lifetime.end.p0i8(i64 8, i8* %17) #5
  switch i32 %50, label %54 [
    i32 0, label %52
    i32 1, label %53
  ]

52:                                               ; preds = %48
  br label %14

53:                                               ; preds = %48
  ret %struct.percpu_list_node* %49

54:                                               ; preds = %48
  unreachable
}
MemorySSA for function: rseq_cpu_start
; Function Attrs: inlinehint nounwind uwtable
define internal i32 @rseq_cpu_start() #2 {
; 1 = MemoryDef(liveOnEntry)
  %1 = call %struct.rseq* @rseq_get_abi()
  %2 = getelementptr inbounds %struct.rseq, %struct.rseq* %1, i32 0, i32 0
; 2 = MemoryDef(1)
  %3 = load volatile i32, i32* %2, align 32, !tbaa !3
  ret i32 %3
}
MemorySSA for function: rseq_cmpnev_storeoffp_load
; Function Attrs: alwaysinline nounwind uwtable
define internal i32 @rseq_cmpnev_storeoffp_load(i64* %0, i64 %1, i64 %2, i64* %3, i32 %4) #3 {
  %6 = alloca i32, align 4
  %7 = alloca i64*, align 8
  %8 = alloca i64, align 8
  %9 = alloca i64, align 8
  %10 = alloca i64*, align 8
  %11 = alloca i32, align 4
; 1 = MemoryDef(liveOnEntry)
  store i64* %0, i64** %7, align 8, !tbaa !3
; 2 = MemoryDef(1)
  store i64 %1, i64* %8, align 8, !tbaa !7
; 3 = MemoryDef(2)
  store i64 %2, i64* %9, align 8, !tbaa !7
; 4 = MemoryDef(3)
  store i64* %3, i64** %10, align 8, !tbaa !3
; 5 = MemoryDef(4)
  store i32 %4, i32* %11, align 4, !tbaa !9
; 6 = MemoryDef(5)
  %12 = call %struct.rseq* @rseq_get_abi()
; 7 = MemoryDef(6)
  callbr void asm sideeffect ".pushsection __rseq_cs, \22aw\22\0A\09.balign 32\0A\093:\0A\09.long 0x0, 0x0\0A\09.quad 1f, (2f - 1f), 4f\0A\09.popsection\0A\09.pushsection __rseq_cs_ptr_array, \22aw\22\0A\09.quad 3b\0A\09.popsection\0A\09.pushsection __rseq_exit_point_array, \22aw\22\0A\09.quad 1f, ${7:l}\0A\09.popsection\0A\09leaq 3b(%rip), %rax\0A\09movq %rax, 8($1)\0A\091:\0A\09cmpl $0, 4($1)\0A\09jnz 4f\0A\09movq $2, %rbx\0A\09cmpq %rbx, $3\0A\09je ${7:l}\0A\09movq %rbx, $5\0A\09addq $4, %rbx\0A\09movq (%rbx), %rbx\0A\09ud2\0A\09movq %rbx, $2\0A\092:\0A\09.pushsection __rseq_failure, \22ax\22\0A\09.byte 0x0f, 0xb9, 0x3d\0A\09.long 0x53053053\0A\094:\0A\09jmp ${6:l}\0A\09.popsection\0A\09", "r,r,*m,r,er,*m,X,X,~{memory},~{cc},~{rax},~{rbx},~{dirflag},~{fpsr},~{flags}"(i32 %4, %struct.rseq* %12, i64* %0, i64 %1, i64 %2, i64* %3, i8* blockaddress(@rseq_cmpnev_storeoffp_load, %14), i8* blockaddress(@rseq_cmpnev_storeoffp_load, %15)) #5
          to label %13 [label %14, label %15], !srcloc !11

13:                                               ; preds = %5
  br label %16

14:                                               ; preds = %5
  br label %16

15:                                               ; preds = %5
  br label %16

16:                                               ; preds = %15, %14, %13
  %.sink = phi i32 [ -1, %14 ], [ 1, %15 ], [ 0, %13 ]
  %17 = phi i32 [ 1, %15 ], [ -1, %14 ], [ 0, %13 ]
; 8 = MemoryDef(7)
  store i32 %.sink, i32* %6, align 4
  ret i32 %17
}
MemorySSA for function: rseq_get_abi
; Function Attrs: inlinehint nounwind uwtable
define internal %struct.rseq* @rseq_get_abi() #2 {
; 1 = MemoryDef(liveOnEntry)
  %1 = call i8* @rseq_thread_pointer()
; MemoryUse(1) MayAlias
  %2 = load i32, i32* @rseq_offset, align 4, !tbaa !3
  %3 = sext i32 %2 to i64
  %4 = getelementptr i8, i8* %1, i64 %3
  %5 = bitcast i8* %4 to %struct.rseq*
  ret %struct.rseq* %5
}
MemorySSA for function: rseq_thread_pointer
; Function Attrs: inlinehint nounwind uwtable
define internal i8* @rseq_thread_pointer() #2 {
  %1 = alloca i8*, align 8
  %2 = bitcast i8** %1 to i8*
; 1 = MemoryDef(liveOnEntry)
  call void @llvm.lifetime.start.p0i8(i64 8, i8* %2) #5
  %3 = call i8* asm "mov %fs:0, $0", "=r,~{dirflag},~{fpsr},~{flags}"() #6
; 2 = MemoryDef(1)
  store i8* %3, i8** %1, align 8, !tbaa !3
; 3 = MemoryDef(2)
  call void @llvm.lifetime.end.p0i8(i64 8, i8* %2) #5
  ret i8* %3
}

^ permalink raw reply	[flat|nested] 6+ messages in thread

end of thread, other threads:[~2021-12-16 16:39 UTC | newest]

Thread overview: 6+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
     [not found] <157245708.33559.1639605989199.JavaMail.zimbra@efficios.com>
2021-12-15 23:50 ` clang asm goto issue small reproducer Nick Desaulniers
2021-12-16  0:18   ` Nick Desaulniers
2021-12-16  0:25     ` Nick Desaulniers
2021-12-16  0:29       ` Mathieu Desnoyers
2021-12-16 16:39       ` Mathieu Desnoyers
2021-12-16  0:26     ` Mathieu Desnoyers

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.