[PATCH 0/4] target/s390x: Improve carry computation

* [PATCH 0/4] target/s390x: Improve carry computation
@ 2020-10-17  2:28 Richard Henderson
  2020-10-17  2:28 ` [PATCH 1/4] target/s390x: Improve cc computation for ADD LOGICAL Richard Henderson
                   ` (3 more replies)
  0 siblings, 4 replies; 11+ messages in thread
From: Richard Henderson @ 2020-10-17  2:28 UTC (permalink / raw)
  To: qemu-devel; +Cc: david

While testing the float128_muladd changes for s390x host,
emulating under x86_64 of course, I noticed that the code
we generate for strings of ALCGR and SLBGR is pretty awful.

I realized that we were missing a trick: the output cc is
based only on the output (result and carry) and so we don't
need to save the inputs.  And once we do that, we can use
the output carry as a direct input to the next insn.

For subtract, computing carry as per the PoO (Principles of
Operation), i.e. a + ~b + c, is less efficient in tcg than
computing borrow.  We can convert between the two simply by
adding or subtracting 1.

As an example from float128_muladd,

0x400003f014:  b90b 0019       slgr     %r1, %r9
0x400003f018:  b989 002a       slbgr    %r2, %r10
0x400003f01c:  b989 0030       slbgr    %r3, %r0

Before:

  -- guest addr 0x000000400003f014
0x7fcbf811a4bc:  4d 8b f5                 movq     %r13, %r14
0x7fcbf811a4bf:  4c 8b 7d 48              movq     0x48(%rbp), %r15
0x7fcbf811a4c3:  4d 2b ef                 subq     %r15, %r13
0x7fcbf811a4c6:  4c 89 6d 08              movq     %r13, 8(%rbp)
  -- guest addr 0x000000400003f018
0x7fcbf811a4ca:  4c 8b d3                 movq     %rbx, %r10
0x7fcbf811a4cd:  4c 8b 5d 50              movq     0x50(%rbp), %r11
0x7fcbf811a4d1:  49 2b db                 subq     %r11, %rbx
0x7fcbf811a4d4:  4d 3b f7                 cmpq     %r15, %r14
0x7fcbf811a4d7:  41 0f 92 c6              setb     %r14b
0x7fcbf811a4db:  45 0f b6 f6              movzbl   %r14b, %r14d
0x7fcbf811a4df:  49 2b de                 subq     %r14, %rbx
0x7fcbf811a4e2:  48 89 5d 10              movq     %rbx, 0x10(%rbp)
0x7fcbf811a4e6:  4c 8b c3                 movq     %rbx, %r8
  -- guest addr 0x000000400003f01c
0x7fcbf811a4e9:  4c 8b 75 18              movq     0x18(%rbp), %r14
0x7fcbf811a4ed:  4d 8b fe                 movq     %r14, %r15
0x7fcbf811a4f0:  4c 8b 4d 00              movq     (%rbp), %r9
0x7fcbf811a4f4:  4d 2b f1                 subq     %r9, %r14
0x7fcbf811a4f7:  48 8b fd                 movq     %rbp, %rdi
0x7fcbf811a4fa:  be 12 00 00 00           movl     $0x12, %esi
0x7fcbf811a4ff:  49 8b d2                 movq     %r10, %rdx
0x7fcbf811a502:  49 8b cb                 movq     %r11, %rcx
0x7fcbf811a505:  ff 15 4d 01 00 00        callq    *0x14d(%rip)
0x7fcbf811a50b:  83 f8 02                 cmpl     $2, %eax
0x7fcbf811a50e:  41 0f 92 c2              setb     %r10b
0x7fcbf811a512:  45 0f b6 d2              movzbl   %r10b, %r10d
0x7fcbf811a516:  45 8b d2                 movl     %r10d, %r10d
0x7fcbf811a519:  4d 2b f2                 subq     %r10, %r14
0x7fcbf811a51c:  4c 89 75 18              movq     %r14, 0x18(%rbp)
0x7fcbf811a520:  48 8b 4d 00              movq     (%rbp), %rcx
0x7fcbf811a524:  4d 8b c6                 movq     %r14, %r8

After:

  -- guest addr 0x000000400003f014
0x7fd1d011a23c:  45 33 f6                 xorl     %r14d, %r14d
0x7fd1d011a23f:  4c 8b 7d 48              movq     0x48(%rbp), %r15
0x7fd1d011a243:  4d 2b ef                 subq     %r15, %r13
0x7fd1d011a246:  4d 1b f6                 sbbq     %r14, %r14
0x7fd1d011a249:  4c 89 6d 08              movq     %r13, 8(%rbp)
  -- guest addr 0x000000400003f018
0x7fd1d011a24d:  49 03 de                 addq     %r14, %rbx
0x7fd1d011a250:  49 83 d6 00              adcq     $0, %r14
0x7fd1d011a254:  4c 8b 7d 50              movq     0x50(%rbp), %r15
0x7fd1d011a258:  49 2b df                 subq     %r15, %rbx
0x7fd1d011a25b:  49 83 de 00              sbbq     $0, %r14
0x7fd1d011a25f:  48 89 5d 10              movq     %rbx, 0x10(%rbp)
  -- guest addr 0x000000400003f01c
0x7fd1d011a263:  4c 8b 7d 18              movq     0x18(%rbp), %r15
0x7fd1d011a267:  4d 03 fe                 addq     %r14, %r15
0x7fd1d011a26a:  49 83 d6 00              adcq     $0, %r14
0x7fd1d011a26e:  4c 8b 55 00              movq     (%rbp), %r10
0x7fd1d011a272:  4d 2b fa                 subq     %r10, %r15
0x7fd1d011a275:  49 83 de 00              sbbq     $0, %r14
0x7fd1d011a279:  4c 89 7d 18              movq     %r15, 0x18(%rbp)

r~

Richard Henderson (4):
  target/s390x: Improve cc computation for ADD LOGICAL
  target/s390x: Improve ADD LOGICAL WITH CARRY
  target/s390x: Improve cc computation for SUBTRACT LOGICAL
  target/s390x: Improve SUB LOGICAL WITH BORROW

 target/s390x/internal.h    |  11 +-
 target/s390x/cc_helper.c   | 123 +++-------------
 target/s390x/helper.c      |  10 +-
 target/s390x/translate.c   | 286 ++++++++++++++++++++-----------------
 target/s390x/insn-data.def |  76 +++++-----
 5 files changed, 213 insertions(+), 293 deletions(-)

-- 
2.25.1

^ permalink raw reply	[flat|nested] 11+ messages in thread