Commit 8b5d9cb

omern1 authored and jmorse committed
[x86][DAG] Unroll vectorized FREMs that will become libcalls
Currently, two-element vectors produced as the result of a binary op are widened to four-element vectors on x86 by DAGTypeLegalizer::WidenVecRes_BinaryCanTrap. If the op still isn't legal after widening, it is unrolled into 4 scalar ops in SelectionDAG before being converted into a libcall. This way we end up with 4 libcalls (two of them on known-undef elements) instead of the original two libcalls.

This patch modifies DAGTypeLegalizer::WidenVectorResult to ensure that if it is known that a binary op will be turned into a libcall, it is unrolled instead of being widened. This prevents the creation of the extra scalar instructions on known-undef elements and (eventually) libcalls with known-undef parameters, which would otherwise be created when the op gets expanded post-widening.

Differential Revision: https://reviews.llvm.org/D125988
1 parent 1e2b746 commit 8b5d9cb
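
For context, the kind of IR this change affects looks like the sketch below (a hypothetical function, modelled on the frem-libcall.ll test updated in this commit): x86-64 has no FREM instruction, so each scalar f32 frem is expanded to an fmodf libcall, and the previously widened <4 x float> op produced four such calls where two suffice.

define <2 x float> @frem_v2f32(<2 x float> %x, <2 x float> %y) nounwind {
  ; Scalar f32 FREM expands to a libcall on x86-64. Before this patch the
  ; <2 x float> op was widened to <4 x float> and unrolled into four fmodf
  ; calls (two of them on undef elements); it is now unrolled into just two.
  %r = frem <2 x float> %x, %y
  ret <2 x float> %r
}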

File tree

2 files changed: +34 −37 lines

llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp

Lines changed: 30 additions & 17 deletions
@@ -3580,6 +3580,22 @@ void DAGTypeLegalizer::WidenVectorResult(SDNode *N, unsigned ResNo) {
     return;

   SDValue Res = SDValue();
+
+  auto unrollExpandedOp = [&]() {
+    // We're going to widen this vector op to a legal type by padding with undef
+    // elements. If the wide vector op is eventually going to be expanded to
+    // scalar libcalls, then unroll into scalar ops now to avoid unnecessary
+    // libcalls on the undef elements.
+    EVT VT = N->getValueType(0);
+    EVT WideVecVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
+    if (!TLI.isOperationLegalOrCustom(N->getOpcode(), WideVecVT) &&
+        TLI.isOperationExpand(N->getOpcode(), VT.getScalarType())) {
+      Res = DAG.UnrollVectorOp(N, WideVecVT.getVectorNumElements());
+      return true;
+    }
+    return false;
+  };
+
   switch (N->getOpcode()) {
   default:
 #ifndef NDEBUG
@@ -3678,12 +3694,19 @@ void DAGTypeLegalizer::WidenVectorResult(SDNode *N, unsigned ResNo) {
     Res = WidenVecRes_Binary(N);
     break;

+  case ISD::FREM:
+    if (unrollExpandedOp())
+      break;
+    // If the target has custom/legal support for the scalar FP intrinsic ops
+    // (they are probably not destined to become libcalls), then widen those
+    // like any other binary ops.
+    LLVM_FALLTHROUGH;
+
   case ISD::FADD:
   case ISD::FMUL:
   case ISD::FPOW:
   case ISD::FSUB:
   case ISD::FDIV:
-  case ISD::FREM:
   case ISD::SDIV:
   case ISD::UDIV:
   case ISD::SREM:
@@ -3766,23 +3789,13 @@ void DAGTypeLegalizer::WidenVectorResult(SDNode *N, unsigned ResNo) {
   case ISD::FROUNDEVEN:
   case ISD::FSIN:
   case ISD::FSQRT:
-  case ISD::FTRUNC: {
-    // We're going to widen this vector op to a legal type by padding with undef
-    // elements. If the wide vector op is eventually going to be expanded to
-    // scalar libcalls, then unroll into scalar ops now to avoid unnecessary
-    // libcalls on the undef elements.
-    EVT VT = N->getValueType(0);
-    EVT WideVecVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
-    if (!TLI.isOperationLegalOrCustom(N->getOpcode(), WideVecVT) &&
-        TLI.isOperationExpand(N->getOpcode(), VT.getScalarType())) {
-      Res = DAG.UnrollVectorOp(N, WideVecVT.getVectorNumElements());
+  case ISD::FTRUNC:
+    if (unrollExpandedOp())
       break;
-    }
-  }
-  // If the target has custom/legal support for the scalar FP intrinsic ops
-  // (they are probably not destined to become libcalls), then widen those like
-  // any other unary ops.
-  LLVM_FALLTHROUGH;
+    // If the target has custom/legal support for the scalar FP intrinsic ops
+    // (they are probably not destined to become libcalls), then widen those
+    // like any other unary ops.
+    LLVM_FALLTHROUGH;

   case ISD::ABS:
   case ISD::BITREVERSE:

llvm/test/CodeGen/X86/frem-libcall.ll

Lines changed: 4 additions & 20 deletions
@@ -1,49 +1,33 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mtriple=x86_64-linux-gnu < %s | FileCheck %s

-; FIXME: Ensure vectorized FREMs are not widened/unrolled such that they get lowered
+; Ensure vectorized FREMs are not widened/unrolled such that they get lowered
 ; into libcalls on undef elements.

 define float @frem(<2 x float> %a0, <2 x float> %a1, <2 x float> %a2, <2 x float> *%p3) nounwind {
 ; CHECK-LABEL: frem:
 ; CHECK: # %bb.0:
 ; CHECK-NEXT: pushq %rbx
-; CHECK-NEXT: subq $80, %rsp
+; CHECK-NEXT: subq $64, %rsp
 ; CHECK-NEXT: movq %rdi, %rbx
 ; CHECK-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
-; CHECK-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
-; CHECK-NEXT: callq fmodf@PLT
-; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
-; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
-; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; CHECK-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1]
-; CHECK-NEXT: callq fmodf@PLT
-; CHECK-NEXT: unpcklps (%rsp), %xmm0 # 16-byte Folded Reload
-; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
 ; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
-; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
 ; CHECK-NEXT: callq fmodf@PLT
 ; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
 ; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
 ; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
 ; CHECK-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,1,1]
 ; CHECK-NEXT: callq fmodf@PLT
 ; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
 ; CHECK-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; CHECK-NEXT: unpcklpd (%rsp), %xmm1 # 16-byte Folded Reload
-; CHECK-NEXT: # xmm1 = xmm1[0],mem[0]
 ; CHECK-NEXT: divps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
 ; CHECK-NEXT: movaps %xmm1, %xmm0
 ; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[1,1]
 ; CHECK-NEXT: addss %xmm1, %xmm0
 ; CHECK-NEXT: movlps %xmm1, (%rbx)
-; CHECK-NEXT: addq $80, %rsp
+; CHECK-NEXT: addq $64, %rsp
 ; CHECK-NEXT: popq %rbx
 ; CHECK-NEXT: retq
   %frem = frem <2 x float> %a0, %a1
