Skip to content

Commit 0e4fcb8

Browse files
Synthesize SSE3/AVX 128-bit horizontal add/sub instructions from
floating-point add/sub of appropriate shuffle vectors. Does not synthesize the 256-bit AVX versions because they work differently. llvm-svn: 140332
1 parent e9a2443 commit 0e4fcb8

File tree

5 files changed

+434
-22
lines changed

5 files changed

+434
-22
lines changed

llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 150 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1137,6 +1137,8 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
11371137
setTargetDAGCombine(ISD::OR);
11381138
setTargetDAGCombine(ISD::AND);
11391139
setTargetDAGCombine(ISD::ADD);
1140+
setTargetDAGCombine(ISD::FADD);
1141+
setTargetDAGCombine(ISD::FSUB);
11401142
setTargetDAGCombine(ISD::SUB);
11411143
setTargetDAGCombine(ISD::LOAD);
11421144
setTargetDAGCombine(ISD::STORE);
@@ -10647,6 +10649,8 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
1064710649
case X86ISD::FMIN: return "X86ISD::FMIN";
1064810650
case X86ISD::FRSQRT: return "X86ISD::FRSQRT";
1064910651
case X86ISD::FRCP: return "X86ISD::FRCP";
10652+
case X86ISD::FHADD: return "X86ISD::FHADD";
10653+
case X86ISD::FHSUB: return "X86ISD::FHSUB";
1065010654
case X86ISD::TLSADDR: return "X86ISD::TLSADDR";
1065110655
case X86ISD::TLSCALL: return "X86ISD::TLSCALL";
1065210656
case X86ISD::EH_RETURN: return "X86ISD::EH_RETURN";
@@ -13738,6 +13742,150 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG,
1373813742
return SDValue();
1373913743
}
1374013744

13745+
/// isHorizontalBinOp - Return 'true' if this vector operation is "horizontal"
/// and return the operands for the horizontal operation in LHS and RHS.  A
/// horizontal operation performs the binary operation on successive elements
/// of its first operand, then on successive elements of its second operand,
/// returning the resulting values in a vector.  For example, if
///   A = < float a0, float a1, float a2, float a3 >
/// and
///   B = < float b0, float b1, float b2, float b3 >
/// then the result of doing a horizontal operation on A and B is
///   A horizontal-op B = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >.
/// In short, LHS and RHS are inspected to see if LHS op RHS is of the form
/// A horizontal-op B, for some already available A and B, and if so then LHS is
/// set to A, RHS to B, and the routine returns 'true'.
/// Note that the binary operation should have the property that if one of the
/// operands is UNDEF then the result is UNDEF.
static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, bool isCommutative) {
  // Look for the following pattern: if
  //   A = < float a0, float a1, float a2, float a3 >
  //   B = < float b0, float b1, float b2, float b3 >
  // and
  //   LHS = VECTOR_SHUFFLE A, B, <0, 2, 4, 6>
  //   RHS = VECTOR_SHUFFLE A, B, <1, 3, 5, 7>
  // then LHS op RHS = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >
  // which is A horizontal-op B.

  // At least one of the operands should be a vector shuffle.
  if (LHS.getOpcode() != ISD::VECTOR_SHUFFLE &&
      RHS.getOpcode() != ISD::VECTOR_SHUFFLE)
    return false;

  EVT VT = LHS.getValueType();
  unsigned N = VT.getVectorNumElements();

  // View LHS in the form
  //   LHS = VECTOR_SHUFFLE A, B, LMask
  // If LHS is not a shuffle then pretend it is the shuffle
  //   LHS = VECTOR_SHUFFLE LHS, undef, <0, 1, ..., N-1>
  // NOTE: in what follows a default initialized SDValue represents an UNDEF of
  // type VT.
  SDValue A, B;
  SmallVector<int, 8> LMask(N);
  if (LHS.getOpcode() == ISD::VECTOR_SHUFFLE) {
    // Only record non-UNDEF shuffle inputs; UNDEF inputs stay as the
    // default-initialized (null) SDValue.
    if (LHS.getOperand(0).getOpcode() != ISD::UNDEF)
      A = LHS.getOperand(0);
    if (LHS.getOperand(1).getOpcode() != ISD::UNDEF)
      B = LHS.getOperand(1);
    cast<ShuffleVectorSDNode>(LHS.getNode())->getMask(LMask);
  } else {
    if (LHS.getOpcode() != ISD::UNDEF)
      A = LHS;
    // Identity mask: element i comes from lane i of A.
    for (unsigned i = 0; i != N; ++i)
      LMask[i] = i;
  }

  // Likewise, view RHS in the form
  //   RHS = VECTOR_SHUFFLE C, D, RMask
  SDValue C, D;
  SmallVector<int, 8> RMask(N);
  if (RHS.getOpcode() == ISD::VECTOR_SHUFFLE) {
    if (RHS.getOperand(0).getOpcode() != ISD::UNDEF)
      C = RHS.getOperand(0);
    if (RHS.getOperand(1).getOpcode() != ISD::UNDEF)
      D = RHS.getOperand(1);
    cast<ShuffleVectorSDNode>(RHS.getNode())->getMask(RMask);
  } else {
    if (RHS.getOpcode() != ISD::UNDEF)
      C = RHS;
    for (unsigned i = 0; i != N; ++i)
      RMask[i] = i;
  }

  // Check that the shuffles are both shuffling the same vectors.  (A null
  // SDValue matching a null SDValue means both inputs were UNDEF.)
  if (!(A == C && B == D) && !(A == D && B == C))
    return false;

  // If everything is UNDEF then bail out: it would be better to fold to UNDEF.
  if (!A.getNode() && !B.getNode())
    return false;

  // If A and B occur in reverse order in RHS, then "swap" them (which means
  // rewriting the mask).  Note that an UNDEF mask entry (-1) converts to a
  // huge unsigned value, falls outside both [0,N) and [N,2N), and is thus
  // left unchanged by this rewrite.
  if (A != C)
    for (unsigned i = 0; i != N; ++i) {
      unsigned Idx = RMask[i];
      if (Idx < N)
        // Lane came from the first input; redirect it to the second.
        RMask[i] += N;
      else if (Idx < 2*N)
        // Lane came from the second input; redirect it to the first.
        RMask[i] -= N;
    }

  // At this point LHS and RHS are equivalent to
  //   LHS = VECTOR_SHUFFLE A, B, LMask
  //   RHS = VECTOR_SHUFFLE A, B, RMask
  // Check that the masks correspond to performing a horizontal operation.
  for (unsigned i = 0; i != N; ++i) {
    unsigned LIdx = LMask[i], RIdx = RMask[i];

    // Ignore any UNDEF components.  An index is UNDEF if it is >= 2*N (the
    // -1 sentinel viewed as unsigned), or if it selects a lane of an input
    // that is itself UNDEF (A null => lanes [0,N) are undef; B null =>
    // lanes [N,2N) are undef).
    if (LIdx >= 2*N || RIdx >= 2*N || (!A.getNode() && (LIdx < N || RIdx < N))
        || (!B.getNode() && (LIdx >= N || RIdx >= N)))
      continue;

    // Check that successive elements are being operated on.  If not, this is
    // not a horizontal operation.  For a commutative operation the operands
    // of each pair may also appear swapped.
    if (!(LIdx == 2*i && RIdx == 2*i + 1) &&
        !(isCommutative && LIdx == 2*i + 1 && RIdx == 2*i))
      return false;
  }

  LHS = A.getNode() ? A : B; // If A is 'UNDEF', use B for it.
  RHS = B.getNode() ? B : A; // If B is 'UNDEF', use A for it.
  return true;
}
13858+
13859+
/// PerformFADDCombine - Do target-specific dag combines on floating point adds.
13860+
static SDValue PerformFADDCombine(SDNode *N, SelectionDAG &DAG,
13861+
const X86Subtarget *Subtarget) {
13862+
EVT VT = N->getValueType(0);
13863+
SDValue LHS = N->getOperand(0);
13864+
SDValue RHS = N->getOperand(1);
13865+
13866+
// Try to synthesize horizontal adds from adds of shuffles.
13867+
if ((Subtarget->hasSSE3() || Subtarget->hasAVX()) &&
13868+
(VT == MVT::v4f32 || VT == MVT::v2f64) &&
13869+
isHorizontalBinOp(LHS, RHS, true))
13870+
return DAG.getNode(X86ISD::FHADD, N->getDebugLoc(), VT, LHS, RHS);
13871+
return SDValue();
13872+
}
13873+
13874+
/// PerformFSUBCombine - Do target-specific dag combines on floating point subs.
13875+
static SDValue PerformFSUBCombine(SDNode *N, SelectionDAG &DAG,
13876+
const X86Subtarget *Subtarget) {
13877+
EVT VT = N->getValueType(0);
13878+
SDValue LHS = N->getOperand(0);
13879+
SDValue RHS = N->getOperand(1);
13880+
13881+
// Try to synthesize horizontal subs from subs of shuffles.
13882+
if ((Subtarget->hasSSE3() || Subtarget->hasAVX()) &&
13883+
(VT == MVT::v4f32 || VT == MVT::v2f64) &&
13884+
isHorizontalBinOp(LHS, RHS, false))
13885+
return DAG.getNode(X86ISD::FHSUB, N->getDebugLoc(), VT, LHS, RHS);
13886+
return SDValue();
13887+
}
13888+
1374113889
/// PerformFORCombine - Do target-specific dag combines on X86ISD::FOR and
1374213890
/// X86ISD::FXOR nodes.
1374313891
static SDValue PerformFORCombine(SDNode *N, SelectionDAG &DAG) {
@@ -13975,6 +14123,8 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
1397514123
case ISD::LOAD: return PerformLOADCombine(N, DAG, Subtarget);
1397614124
case ISD::STORE: return PerformSTORECombine(N, DAG, Subtarget);
1397714125
case ISD::SINT_TO_FP: return PerformSINT_TO_FPCombine(N, DAG, this);
14126+
case ISD::FADD: return PerformFADDCombine(N, DAG, Subtarget);
14127+
case ISD::FSUB: return PerformFSUBCombine(N, DAG, Subtarget);
1397814128
case X86ISD::FXOR:
1397914129
case X86ISD::FOR: return PerformFORCombine(N, DAG);
1398014130
case X86ISD::FAND: return PerformFANDCombine(N, DAG);

llvm/lib/Target/X86/X86ISelLowering.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -178,6 +178,12 @@ namespace llvm {
178178
/// BLEND family of opcodes
179179
BLENDV,
180180

181+
/// FHADD - Floating point horizontal add.
182+
FHADD,
183+
184+
/// FHSUB - Floating point horizontal sub.
185+
FHSUB,
186+
181187
/// FMAX, FMIN - Floating point max and min.
182188
///
183189
FMAX, FMIN,

llvm/lib/Target/X86/X86InstrFragmentsSIMD.td

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,8 @@ def X86frsqrt : SDNode<"X86ISD::FRSQRT", SDTFPUnaryOp>;
3939
def X86frcp : SDNode<"X86ISD::FRCP", SDTFPUnaryOp>;
4040
def X86fsrl : SDNode<"X86ISD::FSRL", SDTX86FPShiftOp>;
4141
def X86fgetsign: SDNode<"X86ISD::FGETSIGNx86",SDTFPToIntOp>;
42+
def X86fhadd : SDNode<"X86ISD::FHADD", SDTFPBinOp>;
43+
def X86fhsub : SDNode<"X86ISD::FHSUB", SDTFPBinOp>;
4244
def X86comi : SDNode<"X86ISD::COMI", SDTX86CmpTest>;
4345
def X86ucomi : SDNode<"X86ISD::UCOMI", SDTX86CmpTest>;
4446
def X86cmpss : SDNode<"X86ISD::FSETCCss", SDTX86Cmpss>;

llvm/lib/Target/X86/X86InstrSSE.td

Lines changed: 82 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -4714,62 +4714,122 @@ let Constraints = "$src1 = $dst", Predicates = [HasSSE3],
47144714

47154715
// Horizontal ops
47164716
multiclass S3D_Int<bits<8> o, string OpcodeStr, ValueType vt, RegisterClass RC,
4717-
X86MemOperand x86memop, Intrinsic IntId, bit Is2Addr = 1> {
4717+
X86MemOperand x86memop, SDNode OpNode, bit Is2Addr = 1> {
47184718
def rr : S3DI<o, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
47194719
!if(Is2Addr,
47204720
!strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
47214721
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
4722-
[(set RC:$dst, (vt (IntId RC:$src1, RC:$src2)))]>;
4722+
[(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))]>;
47234723

47244724
def rm : S3DI<o, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
47254725
!if(Is2Addr,
47264726
!strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
47274727
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
4728-
[(set RC:$dst, (vt (IntId RC:$src1, (memop addr:$src2))))]>;
4728+
[(set RC:$dst, (vt (OpNode RC:$src1, (memop addr:$src2))))]>;
47294729
}
47304730
multiclass S3_Int<bits<8> o, string OpcodeStr, ValueType vt, RegisterClass RC,
4731-
X86MemOperand x86memop, Intrinsic IntId, bit Is2Addr = 1> {
4731+
X86MemOperand x86memop, SDNode OpNode, bit Is2Addr = 1> {
47324732
def rr : S3I<o, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
47334733
!if(Is2Addr,
47344734
!strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
47354735
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
4736-
[(set RC:$dst, (vt (IntId RC:$src1, RC:$src2)))]>;
4736+
[(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))]>;
47374737

47384738
def rm : S3I<o, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
47394739
!if(Is2Addr,
47404740
!strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
47414741
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
4742-
[(set RC:$dst, (vt (IntId RC:$src1, (memop addr:$src2))))]>;
4742+
[(set RC:$dst, (vt (OpNode RC:$src1, (memop addr:$src2))))]>;
47434743
}
47444744

47454745
let Predicates = [HasAVX] in {
47464746
defm VHADDPS : S3D_Int<0x7C, "vhaddps", v4f32, VR128, f128mem,
4747-
int_x86_sse3_hadd_ps, 0>, VEX_4V;
4747+
X86fhadd, 0>, VEX_4V;
47484748
defm VHADDPD : S3_Int <0x7C, "vhaddpd", v2f64, VR128, f128mem,
4749-
int_x86_sse3_hadd_pd, 0>, VEX_4V;
4749+
X86fhadd, 0>, VEX_4V;
47504750
defm VHSUBPS : S3D_Int<0x7D, "vhsubps", v4f32, VR128, f128mem,
4751-
int_x86_sse3_hsub_ps, 0>, VEX_4V;
4751+
X86fhsub, 0>, VEX_4V;
47524752
defm VHSUBPD : S3_Int <0x7D, "vhsubpd", v2f64, VR128, f128mem,
4753-
int_x86_sse3_hsub_pd, 0>, VEX_4V;
4753+
X86fhsub, 0>, VEX_4V;
47544754
defm VHADDPSY : S3D_Int<0x7C, "vhaddps", v8f32, VR256, f256mem,
4755-
int_x86_avx_hadd_ps_256, 0>, VEX_4V;
4755+
X86fhadd, 0>, VEX_4V;
47564756
defm VHADDPDY : S3_Int <0x7C, "vhaddpd", v4f64, VR256, f256mem,
4757-
int_x86_avx_hadd_pd_256, 0>, VEX_4V;
4757+
X86fhadd, 0>, VEX_4V;
47584758
defm VHSUBPSY : S3D_Int<0x7D, "vhsubps", v8f32, VR256, f256mem,
4759-
int_x86_avx_hsub_ps_256, 0>, VEX_4V;
4759+
X86fhsub, 0>, VEX_4V;
47604760
defm VHSUBPDY : S3_Int <0x7D, "vhsubpd", v4f64, VR256, f256mem,
4761-
int_x86_avx_hsub_pd_256, 0>, VEX_4V;
4761+
X86fhsub, 0>, VEX_4V;
4762+
}
4763+
4764+
let Predicates = [HasAVX] in {
4765+
def : Pat<(int_x86_sse3_hadd_ps (v4f32 VR128:$src1), VR128:$src2),
4766+
(VHADDPSrr VR128:$src1, VR128:$src2)>;
4767+
def : Pat<(int_x86_sse3_hadd_ps (v4f32 VR128:$src1), (memop addr:$src2)),
4768+
(VHADDPSrm VR128:$src1, addr:$src2)>;
4769+
4770+
def : Pat<(int_x86_sse3_hadd_pd (v2f64 VR128:$src1), VR128:$src2),
4771+
(VHADDPDrr VR128:$src1, VR128:$src2)>;
4772+
def : Pat<(int_x86_sse3_hadd_pd (v2f64 VR128:$src1), (memop addr:$src2)),
4773+
(VHADDPDrm VR128:$src1, addr:$src2)>;
4774+
4775+
def : Pat<(int_x86_sse3_hsub_ps (v4f32 VR128:$src1), VR128:$src2),
4776+
(VHSUBPSrr VR128:$src1, VR128:$src2)>;
4777+
def : Pat<(int_x86_sse3_hsub_ps (v4f32 VR128:$src1), (memop addr:$src2)),
4778+
(VHSUBPSrm VR128:$src1, addr:$src2)>;
4779+
4780+
def : Pat<(int_x86_sse3_hsub_pd (v2f64 VR128:$src1), VR128:$src2),
4781+
(VHSUBPDrr VR128:$src1, VR128:$src2)>;
4782+
def : Pat<(int_x86_sse3_hsub_pd (v2f64 VR128:$src1), (memop addr:$src2)),
4783+
(VHSUBPDrm VR128:$src1, addr:$src2)>;
4784+
4785+
def : Pat<(int_x86_avx_hadd_ps_256 (v8f32 VR256:$src1), VR256:$src2),
4786+
(VHADDPSYrr VR256:$src1, VR256:$src2)>;
4787+
def : Pat<(int_x86_avx_hadd_ps_256 (v8f32 VR256:$src1), (memop addr:$src2)),
4788+
(VHADDPSYrm VR256:$src1, addr:$src2)>;
4789+
4790+
def : Pat<(int_x86_avx_hadd_pd_256 (v4f64 VR256:$src1), VR256:$src2),
4791+
(VHADDPDYrr VR256:$src1, VR256:$src2)>;
4792+
def : Pat<(int_x86_avx_hadd_pd_256 (v4f64 VR256:$src1), (memop addr:$src2)),
4793+
(VHADDPDYrm VR256:$src1, addr:$src2)>;
4794+
4795+
def : Pat<(int_x86_avx_hsub_ps_256 (v8f32 VR256:$src1), VR256:$src2),
4796+
(VHSUBPSYrr VR256:$src1, VR256:$src2)>;
4797+
def : Pat<(int_x86_avx_hsub_ps_256 (v8f32 VR256:$src1), (memop addr:$src2)),
4798+
(VHSUBPSYrm VR256:$src1, addr:$src2)>;
4799+
4800+
def : Pat<(int_x86_avx_hsub_pd_256 (v4f64 VR256:$src1), VR256:$src2),
4801+
(VHSUBPDYrr VR256:$src1, VR256:$src2)>;
4802+
def : Pat<(int_x86_avx_hsub_pd_256 (v4f64 VR256:$src1), (memop addr:$src2)),
4803+
(VHSUBPDYrm VR256:$src1, addr:$src2)>;
47624804
}
47634805

47644806
let Constraints = "$src1 = $dst" in {
4765-
defm HADDPS : S3D_Int<0x7C, "haddps", v4f32, VR128, f128mem,
4766-
int_x86_sse3_hadd_ps>;
4767-
defm HADDPD : S3_Int<0x7C, "haddpd", v2f64, VR128, f128mem,
4768-
int_x86_sse3_hadd_pd>;
4769-
defm HSUBPS : S3D_Int<0x7D, "hsubps", v4f32, VR128, f128mem,
4770-
int_x86_sse3_hsub_ps>;
4771-
defm HSUBPD : S3_Int<0x7D, "hsubpd", v2f64, VR128, f128mem,
4772-
int_x86_sse3_hsub_pd>;
4807+
defm HADDPS : S3D_Int<0x7C, "haddps", v4f32, VR128, f128mem, X86fhadd>;
4808+
defm HADDPD : S3_Int<0x7C, "haddpd", v2f64, VR128, f128mem, X86fhadd>;
4809+
defm HSUBPS : S3D_Int<0x7D, "hsubps", v4f32, VR128, f128mem, X86fhsub>;
4810+
defm HSUBPD : S3_Int<0x7D, "hsubpd", v2f64, VR128, f128mem, X86fhsub>;
4811+
}
4812+
4813+
let Predicates = [HasSSE3] in {
4814+
def : Pat<(int_x86_sse3_hadd_ps (v4f32 VR128:$src1), VR128:$src2),
4815+
(HADDPSrr VR128:$src1, VR128:$src2)>;
4816+
def : Pat<(int_x86_sse3_hadd_ps (v4f32 VR128:$src1), (memop addr:$src2)),
4817+
(HADDPSrm VR128:$src1, addr:$src2)>;
4818+
4819+
def : Pat<(int_x86_sse3_hadd_pd (v2f64 VR128:$src1), VR128:$src2),
4820+
(HADDPDrr VR128:$src1, VR128:$src2)>;
4821+
def : Pat<(int_x86_sse3_hadd_pd (v2f64 VR128:$src1), (memop addr:$src2)),
4822+
(HADDPDrm VR128:$src1, addr:$src2)>;
4823+
4824+
def : Pat<(int_x86_sse3_hsub_ps (v4f32 VR128:$src1), VR128:$src2),
4825+
(HSUBPSrr VR128:$src1, VR128:$src2)>;
4826+
def : Pat<(int_x86_sse3_hsub_ps (v4f32 VR128:$src1), (memop addr:$src2)),
4827+
(HSUBPSrm VR128:$src1, addr:$src2)>;
4828+
4829+
def : Pat<(int_x86_sse3_hsub_pd (v2f64 VR128:$src1), VR128:$src2),
4830+
(HSUBPDrr VR128:$src1, VR128:$src2)>;
4831+
def : Pat<(int_x86_sse3_hsub_pd (v2f64 VR128:$src1), (memop addr:$src2)),
4832+
(HSUBPDrm VR128:$src1, addr:$src2)>;
47734833
}
47744834

47754835
//===---------------------------------------------------------------------===//

0 commit comments

Comments
 (0)