
Commit e28c8ec

[x86] add and use fast horizontal vector math subtarget feature
This is the planned follow-up to D52997. Here we are reducing horizontal vector math codegen by default. AMD Jaguar (btver2) should see no difference with this patch because it has fast-hops. (If we want to set that bit for other CPUs, let me know.)

The code changes are small, but there are many test diffs. For files that are specifically testing for hops, I added RUNs to distinguish fast/slow, so we can see the consequences side-by-side. For files that are primarily concerned with codegen other than hops, I just updated the CHECK lines to reflect the new default codegen.

To recap the recent horizontal op story:

1. Before rL343727, we were producing hops for all subtargets for a variety of patterns. Hops were likely not optimal for all targets though.
2. The IR improvement in r343727 exposed a hole in the backend hop pattern matching, so we reduced hop codegen for all subtargets. That was bad for Jaguar (PR39195).
3. We restored the hop codegen for all targets with rL344141. Good for Jaguar, but probably bad for other CPUs.
4. This patch allows us to distinguish when we want to produce hops, so everyone can be happy.

I'm not sure if we have the best predicate here, but the intent is to undo the extra hop-iness that was enabled by r344141.

Differential Revision: https://reviews.llvm.org/D53095

llvm-svn: 344361
1 parent 47bab69 commit e28c8ec

15 files changed: +2378 additions, -1220 deletions

llvm/lib/Target/X86/X86.td

Lines changed: 11 additions & 1 deletion
@@ -404,6 +404,15 @@ def FeatureFastBEXTR : SubtargetFeature<"fast-bextr", "HasFastBEXTR", "true",
                                     "Indicates that the BEXTR instruction is implemented as a single uop "
                                     "with good throughput.">;
 
+// Combine vector math operations with shuffles into horizontal math
+// instructions if a CPU implements horizontal operations (introduced with
+// SSE3) with better latency/throughput than the alternative sequence.
+def FeatureFastHorizontalOps
+    : SubtargetFeature<
+        "fast-hops", "HasFastHorizontalOps", "true",
+        "Prefer horizontal vector math instructions (haddp, phsub, etc.) over "
+        "normal vector instructions with shuffles", [FeatureSSE3]>;
+
 // Merge branches using three-way conditional code.
 def FeatureMergeToThreeWayBranch : SubtargetFeature<"merge-to-threeway-branch",
                                     "ThreewayBranchProfitable", "true",
@@ -998,7 +1007,8 @@ def : ProcessorModel<"btver2", BtVer2Model, [
   FeatureLAHFSAHF,
   FeatureFast15ByteNOP,
   FeatureFastBEXTR,
-  FeatureFastPartialYMMorZMMWrite
+  FeatureFastPartialYMMorZMMWrite,
+  FeatureFastHorizontalOps
 ]>;
 
 // Bulldozer

llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 16 additions & 6 deletions
@@ -37031,9 +37031,6 @@ static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, bool IsCommutative) {
     // The low half of the 128-bit result must choose from A.
     // The high half of the 128-bit result must choose from B,
     // unless B is undef. In that case, we are always choosing from A.
-    // TODO: Using a horizontal op on a single input is likely worse for
-    // performance on many CPUs, so this should be limited here or reversed
-    // in a later pass.
     unsigned NumEltsPer64BitChunk = NumEltsPer128BitChunk / 2;
     unsigned Src = B.getNode() ? i >= NumEltsPer64BitChunk : 0;
 
@@ -37051,6 +37048,16 @@ static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, bool IsCommutative) {
   return true;
 }
 
+/// Horizontal vector math instructions may be slower than normal math with
+/// shuffles. Limit horizontal op codegen based on size/speed trade-offs, uarch
+/// implementation, and likely shuffle complexity of the alternate sequence.
+static bool shouldCombineToHorizontalOp(bool IsSingleSource, SelectionDAG &DAG,
+                                        const X86Subtarget &Subtarget) {
+  bool IsOptimizingSize = DAG.getMachineFunction().getFunction().optForSize();
+  bool HasFastHOps = Subtarget.hasFastHorizontalOps();
+  return !IsSingleSource || IsOptimizingSize || HasFastHOps;
+}
+
 /// Do target-specific dag combines on floating-point adds/subs.
 static SDValue combineFaddFsub(SDNode *N, SelectionDAG &DAG,
                                const X86Subtarget &Subtarget) {
@@ -37063,7 +37070,8 @@ static SDValue combineFaddFsub(SDNode *N, SelectionDAG &DAG,
   // Try to synthesize horizontal add/sub from adds/subs of shuffles.
   if (((Subtarget.hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||
        (Subtarget.hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64))) &&
-      isHorizontalBinOp(LHS, RHS, IsFadd)) {
+      isHorizontalBinOp(LHS, RHS, IsFadd) &&
+      shouldCombineToHorizontalOp(LHS == RHS, DAG, Subtarget)) {
     auto NewOpcode = IsFadd ? X86ISD::FHADD : X86ISD::FHSUB;
     return DAG.getNode(NewOpcode, SDLoc(N), VT, LHS, RHS);
   }
@@ -39787,7 +39795,8 @@ static SDValue combineAdd(SDNode *N, SelectionDAG &DAG,
   // Try to synthesize horizontal adds from adds of shuffles.
   if ((VT == MVT::v8i16 || VT == MVT::v4i32 || VT == MVT::v16i16 ||
        VT == MVT::v8i32) &&
-      Subtarget.hasSSSE3() && isHorizontalBinOp(Op0, Op1, true)) {
+      Subtarget.hasSSSE3() && isHorizontalBinOp(Op0, Op1, true) &&
+      shouldCombineToHorizontalOp(Op0 == Op1, DAG, Subtarget)) {
     auto HADDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
                           ArrayRef<SDValue> Ops) {
       return DAG.getNode(X86ISD::HADD, DL, Ops[0].getValueType(), Ops);
@@ -39918,7 +39927,8 @@ static SDValue combineSub(SDNode *N, SelectionDAG &DAG,
   EVT VT = N->getValueType(0);
   if ((VT == MVT::v8i16 || VT == MVT::v4i32 || VT == MVT::v16i16 ||
        VT == MVT::v8i32) &&
-      Subtarget.hasSSSE3() && isHorizontalBinOp(Op0, Op1, false)) {
+      Subtarget.hasSSSE3() && isHorizontalBinOp(Op0, Op1, false) &&
+      shouldCombineToHorizontalOp(Op0 == Op1, DAG, Subtarget)) {
     auto HSUBBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
                           ArrayRef<SDValue> Ops) {
       return DAG.getNode(X86ISD::HSUB, DL, Ops[0].getValueType(), Ops);
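
To make the new policy easy to eyeball, here is a minimal standalone sketch of the gating rule from the shouldCombineToHorizontalOp change above. It is only an illustration, not LLVM code: the plain bool parameters and the little driver stand in for the real SelectionDAG/X86Subtarget queries.

#include <cstdio>

// Sketch of the gating rule: a horizontal op is still formed when the pattern
// draws from two distinct sources, when the function is optimizing for size,
// or when the subtarget advertises fast-hops. A single-source hop on a
// slow-hop CPU now falls back to the shuffle + vertical-add sequence.
static bool shouldCombineToHorizontalOp(bool IsSingleSource,
                                        bool IsOptimizingSize,
                                        bool HasFastHOps) {
  return !IsSingleSource || IsOptimizingSize || HasFastHOps;
}

int main() {
  // Walk the single-source decision table, which is the case this patch changes.
  for (bool OptSize : {false, true})
    for (bool FastHOps : {false, true})
      std::printf("single-source, optsize=%d, fast-hops=%d -> %s\n",
                  OptSize, FastHOps,
                  shouldCombineToHorizontalOp(/*IsSingleSource=*/true, OptSize,
                                              FastHOps)
                      ? "hop"
                      : "shuffle + add");
  return 0;
}

With both flags false the predicate returns false, which corresponds to the X32-SLOW/X64-SLOW output for phaddd3 in the avx2-phaddsub.ll diff below; adding fast-hops (or optimizing for size) flips it back to the single vphaddd.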

llvm/lib/Target/X86/X86Subtarget.h

Lines changed: 4 additions & 0 deletions
@@ -388,6 +388,9 @@ class X86Subtarget final : public X86GenSubtargetInfo {
   /// Processor has a single uop BEXTR implementation.
   bool HasFastBEXTR = false;
 
+  /// Try harder to combine to horizontal vector ops if they are fast.
+  bool HasFastHorizontalOps = false;
+
   /// Use a retpoline thunk rather than indirect calls to block speculative
   /// execution.
   bool UseRetpolineIndirectCalls = false;
@@ -636,6 +639,7 @@ class X86Subtarget final : public X86GenSubtargetInfo {
   bool hasFastLZCNT() const { return HasFastLZCNT; }
   bool hasFastSHLDRotate() const { return HasFastSHLDRotate; }
   bool hasFastBEXTR() const { return HasFastBEXTR; }
+  bool hasFastHorizontalOps() const { return HasFastHorizontalOps; }
   bool hasMacroFusion() const { return HasMacroFusion; }
   bool hasERMSB() const { return HasERMSB; }
   bool hasSlowDivide32() const { return HasSlowDivide32; }

llvm/test/CodeGen/X86/avx2-phaddsub.ll

Lines changed: 26 additions & 10 deletions
@@ -1,6 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=X32
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=X64
+; RUN: llc < %s -mtriple=i686-- -mattr=+avx2 | FileCheck %s --check-prefixes=X32,X32-SLOW
+; RUN: llc < %s -mtriple=i686-- -mattr=+avx2,fast-hops | FileCheck %s --check-prefixes=X32,X32-FAST
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2 | FileCheck %s --check-prefixes=X64,X64-SLOW
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,fast-hops | FileCheck %s --check-prefixes=X64,X64-FAST
 
 define <16 x i16> @phaddw1(<16 x i16> %x, <16 x i16> %y) {
 ; X32-LABEL: phaddw1:
@@ -67,15 +69,29 @@ define <8 x i32> @phaddd2(<8 x i32> %x, <8 x i32> %y) {
 }
 
 define <8 x i32> @phaddd3(<8 x i32> %x) {
-; X32-LABEL: phaddd3:
-; X32:       # %bb.0:
-; X32-NEXT:    vphaddd %ymm0, %ymm0, %ymm0
-; X32-NEXT:    retl
+; X32-SLOW-LABEL: phaddd3:
+; X32-SLOW:       # %bb.0:
+; X32-SLOW-NEXT:    vpshufd {{.*#+}} ymm1 = ymm0[0,2,2,3,4,6,6,7]
+; X32-SLOW-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[1,3,2,3,5,7,6,7]
+; X32-SLOW-NEXT:    vpaddd %ymm0, %ymm1, %ymm0
+; X32-SLOW-NEXT:    retl
 ;
-; X64-LABEL: phaddd3:
-; X64:       # %bb.0:
-; X64-NEXT:    vphaddd %ymm0, %ymm0, %ymm0
-; X64-NEXT:    retq
+; X32-FAST-LABEL: phaddd3:
+; X32-FAST:       # %bb.0:
+; X32-FAST-NEXT:    vphaddd %ymm0, %ymm0, %ymm0
+; X32-FAST-NEXT:    retl
+;
+; X64-SLOW-LABEL: phaddd3:
+; X64-SLOW:       # %bb.0:
+; X64-SLOW-NEXT:    vpshufd {{.*#+}} ymm1 = ymm0[0,2,2,3,4,6,6,7]
+; X64-SLOW-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[1,3,2,3,5,7,6,7]
+; X64-SLOW-NEXT:    vpaddd %ymm0, %ymm1, %ymm0
+; X64-SLOW-NEXT:    retq
+;
+; X64-FAST-LABEL: phaddd3:
+; X64-FAST:       # %bb.0:
+; X64-FAST-NEXT:    vphaddd %ymm0, %ymm0, %ymm0
+; X64-FAST-NEXT:    retq
   %a = shufflevector <8 x i32> %x, <8 x i32> undef, <8 x i32> <i32 undef, i32 2, i32 8, i32 10, i32 4, i32 6, i32 undef, i32 14>
   %b = shufflevector <8 x i32> %x, <8 x i32> undef, <8 x i32> <i32 1, i32 3, i32 9, i32 undef, i32 5, i32 7, i32 13, i32 15>
   %r = add <8 x i32> %a, %b

llvm/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll

Lines changed: 14 additions & 7 deletions
@@ -6860,7 +6860,8 @@ define i32 @test_mm512_reduce_add_epi32(<8 x i64> %__W) {
 ; CHECK-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
 ; CHECK-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; CHECK-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
-; CHECK-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
+; CHECK-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; CHECK-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
 ; CHECK-NEXT:    vmovd %xmm0, %eax
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    ret{{[l|q]}}
@@ -6989,7 +6990,8 @@ define i32 @test_mm512_mask_reduce_add_epi32(i16 zeroext %__M, <8 x i64> %__W) {
 ; X86-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
 ; X86-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; X86-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
-; X86-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
+; X86-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
 ; X86-NEXT:    vmovd %xmm0, %eax
 ; X86-NEXT:    vzeroupper
 ; X86-NEXT:    retl
@@ -7004,7 +7006,8 @@ define i32 @test_mm512_mask_reduce_add_epi32(i16 zeroext %__M, <8 x i64> %__W) {
 ; X64-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
 ; X64-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; X64-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
-; X64-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
+; X64-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
 ; X64-NEXT:    vmovd %xmm0, %eax
 ; X64-NEXT:    vzeroupper
 ; X64-NEXT:    retq
@@ -7210,7 +7213,8 @@ define double @test_mm512_reduce_add_pd(<8 x double> %__W) {
 ; X86-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
 ; X86-NEXT:    vextractf128 $1, %ymm0, %xmm1
 ; X86-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
-; X86-NEXT:    vhaddpd %xmm0, %xmm0, %xmm0
+; X86-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
+; X86-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
 ; X86-NEXT:    vmovlpd %xmm0, (%esp)
 ; X86-NEXT:    fldl (%esp)
 ; X86-NEXT:    movl %ebp, %esp
@@ -7225,7 +7229,8 @@ define double @test_mm512_reduce_add_pd(<8 x double> %__W) {
 ; X64-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
 ; X64-NEXT:    vextractf128 $1, %ymm0, %xmm1
 ; X64-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
-; X64-NEXT:    vhaddpd %xmm0, %xmm0, %xmm0
+; X64-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
+; X64-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
 ; X64-NEXT:    vzeroupper
 ; X64-NEXT:    retq
 entry:
@@ -7405,7 +7410,8 @@ define double @test_mm512_mask_reduce_add_pd(i8 zeroext %__M, <8 x double> %__W)
 ; X86-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
 ; X86-NEXT:    vextractf128 $1, %ymm0, %xmm1
 ; X86-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
-; X86-NEXT:    vhaddpd %xmm0, %xmm0, %xmm0
+; X86-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
+; X86-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
 ; X86-NEXT:    vmovlpd %xmm0, (%esp)
 ; X86-NEXT:    fldl (%esp)
 ; X86-NEXT:    movl %ebp, %esp
@@ -7422,7 +7428,8 @@ define double @test_mm512_mask_reduce_add_pd(i8 zeroext %__M, <8 x double> %__W)
 ; X64-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
 ; X64-NEXT:    vextractf128 $1, %ymm0, %xmm1
 ; X64-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
-; X64-NEXT:    vhaddpd %xmm0, %xmm0, %xmm0
+; X64-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
+; X64-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
 ; X64-NEXT:    vzeroupper
 ; X64-NEXT:    retq
 entry:
