Skip to content

Commit fa0e9ac

Browse files
committed
[X86] Remove PMADDWD/PMADDUBSW known bits handling due to performance issues
This appears to be causing a slow (infinite?) loop when building the highway open source project - most likely due to the high number of computeKnownBits calls (although improving early-out doesn't appear to help so far). I'm reverting support to unstick the highway team and will revisit this shortly. Reported by @alexfh
1 parent 559ea40 commit fa0e9ac

File tree

2 files changed

+68
-106
lines changed

2 files changed

+68
-106
lines changed

llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 0 additions & 90 deletions
Original file line numberDiff line numberDiff line change
@@ -37139,52 +37139,6 @@ static void computeKnownBitsForPSADBW(SDValue LHS, SDValue RHS,
3713937139
Known = Known.zext(64);
3714037140
}
3714137141

37142-
static void computeKnownBitsForPMADDWD(SDValue LHS, SDValue RHS,
37143-
KnownBits &Known,
37144-
const APInt &DemandedElts,
37145-
const SelectionDAG &DAG,
37146-
unsigned Depth) {
37147-
unsigned NumSrcElts = LHS.getValueType().getVectorNumElements();
37148-
37149-
// Multiply signed i16 elements to create i32 values and add Lo/Hi pairs.
37150-
APInt DemandedSrcElts = APIntOps::ScaleBitMask(DemandedElts, NumSrcElts);
37151-
APInt DemandedLoElts =
37152-
DemandedSrcElts & APInt::getSplat(NumSrcElts, APInt(2, 0b01));
37153-
APInt DemandedHiElts =
37154-
DemandedSrcElts & APInt::getSplat(NumSrcElts, APInt(2, 0b10));
37155-
KnownBits LHSLo = DAG.computeKnownBits(LHS, DemandedLoElts, Depth + 1);
37156-
KnownBits LHSHi = DAG.computeKnownBits(LHS, DemandedHiElts, Depth + 1);
37157-
KnownBits RHSLo = DAG.computeKnownBits(RHS, DemandedLoElts, Depth + 1);
37158-
KnownBits RHSHi = DAG.computeKnownBits(RHS, DemandedHiElts, Depth + 1);
37159-
KnownBits Lo = KnownBits::mul(LHSLo.sext(32), RHSLo.sext(32));
37160-
KnownBits Hi = KnownBits::mul(LHSHi.sext(32), RHSHi.sext(32));
37161-
Known = KnownBits::computeForAddSub(/*Add=*/true, /*NSW=*/false,
37162-
/*NUW=*/false, Lo, Hi);
37163-
}
37164-
37165-
static void computeKnownBitsForPMADDUBSW(SDValue LHS, SDValue RHS,
37166-
KnownBits &Known,
37167-
const APInt &DemandedElts,
37168-
const SelectionDAG &DAG,
37169-
unsigned Depth) {
37170-
unsigned NumSrcElts = LHS.getValueType().getVectorNumElements();
37171-
37172-
// Multiply unsigned/signed i8 elements to create i16 values and add_sat Lo/Hi
37173-
// pairs.
37174-
APInt DemandedSrcElts = APIntOps::ScaleBitMask(DemandedElts, NumSrcElts);
37175-
APInt DemandedLoElts =
37176-
DemandedSrcElts & APInt::getSplat(NumSrcElts, APInt(2, 0b01));
37177-
APInt DemandedHiElts =
37178-
DemandedSrcElts & APInt::getSplat(NumSrcElts, APInt(2, 0b10));
37179-
KnownBits LHSLo = DAG.computeKnownBits(LHS, DemandedLoElts, Depth + 1);
37180-
KnownBits LHSHi = DAG.computeKnownBits(LHS, DemandedHiElts, Depth + 1);
37181-
KnownBits RHSLo = DAG.computeKnownBits(RHS, DemandedLoElts, Depth + 1);
37182-
KnownBits RHSHi = DAG.computeKnownBits(RHS, DemandedHiElts, Depth + 1);
37183-
KnownBits Lo = KnownBits::mul(LHSLo.zext(16), RHSLo.sext(16));
37184-
KnownBits Hi = KnownBits::mul(LHSHi.zext(16), RHSHi.sext(16));
37185-
Known = KnownBits::sadd_sat(Lo, Hi);
37186-
}
37187-
3718837142
void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
3718937143
KnownBits &Known,
3719037144
const APInt &DemandedElts,
@@ -37360,26 +37314,6 @@ void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
3736037314
}
3736137315
break;
3736237316
}
37363-
case X86ISD::VPMADDWD: {
37364-
SDValue LHS = Op.getOperand(0);
37365-
SDValue RHS = Op.getOperand(1);
37366-
assert(VT.getVectorElementType() == MVT::i32 &&
37367-
LHS.getValueType() == RHS.getValueType() &&
37368-
LHS.getValueType().getVectorElementType() == MVT::i16 &&
37369-
"Unexpected PMADDWD types");
37370-
computeKnownBitsForPMADDWD(LHS, RHS, Known, DemandedElts, DAG, Depth);
37371-
break;
37372-
}
37373-
case X86ISD::VPMADDUBSW: {
37374-
SDValue LHS = Op.getOperand(0);
37375-
SDValue RHS = Op.getOperand(1);
37376-
assert(VT.getVectorElementType() == MVT::i16 &&
37377-
LHS.getValueType() == RHS.getValueType() &&
37378-
LHS.getValueType().getVectorElementType() == MVT::i8 &&
37379-
"Unexpected PMADDUBSW types");
37380-
computeKnownBitsForPMADDUBSW(LHS, RHS, Known, DemandedElts, DAG, Depth);
37381-
break;
37382-
}
3738337317
case X86ISD::PMULUDQ: {
3738437318
KnownBits Known2;
3738537319
Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
@@ -37516,30 +37450,6 @@ void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
3751637450
}
3751737451
case ISD::INTRINSIC_WO_CHAIN: {
3751837452
switch (Op->getConstantOperandVal(0)) {
37519-
case Intrinsic::x86_sse2_pmadd_wd:
37520-
case Intrinsic::x86_avx2_pmadd_wd:
37521-
case Intrinsic::x86_avx512_pmaddw_d_512: {
37522-
SDValue LHS = Op.getOperand(1);
37523-
SDValue RHS = Op.getOperand(2);
37524-
assert(VT.getScalarType() == MVT::i32 &&
37525-
LHS.getValueType() == RHS.getValueType() &&
37526-
LHS.getValueType().getScalarType() == MVT::i16 &&
37527-
"Unexpected PMADDWD types");
37528-
computeKnownBitsForPMADDWD(LHS, RHS, Known, DemandedElts, DAG, Depth);
37529-
break;
37530-
}
37531-
case Intrinsic::x86_ssse3_pmadd_ub_sw_128:
37532-
case Intrinsic::x86_avx2_pmadd_ub_sw:
37533-
case Intrinsic::x86_avx512_pmaddubs_w_512: {
37534-
SDValue LHS = Op.getOperand(1);
37535-
SDValue RHS = Op.getOperand(2);
37536-
assert(VT.getScalarType() == MVT::i16 &&
37537-
LHS.getValueType() == RHS.getValueType() &&
37538-
LHS.getValueType().getScalarType() == MVT::i8 &&
37539-
"Unexpected PMADDUBSW types");
37540-
computeKnownBitsForPMADDUBSW(LHS, RHS, Known, DemandedElts, DAG, Depth);
37541-
break;
37542-
}
3754337453
case Intrinsic::x86_sse2_psad_bw:
3754437454
case Intrinsic::x86_avx2_psad_bw:
3754537455
case Intrinsic::x86_avx512_psad_bw_512: {

llvm/test/CodeGen/X86/combine-pmadd.ll

Lines changed: 68 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -88,22 +88,48 @@ define <4 x i32> @combine_pmaddwd_demandedelts(<8 x i16> %a0, <8 x i16> %a1) {
8888
ret <4 x i32> %4
8989
}
9090

91+
; TODO
9192
define i32 @combine_pmaddwd_constant() {
92-
; CHECK-LABEL: combine_pmaddwd_constant:
93-
; CHECK: # %bb.0:
94-
; CHECK-NEXT: movl $-155, %eax
95-
; CHECK-NEXT: retq
93+
; SSE-LABEL: combine_pmaddwd_constant:
94+
; SSE: # %bb.0:
95+
; SSE-NEXT: pmovsxbw {{.*#+}} xmm0 = [65535,2,3,65532,65531,6,7,65528]
96+
; SSE-NEXT: pmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [65531,7,65527,65525,13,65521,17,65517]
97+
; SSE-NEXT: pextrd $2, %xmm0, %eax
98+
; SSE-NEXT: retq
99+
;
100+
; AVX-LABEL: combine_pmaddwd_constant:
101+
; AVX: # %bb.0:
102+
; AVX-NEXT: vpmovsxbw {{.*#+}} xmm0 = [65535,2,3,65532,65531,6,7,65528]
103+
; AVX-NEXT: vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [65531,7,65527,65525,13,65521,17,65517]
104+
; AVX-NEXT: vpextrd $2, %xmm0, %eax
105+
; AVX-NEXT: retq
96106
%1 = call <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16> <i16 -1, i16 2, i16 3, i16 -4, i16 -5, i16 6, i16 7, i16 -8>, <8 x i16> <i16 -5, i16 7, i16 -9, i16 -11, i16 13, i16 -15, i16 17, i16 -19>)
97107
%2 = extractelement <4 x i32> %1, i32 2 ; (-5*13)+(6*-15) = -155
98108
ret i32 %2
99109
}
100110

101111
; ensure we don't assume pmaddwd performs add nsw
102112
define i32 @combine_pmaddwd_constant_nsw() {
103-
; CHECK-LABEL: combine_pmaddwd_constant_nsw:
104-
; CHECK: # %bb.0:
105-
; CHECK-NEXT: movl $-2147483648, %eax # imm = 0x80000000
106-
; CHECK-NEXT: retq
113+
; SSE-LABEL: combine_pmaddwd_constant_nsw:
114+
; SSE: # %bb.0:
115+
; SSE-NEXT: movdqa {{.*#+}} xmm0 = [32768,32768,32768,32768,32768,32768,32768,32768]
116+
; SSE-NEXT: pmaddwd %xmm0, %xmm0
117+
; SSE-NEXT: movd %xmm0, %eax
118+
; SSE-NEXT: retq
119+
;
120+
; AVX1-LABEL: combine_pmaddwd_constant_nsw:
121+
; AVX1: # %bb.0:
122+
; AVX1-NEXT: vbroadcastss {{.*#+}} xmm0 = [32768,32768,32768,32768,32768,32768,32768,32768]
123+
; AVX1-NEXT: vpmaddwd %xmm0, %xmm0, %xmm0
124+
; AVX1-NEXT: vmovd %xmm0, %eax
125+
; AVX1-NEXT: retq
126+
;
127+
; AVX2-LABEL: combine_pmaddwd_constant_nsw:
128+
; AVX2: # %bb.0:
129+
; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm0 = [32768,32768,32768,32768,32768,32768,32768,32768]
130+
; AVX2-NEXT: vpmaddwd %xmm0, %xmm0, %xmm0
131+
; AVX2-NEXT: vmovd %xmm0, %eax
132+
; AVX2-NEXT: retq
107133
%1 = insertelement <8 x i16> undef, i16 32768, i32 0
108134
%2 = shufflevector <8 x i16> %1, <8 x i16> undef, <8 x i32> zeroinitializer
109135
%3 = call <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16> %2, <8 x i16> %2)
@@ -193,25 +219,51 @@ define <8 x i16> @combine_pmaddubsw_demandedelts(<16 x i8> %a0, <16 x i8> %a1) {
193219
ret <8 x i16> %4
194220
}
195221

222+
; TODO
196223
define i32 @combine_pmaddubsw_constant() {
197-
; CHECK-LABEL: combine_pmaddubsw_constant:
198-
; CHECK: # %bb.0:
199-
; CHECK-NEXT: movl $1694, %eax # imm = 0x69E
200-
; CHECK-NEXT: retq
224+
; SSE-LABEL: combine_pmaddubsw_constant:
225+
; SSE: # %bb.0:
226+
; SSE-NEXT: movdqa {{.*#+}} xmm0 = [0,1,2,3,4,5,250,7,8,9,10,11,12,13,14,15]
227+
; SSE-NEXT: pmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [1,2,3,4,5,6,7,248,9,10,11,12,13,14,15,16]
228+
; SSE-NEXT: pextrw $3, %xmm0, %eax
229+
; SSE-NEXT: cwtl
230+
; SSE-NEXT: retq
231+
;
232+
; AVX-LABEL: combine_pmaddubsw_constant:
233+
; AVX: # %bb.0:
234+
; AVX-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,2,3,4,5,250,7,8,9,10,11,12,13,14,15]
235+
; AVX-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [1,2,3,4,5,6,7,248,9,10,11,12,13,14,15,16]
236+
; AVX-NEXT: vpextrw $3, %xmm0, %eax
237+
; AVX-NEXT: cwtl
238+
; AVX-NEXT: retq
201239
%1 = call <8 x i16> @llvm.x86.ssse3.pmadd.ub.sw.128(<16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 -6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, <16 x i8> <i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 -8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16>)
202240
%2 = extractelement <8 x i16> %1, i32 3 ; ((uint16_t)-6*7)+(7*-8) = (250*7)+(7*-8) = 1694
203241
%3 = sext i16 %2 to i32
204242
ret i32 %3
205243
}
206244

245+
; TODO
207246
define i32 @combine_pmaddubsw_constant_sat() {
208-
; CHECK-LABEL: combine_pmaddubsw_constant_sat:
209-
; CHECK: # %bb.0:
210-
; CHECK-NEXT: movl $-32768, %eax # imm = 0x8000
211-
; CHECK-NEXT: retq
247+
; SSE-LABEL: combine_pmaddubsw_constant_sat:
248+
; SSE: # %bb.0:
249+
; SSE-NEXT: movdqa {{.*#+}} xmm0 = [255,255,2,3,4,5,250,7,8,9,10,11,12,13,14,15]
250+
; SSE-NEXT: pmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [128,128,3,4,5,6,7,248,9,10,11,12,13,14,15,16]
251+
; SSE-NEXT: movd %xmm0, %eax
252+
; SSE-NEXT: cwtl
253+
; SSE-NEXT: retq
254+
;
255+
; AVX-LABEL: combine_pmaddubsw_constant_sat:
256+
; AVX: # %bb.0:
257+
; AVX-NEXT: vmovdqa {{.*#+}} xmm0 = [255,255,2,3,4,5,250,7,8,9,10,11,12,13,14,15]
258+
; AVX-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [128,128,3,4,5,6,7,248,9,10,11,12,13,14,15,16]
259+
; AVX-NEXT: vmovd %xmm0, %eax
260+
; AVX-NEXT: cwtl
261+
; AVX-NEXT: retq
212262
%1 = call <8 x i16> @llvm.x86.ssse3.pmadd.ub.sw.128(<16 x i8> <i8 -1, i8 -1, i8 2, i8 3, i8 4, i8 5, i8 -6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, <16 x i8> <i8 -128, i8 -128, i8 3, i8 4, i8 5, i8 6, i8 7, i8 -8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16>)
213263
%2 = extractelement <8 x i16> %1, i32 0 ; add_sat_i16(((uint16_t)-1*-128),((uint16_t)-1*-128)_ = add_sat_i16(255*-128),(255*-128)) = sat_i16(-65280) = -32768
214264
%3 = sext i16 %2 to i32
215265
ret i32 %3
216266
}
217267

268+
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
269+
; CHECK: {{.*}}

0 commit comments

Comments
 (0)