
Commit 34c73cf

DAG: Avoid forming shufflevector from a single extract_vector_elt
This avoids regressions in a future AMDGPU commit. Previously, on a target with free access to vector elements, a build_vector (extract_vector_elt x), undef would be bloated into a shuffle of one element plus undef, which has much worse combine support than the plain extract. Alternatively this could check aggressivelyPreferBuildVectorSources, but I'm not sure it's really different from isExtractVecEltCheap.
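For illustration, here is a minimal sketch of the node pattern the message describes: a BUILD_VECTOR whose only non-undef operand is a constant-index EXTRACT_VECTOR_ELT. The helper name and the v4f32/f32 types are assumptions for the example, not part of the patch.

// Sketch only: constructs the kind of node that reduceBuildVecToShuffle
// previously turned into a one-element shuffle plus undef.
#include "llvm/CodeGen/SelectionDAG.h"

using namespace llvm;

static SDValue makeSingleExtractBuildVector(SelectionDAG &DAG, const SDLoc &DL,
                                            SDValue Src /* v4f32 */) {
  // extract_vector_elt %Src, 2   (constant index)
  SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Src,
                            DAG.getVectorIdxConstant(2, DL));
  SDValue Undef = DAG.getUNDEF(MVT::f32);
  // build_vector (extract_vector_elt %Src, 2), undef, undef, undef
  return DAG.getBuildVector(MVT::v4f32, DL, {Elt, Undef, Undef, Undef});
}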

5 files changed (+34 −23 lines)

llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp (20 additions, 5 deletions)

@@ -23807,6 +23807,10 @@ SDValue DAGCombiner::reduceBuildVecToShuffle(SDNode *N) {
   SmallVector<SDValue, 8> VecIn;
   VecIn.push_back(SDValue());
 
+  // If we have a single extract_element with a constant index, track the index
+  // value.
+  unsigned OneConstExtractIndex = ~0u;
+
   for (unsigned i = 0; i != NumElems; ++i) {
     SDValue Op = N->getOperand(i);
 
@@ -23824,23 +23828,27 @@ SDValue DAGCombiner::reduceBuildVecToShuffle(SDNode *N) {
 
     // Not an undef or zero. If the input is something other than an
     // EXTRACT_VECTOR_ELT with an in-range constant index, bail out.
-    if (Op.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
-        !isa<ConstantSDNode>(Op.getOperand(1)))
+    if (Op.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
       return SDValue();
-    SDValue ExtractedFromVec = Op.getOperand(0);
 
+    SDValue ExtractedFromVec = Op.getOperand(0);
     if (ExtractedFromVec.getValueType().isScalableVector())
       return SDValue();
+    auto *ExtractIdx = dyn_cast<ConstantSDNode>(Op.getOperand(1));
+    if (!ExtractIdx)
+      return SDValue();
 
-    const APInt &ExtractIdx = Op.getConstantOperandAPInt(1);
-    if (ExtractIdx.uge(ExtractedFromVec.getValueType().getVectorNumElements()))
+    if (ExtractIdx->getAsAPIntVal().uge(
+            ExtractedFromVec.getValueType().getVectorNumElements()))
       return SDValue();
 
     // All inputs must have the same element type as the output.
     if (VT.getVectorElementType() !=
         ExtractedFromVec.getValueType().getVectorElementType())
       return SDValue();
 
+    OneConstExtractIndex = ExtractIdx->getZExtValue();
+
     // Have we seen this input vector before?
     // The vectors are expected to be tiny (usually 1 or 2 elements), so using
     // a map back from SDValues to numbers isn't worth it.
@@ -23863,6 +23871,13 @@ SDValue DAGCombiner::reduceBuildVecToShuffle(SDNode *N) {
   // VecIn accordingly.
   bool DidSplitVec = false;
   if (VecIn.size() == 2) {
+    // If we only found a single constant indexed extract_vector_elt feeding the
+    // build_vector, do not produce a more complicated shuffle if the extract is
+    // cheap.
+    if (TLI.isOperationLegalOrCustom(ISD::EXTRACT_VECTOR_ELT, VT) &&
+        TLI.isExtractVecEltCheap(VT, OneConstExtractIndex))
+      return SDValue();
+
     unsigned MaxIndex = 0;
     unsigned NearestPow2 = 0;
     SDValue Vec = VecIn.back();
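The new early exit consults the existing TargetLowering hooks isOperationLegalOrCustom and isExtractVecEltCheap. As a hedged sketch (the target class below is hypothetical, not from this commit), a target that wants the build_vector of a single cheap extract to survive would override the hook roughly like this; aggressivelyPreferBuildVectorSources, mentioned in the commit message, is another per-target hook in the same area.

// Hypothetical target lowering, for illustration only.
#include "llvm/CodeGen/TargetLowering.h"

class ExampleTargetLowering : public llvm::TargetLowering {
public:
  using TargetLowering::TargetLowering;

  // Report constant-index scalar extracts from simple vectors as cheap, e.g.
  // because scalar and vector values share a register file. When this returns
  // true, the guarded path above returns early instead of forming a shuffle.
  bool isExtractVecEltCheap(llvm::EVT VT, unsigned Index) const override {
    return VT.isSimple();
  }
};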

llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll (5 additions, 5 deletions)

@@ -452,11 +452,11 @@ define amdgpu_kernel void @byte8_inselt(ptr addrspace(1) %out, <8 x i8> %vec, i3
 ; GCN-NEXT: s_and_b32 s6, s4, 0x1010101
 ; GCN-NEXT: s_andn2_b64 s[2:3], s[2:3], s[4:5]
 ; GCN-NEXT: s_or_b64 s[2:3], s[6:7], s[2:3]
-; GCN-NEXT: v_mov_b32_e32 v3, s1
-; GCN-NEXT: v_mov_b32_e32 v0, s2
-; GCN-NEXT: v_mov_b32_e32 v1, s3
-; GCN-NEXT: v_mov_b32_e32 v2, s0
-; GCN-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; GCN-NEXT: v_mov_b32_e32 v0, s0
+; GCN-NEXT: v_mov_b32_e32 v2, s2
+; GCN-NEXT: v_mov_b32_e32 v1, s1
+; GCN-NEXT: v_mov_b32_e32 v3, s3
+; GCN-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
 ; GCN-NEXT: s_endpgm
 entry:
   %v = insertelement <8 x i8> %vec, i8 1, i32 %sel

llvm/test/CodeGen/X86/avx512-build-vector.ll (3 additions, 5 deletions)

@@ -14,11 +14,9 @@ define <16 x i32> @test2(<16 x i32> %x) {
 define <16 x float> @test3(<4 x float> %a) {
 ; CHECK-LABEL: test3:
 ; CHECK: ## %bb.0:
-; CHECK-NEXT: ## kill: def $xmm0 killed $xmm0 def $zmm0
-; CHECK-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,1,2,3,4,18,16,7,8,9,10,11,12,13,14,15]
-; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpermt2ps %zmm0, %zmm2, %zmm1
-; CHECK-NEXT: vmovaps %zmm1, %zmm0
+; CHECK-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[8,9,10,11,0,1,2,3],zero,zero,zero,zero
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vinserti32x4 $1, %xmm0, %zmm1, %zmm0
 ; CHECK-NEXT: retq
   %b = extractelement <4 x float> %a, i32 2
   %c = insertelement <16 x float> <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float undef, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00>, float %b, i32 5

llvm/test/CodeGen/X86/insertelement-duplicates.ll (4 additions, 6 deletions)

@@ -31,18 +31,16 @@ define void @PR15298(ptr nocapture %source, ptr nocapture %dest) nounwind noinli
 ; AVX-32: # %bb.0: # %L.entry
 ; AVX-32-NEXT: movl {{[0-9]+}}(%esp), %eax
 ; AVX-32-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; AVX-32-NEXT: vbroadcastss 304(%ecx), %xmm0
-; AVX-32-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; AVX-32-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3,4,5,6,7]
+; AVX-32-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX-32-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,0,0,1]
 ; AVX-32-NEXT: vmovups %ymm0, 608(%eax)
 ; AVX-32-NEXT: vzeroupper
 ; AVX-32-NEXT: retl
 ;
 ; AVX-64-LABEL: PR15298:
 ; AVX-64: # %bb.0: # %L.entry
-; AVX-64-NEXT: vbroadcastss 304(%rdi), %xmm0
-; AVX-64-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; AVX-64-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3,4,5,6,7]
+; AVX-64-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX-64-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,0,0,1]
 ; AVX-64-NEXT: vmovups %ymm0, 608(%rsi)
 ; AVX-64-NEXT: vzeroupper
 ; AVX-64-NEXT: retq

llvm/test/CodeGen/X86/sse-align-12.ll (2 additions, 2 deletions)

@@ -40,8 +40,8 @@ define <4 x float> @b(ptr %y, <4 x float> %z) nounwind {
 define <2 x double> @c(ptr %y) nounwind {
 ; CHECK-LABEL: c:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: movups (%rdi), %xmm0
-; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT: movhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
 ; CHECK-NEXT: retq
   %x = load <2 x double>, ptr %y, align 8
   %a = extractelement <2 x double> %x, i32 0
