Skip to content

Commit 9b32f3d

Browse files
authored
[DAG] visitEXTRACT_SUBVECTOR - don't return early on failure of EXTRACT_SUBVECTOR(INSERT_SUBVECTOR()) -> BITCAST fold (#133695)
Always allow later folds to try to match as well.
1 parent b9b9add commit 9b32f3d

File tree

6 files changed

+335
-362
lines changed

6 files changed

+335
-362
lines changed

llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp

Lines changed: 17 additions & 19 deletions
Original file line number | Diff line number | Diff line change
@@ -25532,26 +25532,24 @@ SDValue DAGCombiner::visitEXTRACT_SUBVECTOR(SDNode *N) {
2553225532
// Handle only simple case where vector being inserted and vector
2553325533
// being extracted are of same size.
2553425534
EVT SmallVT = V.getOperand(1).getValueType();
25535-
if (!NVT.bitsEq(SmallVT))
25536-
return SDValue();
25537-
25538-
// Combine:
25539-
// (extract_subvec (insert_subvec V1, V2, InsIdx), ExtIdx)
25540-
// Into:
25541-
// indices are equal or bit offsets are equal => V1
25542-
// otherwise => (extract_subvec V1, ExtIdx)
25543-
uint64_t InsIdx = V.getConstantOperandVal(2);
25544-
if (InsIdx * SmallVT.getScalarSizeInBits() ==
25545-
ExtIdx * NVT.getScalarSizeInBits()) {
25546-
if (LegalOperations && !TLI.isOperationLegal(ISD::BITCAST, NVT))
25547-
return SDValue();
25548-
25549-
return DAG.getBitcast(NVT, V.getOperand(1));
25535+
if (NVT.bitsEq(SmallVT)) {
25536+
// Combine:
25537+
// (extract_subvec (insert_subvec V1, V2, InsIdx), ExtIdx)
25538+
// Into:
25539+
// indices are equal or bit offsets are equal => V1
25540+
// otherwise => (extract_subvec V1, ExtIdx)
25541+
uint64_t InsIdx = V.getConstantOperandVal(2);
25542+
if (InsIdx * SmallVT.getScalarSizeInBits() ==
25543+
ExtIdx * NVT.getScalarSizeInBits()) {
25544+
if (!LegalOperations || TLI.isOperationLegal(ISD::BITCAST, NVT))
25545+
return DAG.getBitcast(NVT, V.getOperand(1));
25546+
} else {
25547+
return DAG.getNode(
25548+
ISD::EXTRACT_SUBVECTOR, DL, NVT,
25549+
DAG.getBitcast(N->getOperand(0).getValueType(), V.getOperand(0)),
25550+
N->getOperand(1));
25551+
}
2555025552
}
25551-
return DAG.getNode(
25552-
ISD::EXTRACT_SUBVECTOR, DL, NVT,
25553-
DAG.getBitcast(N->getOperand(0).getValueType(), V.getOperand(0)),
25554-
N->getOperand(1));
2555525553
}
2555625554

2555725555
if (SDValue NarrowBOp = narrowExtractedVectorBinOp(N, DAG, LegalOperations))

llvm/test/CodeGen/AMDGPU/load-local-redundant-copies.ll

Lines changed: 22 additions & 23 deletions
Original file line number | Diff line number | Diff line change
@@ -66,39 +66,38 @@ define amdgpu_vs void @test_3(i32 inreg %arg1, i32 inreg %arg2, ptr addrspace(8)
6666
; CHECK-NEXT: s_mov_b32 s6, s4
6767
; CHECK-NEXT: s_mov_b32 s5, s3
6868
; CHECK-NEXT: s_mov_b32 s4, s2
69-
; CHECK-NEXT: v_add_i32_e32 v0, vcc, 20, v1
70-
; CHECK-NEXT: v_add_i32_e32 v3, vcc, 16, v1
71-
; CHECK-NEXT: v_add_i32_e32 v4, vcc, 12, v1
72-
; CHECK-NEXT: v_add_i32_e32 v5, vcc, 8, v1
73-
; CHECK-NEXT: v_add_i32_e32 v8, vcc, 4, v1
69+
; CHECK-NEXT: v_add_i32_e32 v0, vcc, 12, v1
70+
; CHECK-NEXT: v_add_i32_e32 v3, vcc, 8, v1
71+
; CHECK-NEXT: v_add_i32_e32 v4, vcc, 4, v1
72+
; CHECK-NEXT: v_add_i32_e32 v6, vcc, 20, v1
73+
; CHECK-NEXT: v_add_i32_e32 v7, vcc, 16, v1
7474
; CHECK-NEXT: v_mov_b32_e32 v9, s0
75-
; CHECK-NEXT: v_add_i32_e32 v10, vcc, 20, v2
76-
; CHECK-NEXT: v_add_i32_e32 v11, vcc, 16, v2
75+
; CHECK-NEXT: v_add_i32_e32 v10, vcc, 12, v2
76+
; CHECK-NEXT: v_add_i32_e32 v11, vcc, 8, v2
7777
; CHECK-NEXT: s_mov_b32 m0, -1
78-
; CHECK-NEXT: ds_read_b32 v7, v3
79-
; CHECK-NEXT: ds_read_b32 v6, v4
80-
; CHECK-NEXT: ds_read_b32 v5, v5
81-
; CHECK-NEXT: ds_read_b32 v4, v8
82-
; CHECK-NEXT: ds_read_b32 v8, v0
78+
; CHECK-NEXT: ds_read_b32 v5, v3
79+
; CHECK-NEXT: ds_read_b32 v4, v4
80+
; CHECK-NEXT: ds_read_b32 v8, v6
81+
; CHECK-NEXT: ds_read_b32 v7, v7
82+
; CHECK-NEXT: ds_read_b32 v6, v0
8383
; CHECK-NEXT: ds_read_b32 v3, v1
84-
; CHECK-NEXT: v_add_i32_e32 v1, vcc, 12, v2
85-
; CHECK-NEXT: v_add_i32_e32 v12, vcc, 8, v2
86-
; CHECK-NEXT: v_add_i32_e32 v13, vcc, 4, v2
84+
; CHECK-NEXT: v_add_i32_e32 v0, vcc, 4, v2
85+
; CHECK-NEXT: v_add_i32_e32 v1, vcc, 20, v2
86+
; CHECK-NEXT: v_add_i32_e32 v12, vcc, 16, v2
8787
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
8888
; CHECK-NEXT: tbuffer_store_format_xyzw v[3:6], v9, s[4:7], s1 format:[BUF_DATA_FORMAT_32_32_32,BUF_NUM_FORMAT_UINT] idxen offset:264 glc slc
8989
; CHECK-NEXT: tbuffer_store_format_xy v[7:8], v9, s[4:7], s1 format:[BUF_DATA_FORMAT_INVALID,BUF_NUM_FORMAT_UINT] idxen offset:280 glc slc
90-
; CHECK-NEXT: ds_read_b32 v0, v11
9190
; CHECK-NEXT: s_waitcnt expcnt(1)
92-
; CHECK-NEXT: ds_read_b32 v5, v1
93-
; CHECK-NEXT: ds_read_b32 v4, v12
94-
; CHECK-NEXT: ds_read_b32 v3, v13
91+
; CHECK-NEXT: ds_read_b32 v4, v11
92+
; CHECK-NEXT: ds_read_b32 v3, v0
93+
; CHECK-NEXT: ds_read_b32 v1, v1
94+
; CHECK-NEXT: ds_read_b32 v0, v12
95+
; CHECK-NEXT: ds_read_b32 v5, v10
9596
; CHECK-NEXT: ds_read_b32 v2, v2
96-
; CHECK-NEXT: ds_read_b32 v1, v10
97-
; CHECK-NEXT: s_waitcnt lgkmcnt(5)
97+
; CHECK-NEXT: s_waitcnt lgkmcnt(2)
9898
; CHECK-NEXT: exp mrt0 off, off, off, off
99-
; CHECK-NEXT: s_waitcnt lgkmcnt(1)
100-
; CHECK-NEXT: tbuffer_store_format_xyzw v[2:5], v9, s[4:7], s1 format:[BUF_DATA_FORMAT_32_32_32,BUF_NUM_FORMAT_UINT] idxen offset:240 glc slc
10199
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
100+
; CHECK-NEXT: tbuffer_store_format_xyzw v[2:5], v9, s[4:7], s1 format:[BUF_DATA_FORMAT_32_32_32,BUF_NUM_FORMAT_UINT] idxen offset:240 glc slc
102101
; CHECK-NEXT: tbuffer_store_format_xy v[0:1], v9, s[4:7], s1 format:[BUF_DATA_FORMAT_INVALID,BUF_NUM_FORMAT_UINT] idxen offset:256 glc slc
103102
; CHECK-NEXT: s_endpgm
104103
%load1 = load <6 x float>, ptr addrspace(3) %arg5, align 4

llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-5.ll

Lines changed: 4 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -449,9 +449,8 @@ define void @store_i16_stride5_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
449449
; AVX512-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,3,1,3,4,5,6,7]
450450
; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
451451
; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[3],xmm0[4,5,6,7]
452-
; AVX512-NEXT: vinserti32x4 $2, %xmm0, %zmm2, %zmm1
453452
; AVX512-NEXT: vmovq %xmm0, 32(%r9)
454-
; AVX512-NEXT: vmovdqa %ymm1, (%r9)
453+
; AVX512-NEXT: vmovdqa %ymm2, (%r9)
455454
; AVX512-NEXT: vzeroupper
456455
; AVX512-NEXT: retq
457456
;
@@ -476,9 +475,8 @@ define void @store_i16_stride5_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
476475
; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,6,7,14,15,u,u,8,9,10,11,12,13,14,15]
477476
; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
478477
; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[3],xmm0[4,5,6,7]
479-
; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm2, %zmm1
480478
; AVX512-FCP-NEXT: vmovq %xmm0, 32(%r9)
481-
; AVX512-FCP-NEXT: vmovdqa %ymm1, (%r9)
479+
; AVX512-FCP-NEXT: vmovdqa %ymm2, (%r9)
482480
; AVX512-FCP-NEXT: vzeroupper
483481
; AVX512-FCP-NEXT: retq
484482
;
@@ -504,9 +502,8 @@ define void @store_i16_stride5_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
504502
; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,3,1,3,4,5,6,7]
505503
; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
506504
; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[3],xmm0[4,5,6,7]
507-
; AVX512DQ-NEXT: vinserti32x4 $2, %xmm0, %zmm2, %zmm1
508505
; AVX512DQ-NEXT: vmovq %xmm0, 32(%r9)
509-
; AVX512DQ-NEXT: vmovdqa %ymm1, (%r9)
506+
; AVX512DQ-NEXT: vmovdqa %ymm2, (%r9)
510507
; AVX512DQ-NEXT: vzeroupper
511508
; AVX512DQ-NEXT: retq
512509
;
@@ -531,9 +528,8 @@ define void @store_i16_stride5_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
531528
; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,6,7,14,15,u,u,8,9,10,11,12,13,14,15]
532529
; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
533530
; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[3],xmm0[4,5,6,7]
534-
; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm2, %zmm1
535531
; AVX512DQ-FCP-NEXT: vmovq %xmm0, 32(%r9)
536-
; AVX512DQ-FCP-NEXT: vmovdqa %ymm1, (%r9)
532+
; AVX512DQ-FCP-NEXT: vmovdqa %ymm2, (%r9)
537533
; AVX512DQ-FCP-NEXT: vzeroupper
538534
; AVX512DQ-FCP-NEXT: retq
539535
;

0 commit comments

Comments
 (0)