Skip to content

Commit a4a3ac1

Browse files
committed
[AMDGPU] Remove extract_subvector patterns
Removing them seems to slightly improve the quality of the generated code, and it also simplifies both the TableGen and C++ parts of the code. Differential Revision: https://reviews.llvm.org/D149853
1 parent d9be8a8 commit a4a3ac1

File tree

5 files changed

+36
-105
lines changed

5 files changed

+36
-105
lines changed

llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1441,11 +1441,6 @@ SDValue AMDGPUTargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op,
14411441
unsigned NumSrcElt = SrcVT.getVectorNumElements();
14421442
assert(NumElt % 2 == 0 && NumSrcElt % 2 == 0 && "expect legal types");
14431443

1444-
// We have some TableGen patterns for when the extracted vector is exactly
1445-
// the low or high half of the operand.
1446-
if ((NumSrcElt == 2 * NumElt) && (Start == 0 || Start == NumElt))
1447-
return Op;
1448-
14491444
// Extract 32-bit registers at a time.
14501445
EVT NewSrcVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumSrcElt / 2);
14511446
EVT NewVT = NumElt == 2

llvm/lib/Target/AMDGPU/SIInstructions.td

Lines changed: 0 additions & 60 deletions
Original file line numberDiff line numberDiff line change
@@ -1356,66 +1356,6 @@ foreach Index = 0-15 in {
13561356
}
13571357

13581358

1359-
def : Pat <
1360-
(extract_subvector v4i16:$vec, (i32 0)),
1361-
(v2i16 (EXTRACT_SUBREG v4i16:$vec, sub0))
1362-
>;
1363-
1364-
def : Pat <
1365-
(extract_subvector v4i16:$vec, (i32 2)),
1366-
(v2i16 (EXTRACT_SUBREG v4i16:$vec, sub1))
1367-
>;
1368-
1369-
def : Pat <
1370-
(extract_subvector v4f16:$vec, (i32 0)),
1371-
(v2f16 (EXTRACT_SUBREG v4f16:$vec, sub0))
1372-
>;
1373-
1374-
def : Pat <
1375-
(extract_subvector v4f16:$vec, (i32 2)),
1376-
(v2f16 (EXTRACT_SUBREG v4f16:$vec, sub1))
1377-
>;
1378-
1379-
def : Pat <
1380-
(extract_subvector v8i16:$vec, (i32 0)),
1381-
(v4i16 (EXTRACT_SUBREG v8i16:$vec, sub0_sub1))
1382-
>;
1383-
1384-
def : Pat <
1385-
(extract_subvector v8i16:$vec, (i32 4)),
1386-
(v4i16 (EXTRACT_SUBREG v8i16:$vec, sub2_sub3))
1387-
>;
1388-
1389-
def : Pat <
1390-
(extract_subvector v8f16:$vec, (i32 0)),
1391-
(v4f16 (EXTRACT_SUBREG v8f16:$vec, sub0_sub1))
1392-
>;
1393-
1394-
def : Pat <
1395-
(extract_subvector v8f16:$vec, (i32 4)),
1396-
(v4f16 (EXTRACT_SUBREG v8f16:$vec, sub2_sub3))
1397-
>;
1398-
1399-
def : Pat <
1400-
(extract_subvector v16i16:$vec, (i32 0)),
1401-
(v8i16 (EXTRACT_SUBREG v16i16:$vec, sub0_sub1_sub2_sub3))
1402-
>;
1403-
1404-
def : Pat <
1405-
(extract_subvector v16i16:$vec, (i32 8)),
1406-
(v8i16 (EXTRACT_SUBREG v16i16:$vec, sub4_sub5_sub6_sub7))
1407-
>;
1408-
1409-
def : Pat <
1410-
(extract_subvector v16f16:$vec, (i32 0)),
1411-
(v8f16 (EXTRACT_SUBREG v16f16:$vec, sub0_sub1_sub2_sub3))
1412-
>;
1413-
1414-
def : Pat <
1415-
(extract_subvector v16f16:$vec, (i32 8)),
1416-
(v8f16 (EXTRACT_SUBREG v16f16:$vec, sub4_sub5_sub6_sub7))
1417-
>;
1418-
14191359
foreach Index = 0-31 in {
14201360
def Extract_Element_v32i32_#Index : Extract_Element <
14211361
i32, v32i32, Index, !cast<SubRegIndex>(sub#Index)

llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll

Lines changed: 24 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -366,22 +366,20 @@ define <4 x half> @vec_8xf16_extract_4xf16(ptr addrspace(1) %p0, ptr addrspace(1
366366
; GFX9-NEXT: global_load_dwordx4 v[2:5], v[0:1], off glc
367367
; GFX9-NEXT: s_waitcnt vmcnt(0)
368368
; GFX9-NEXT: .LBB2_4: ; %exit
369-
; GFX9-NEXT: s_mov_b32 s4, 0x5040100
369+
; GFX9-NEXT: v_mov_b32_e32 v0, 0x3900
370+
; GFX9-NEXT: v_mov_b32_e32 v1, 0x3d00
370371
; GFX9-NEXT: s_waitcnt vmcnt(0)
371-
; GFX9-NEXT: v_perm_b32 v0, v3, v3, s4
372-
; GFX9-NEXT: v_mov_b32_e32 v1, 0x3800
373-
; GFX9-NEXT: v_mov_b32_e32 v3, 0x3900
374-
; GFX9-NEXT: v_mov_b32_e32 v4, 0x3d00
375-
; GFX9-NEXT: v_cmp_ge_f16_e32 vcc, 0.5, v0
376-
; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v4, vcc
377-
; GFX9-NEXT: v_cmp_nle_f16_sdwa vcc, v0, v1 src0_sel:WORD_1 src1_sel:DWORD
378-
; GFX9-NEXT: v_cndmask_b32_e32 v6, v4, v3, vcc
379372
; GFX9-NEXT: v_cmp_ge_f16_e32 vcc, 0.5, v2
380-
; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc
381-
; GFX9-NEXT: v_cmp_le_f16_sdwa vcc, v2, v1 src0_sel:WORD_1 src1_sel:DWORD
382-
; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
383-
; GFX9-NEXT: v_pack_b32_f16 v0, v0, v1
384-
; GFX9-NEXT: v_pack_b32_f16 v1, v5, v6
373+
; GFX9-NEXT: v_mov_b32_e32 v5, 0x3800
374+
; GFX9-NEXT: v_cndmask_b32_e32 v4, v0, v1, vcc
375+
; GFX9-NEXT: v_cmp_le_f16_sdwa vcc, v2, v5 src0_sel:WORD_1 src1_sel:DWORD
376+
; GFX9-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc
377+
; GFX9-NEXT: v_cmp_nge_f16_e32 vcc, 0.5, v3
378+
; GFX9-NEXT: v_cndmask_b32_e32 v5, v1, v0, vcc
379+
; GFX9-NEXT: v_cmp_ge_f16_e32 vcc, 0.5, v3
380+
; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
381+
; GFX9-NEXT: v_pack_b32_f16 v1, v0, v5
382+
; GFX9-NEXT: v_pack_b32_f16 v0, v4, v2
385383
; GFX9-NEXT: s_setpc_b64 s[30:31]
386384
br i1 undef, label %T, label %F
387385

@@ -882,22 +880,20 @@ define <4 x half> @vec_16xf16_extract_4xf16(ptr addrspace(1) %p0, ptr addrspace(
882880
; GFX9-NEXT: s_waitcnt vmcnt(0)
883881
; GFX9-NEXT: ; kill: killed $vgpr0 killed $vgpr1
884882
; GFX9-NEXT: .LBB5_4: ; %exit
885-
; GFX9-NEXT: s_mov_b32 s4, 0x5040100
883+
; GFX9-NEXT: v_mov_b32_e32 v0, 0x3900
884+
; GFX9-NEXT: v_mov_b32_e32 v1, 0x3d00
886885
; GFX9-NEXT: s_waitcnt vmcnt(0)
887-
; GFX9-NEXT: v_perm_b32 v0, v5, v5, s4
888-
; GFX9-NEXT: v_mov_b32_e32 v1, 0x3800
889-
; GFX9-NEXT: v_mov_b32_e32 v2, 0x3900
890-
; GFX9-NEXT: v_mov_b32_e32 v3, 0x3d00
891-
; GFX9-NEXT: v_cmp_ge_f16_e32 vcc, 0.5, v0
892-
; GFX9-NEXT: v_cndmask_b32_e32 v5, v2, v3, vcc
893-
; GFX9-NEXT: v_cmp_nle_f16_sdwa vcc, v0, v1 src0_sel:WORD_1 src1_sel:DWORD
894-
; GFX9-NEXT: v_cndmask_b32_e32 v6, v3, v2, vcc
895886
; GFX9-NEXT: v_cmp_ge_f16_e32 vcc, 0.5, v4
896-
; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
897-
; GFX9-NEXT: v_cmp_le_f16_sdwa vcc, v4, v1 src0_sel:WORD_1 src1_sel:DWORD
898-
; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
899-
; GFX9-NEXT: v_pack_b32_f16 v0, v0, v1
900-
; GFX9-NEXT: v_pack_b32_f16 v1, v5, v6
887+
; GFX9-NEXT: v_mov_b32_e32 v3, 0x3800
888+
; GFX9-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc
889+
; GFX9-NEXT: v_cmp_le_f16_sdwa vcc, v4, v3 src0_sel:WORD_1 src1_sel:DWORD
890+
; GFX9-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc
891+
; GFX9-NEXT: v_cmp_nge_f16_e32 vcc, 0.5, v5
892+
; GFX9-NEXT: v_cndmask_b32_e32 v4, v1, v0, vcc
893+
; GFX9-NEXT: v_cmp_ge_f16_e32 vcc, 0.5, v5
894+
; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
895+
; GFX9-NEXT: v_pack_b32_f16 v1, v0, v4
896+
; GFX9-NEXT: v_pack_b32_f16 v0, v2, v3
901897
; GFX9-NEXT: s_setpc_b64 s[30:31]
902898
br i1 undef, label %T, label %F
903899

llvm/test/CodeGen/AMDGPU/greedy-reverse-local-assignment.ll

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -30,12 +30,12 @@ define <4 x half> @shuffle_v4f16_234u(ptr addrspace(1) %arg0, ptr addrspace(1) %
3030
; REVERSEXNACK-LABEL: shuffle_v4f16_234u:
3131
; REVERSEXNACK: ; %bb.0:
3232
; REVERSEXNACK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
33-
; REVERSEXNACK-NEXT: v_mov_b32_e32 v6, v1
34-
; REVERSEXNACK-NEXT: v_mov_b32_e32 v5, v0
35-
; REVERSEXNACK-NEXT: v_mov_b32_e32 v4, v3
36-
; REVERSEXNACK-NEXT: v_mov_b32_e32 v3, v2
37-
; REVERSEXNACK-NEXT: global_load_dword v0, v[5:6], off offset:4
38-
; REVERSEXNACK-NEXT: global_load_dwordx2 v[1:2], v[3:4], off
33+
; REVERSEXNACK-NEXT: v_mov_b32_e32 v6, v3
34+
; REVERSEXNACK-NEXT: v_mov_b32_e32 v4, v1
35+
; REVERSEXNACK-NEXT: v_mov_b32_e32 v3, v0
36+
; REVERSEXNACK-NEXT: v_mov_b32_e32 v5, v2
37+
; REVERSEXNACK-NEXT: global_load_dword v0, v[3:4], off offset:4
38+
; REVERSEXNACK-NEXT: global_load_dwordx2 v[1:2], v[5:6], off
3939
; REVERSEXNACK-NEXT: s_waitcnt vmcnt(0)
4040
; REVERSEXNACK-NEXT: s_setpc_b64 s[30:31]
4141
;

llvm/test/CodeGen/AMDGPU/sra.ll

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -187,13 +187,13 @@ define amdgpu_kernel void @ashr_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %i
187187
; VI-NEXT: s_mov_b32 s4, s0
188188
; VI-NEXT: s_mov_b32 s5, s1
189189
; VI-NEXT: s_waitcnt vmcnt(0)
190-
; VI-NEXT: v_readfirstlane_b32 s0, v0
191-
; VI-NEXT: v_readfirstlane_b32 s1, v1
192-
; VI-NEXT: s_ashr_i32 s2, s0, 16
193-
; VI-NEXT: s_sext_i32_i16 s0, s0
194-
; VI-NEXT: s_ashr_i32 s3, s1, 16
190+
; VI-NEXT: v_readfirstlane_b32 s0, v1
191+
; VI-NEXT: v_readfirstlane_b32 s1, v0
192+
; VI-NEXT: s_ashr_i32 s2, s1, 16
195193
; VI-NEXT: s_sext_i32_i16 s1, s1
196-
; VI-NEXT: s_ashr_i32 s0, s0, s1
194+
; VI-NEXT: s_ashr_i32 s3, s0, 16
195+
; VI-NEXT: s_sext_i32_i16 s0, s0
196+
; VI-NEXT: s_ashr_i32 s0, s1, s0
197197
; VI-NEXT: s_ashr_i32 s1, s2, s3
198198
; VI-NEXT: s_lshl_b32 s1, s1, 16
199199
; VI-NEXT: s_and_b32 s0, s0, 0xffff

0 commit comments

Comments
 (0)