Skip to content

Commit 0641139

Browse files
authored
[AMDGPU][True16][CodeGen] srl pattern for true16 mode (#132987)
Added an srl pattern for the true16 flow. A right shift by 16 bits is changed into a reg_sequence, `srl vgpr32, 16 -> reg_sequence (vgpr32.hi16, 0)`, which is finally lowered to two COPYs: `vdst.lo16 = COPY vsrc.hi16` and `vdst.hi16 = COPY 0`. The benefit of this transform is that it allows subsequent passes to optimize out these copies.
1 parent 6075275 commit 0641139

28 files changed

+4911
-4652
lines changed

llvm/lib/Target/AMDGPU/SIInstructions.td

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2425,6 +2425,13 @@ def : GCNPat <(i1 imm:$imm),
24252425
let WaveSizePredicate = isWave32;
24262426
}
24272427

2428+
let True16Predicate = UseRealTrue16Insts in
2429+
foreach vt = [i32, v2i16] in
2430+
def : GCNPat <
2431+
(vt (DivergentBinFrag<srl> VGPR_32:$src, (i32 16))),
2432+
(REG_SEQUENCE VGPR_32, (i16 (EXTRACT_SUBREG $src, hi16)), lo16, (V_MOV_B16_t16_e64 0, (i16 0x0000), 0), hi16)
2433+
>;
2434+
24282435
/********** ================== **********/
24292436
/********** Intrinsic Patterns **********/
24302437
/********** ================== **********/

llvm/test/CodeGen/AMDGPU/add.v2i16.ll

Lines changed: 92 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -658,26 +658,47 @@ define amdgpu_kernel void @v_test_add_v2i16_zext_to_v2i32(ptr addrspace(1) %out,
658658
; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
659659
; GFX10-NEXT: s_endpgm
660660
;
661-
; GFX11-LABEL: v_test_add_v2i16_zext_to_v2i32:
662-
; GFX11: ; %bb.0:
663-
; GFX11-NEXT: s_clause 0x1
664-
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
665-
; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
666-
; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
667-
; GFX11-NEXT: v_mov_b32_e32 v2, 0
668-
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
669-
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
670-
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
671-
; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
672-
; GFX11-NEXT: s_waitcnt vmcnt(0)
673-
; GFX11-NEXT: global_load_b32 v0, v0, s[4:5] glc dlc
674-
; GFX11-NEXT: s_waitcnt vmcnt(0)
675-
; GFX11-NEXT: v_pk_add_u16 v0, v1, v0
676-
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
677-
; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0
678-
; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
679-
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
680-
; GFX11-NEXT: s_endpgm
661+
; GFX11-TRUE16-LABEL: v_test_add_v2i16_zext_to_v2i32:
662+
; GFX11-TRUE16: ; %bb.0:
663+
; GFX11-TRUE16-NEXT: s_clause 0x1
664+
; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
665+
; GFX11-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
666+
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v0, 0x3ff, v0
667+
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
668+
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 2, v0
669+
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
670+
; GFX11-TRUE16-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
671+
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
672+
; GFX11-TRUE16-NEXT: global_load_b32 v0, v0, s[4:5] glc dlc
673+
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
674+
; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, v1, v0
675+
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0
676+
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
677+
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v2
678+
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.h
679+
; GFX11-TRUE16-NEXT: global_store_b64 v3, v[0:1], s[0:1]
680+
; GFX11-TRUE16-NEXT: s_endpgm
681+
;
682+
; GFX11-FAKE16-LABEL: v_test_add_v2i16_zext_to_v2i32:
683+
; GFX11-FAKE16: ; %bb.0:
684+
; GFX11-FAKE16-NEXT: s_clause 0x1
685+
; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
686+
; GFX11-FAKE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
687+
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
688+
; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, 0
689+
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
690+
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 2, v0
691+
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
692+
; GFX11-FAKE16-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
693+
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
694+
; GFX11-FAKE16-NEXT: global_load_b32 v0, v0, s[4:5] glc dlc
695+
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
696+
; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, v1, v0
697+
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
698+
; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0
699+
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
700+
; GFX11-FAKE16-NEXT: global_store_b64 v2, v[0:1], s[0:1]
701+
; GFX11-FAKE16-NEXT: s_endpgm
681702
%tid = call i32 @llvm.amdgcn.workitem.id.x()
682703
%gep.out = getelementptr inbounds <2 x i32>, ptr addrspace(1) %out, i32 %tid
683704
%gep.in0 = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in0, i32 %tid
@@ -971,30 +992,57 @@ define amdgpu_kernel void @v_test_add_v2i16_sext_to_v2i64(ptr addrspace(1) %out,
971992
; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
972993
; GFX10-NEXT: s_endpgm
973994
;
974-
; GFX11-LABEL: v_test_add_v2i16_sext_to_v2i64:
975-
; GFX11: ; %bb.0:
976-
; GFX11-NEXT: s_clause 0x1
977-
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
978-
; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
979-
; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
980-
; GFX11-NEXT: v_mov_b32_e32 v4, 0
981-
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
982-
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
983-
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
984-
; GFX11-NEXT: s_clause 0x1
985-
; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
986-
; GFX11-NEXT: global_load_b32 v0, v0, s[4:5]
987-
; GFX11-NEXT: s_waitcnt vmcnt(0)
988-
; GFX11-NEXT: v_pk_add_u16 v0, v1, v0
989-
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
990-
; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0
991-
; GFX11-NEXT: v_bfe_i32 v0, v0, 0, 16
992-
; GFX11-NEXT: v_bfe_i32 v2, v1, 0, 16
993-
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
994-
; GFX11-NEXT: v_ashrrev_i32_e32 v1, 31, v0
995-
; GFX11-NEXT: v_ashrrev_i32_e32 v3, 31, v2
996-
; GFX11-NEXT: global_store_b128 v4, v[0:3], s[0:1]
997-
; GFX11-NEXT: s_endpgm
995+
; GFX11-TRUE16-LABEL: v_test_add_v2i16_sext_to_v2i64:
996+
; GFX11-TRUE16: ; %bb.0:
997+
; GFX11-TRUE16-NEXT: s_clause 0x1
998+
; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
999+
; GFX11-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
1000+
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
1001+
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, 0
1002+
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
1003+
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1004+
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
1005+
; GFX11-TRUE16-NEXT: s_clause 0x1
1006+
; GFX11-TRUE16-NEXT: global_load_b32 v1, v0, s[2:3]
1007+
; GFX11-TRUE16-NEXT: global_load_b32 v0, v0, s[4:5]
1008+
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
1009+
; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, v1, v0
1010+
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0
1011+
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
1012+
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.h
1013+
; GFX11-TRUE16-NEXT: v_bfe_i32 v0, v0, 0, 16
1014+
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
1015+
; GFX11-TRUE16-NEXT: v_bfe_i32 v2, v1, 0, 16
1016+
; GFX11-TRUE16-NEXT: v_ashrrev_i32_e32 v1, 31, v0
1017+
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
1018+
; GFX11-TRUE16-NEXT: v_ashrrev_i32_e32 v3, 31, v2
1019+
; GFX11-TRUE16-NEXT: global_store_b128 v4, v[0:3], s[0:1]
1020+
; GFX11-TRUE16-NEXT: s_endpgm
1021+
;
1022+
; GFX11-FAKE16-LABEL: v_test_add_v2i16_sext_to_v2i64:
1023+
; GFX11-FAKE16: ; %bb.0:
1024+
; GFX11-FAKE16-NEXT: s_clause 0x1
1025+
; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
1026+
; GFX11-FAKE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
1027+
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
1028+
; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, 0
1029+
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
1030+
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1031+
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
1032+
; GFX11-FAKE16-NEXT: s_clause 0x1
1033+
; GFX11-FAKE16-NEXT: global_load_b32 v1, v0, s[2:3]
1034+
; GFX11-FAKE16-NEXT: global_load_b32 v0, v0, s[4:5]
1035+
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
1036+
; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, v1, v0
1037+
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
1038+
; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0
1039+
; GFX11-FAKE16-NEXT: v_bfe_i32 v0, v0, 0, 16
1040+
; GFX11-FAKE16-NEXT: v_bfe_i32 v2, v1, 0, 16
1041+
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
1042+
; GFX11-FAKE16-NEXT: v_ashrrev_i32_e32 v1, 31, v0
1043+
; GFX11-FAKE16-NEXT: v_ashrrev_i32_e32 v3, 31, v2
1044+
; GFX11-FAKE16-NEXT: global_store_b128 v4, v[0:3], s[0:1]
1045+
; GFX11-FAKE16-NEXT: s_endpgm
9981046
%tid = call i32 @llvm.amdgcn.workitem.id.x()
9991047
%gep.out = getelementptr inbounds <2 x i64>, ptr addrspace(1) %out, i32 %tid
10001048
%gep.in0 = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in0, i32 %tid

0 commit comments

Comments
 (0)