Skip to content

Commit 6887016

Browse files
[AMDGPU] Mark permlane instructions as convergent (#142962)
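Without explicitly marking the permlane instructions as convergent, the v_permlanex16 instruction in the added test is sunk into the second (conditionally executed) block, where it would run after the branch has changed the set of active lanes even though it reads data from other lanes. I am not sure whether this is the right place to set the flag. Co-authored-by: Paul Trojahn <[email protected]>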
1 parent 823750d commit 6887016

File tree

2 files changed

+70
-2
lines changed

2 files changed

+70
-2
lines changed

llvm/lib/Target/AMDGPU/VOP3Instructions.td

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1324,10 +1324,10 @@ let SubtargetPredicate = isGFX10Plus in {
13241324
} // End isCommutable = 1, isReMaterializable = 1
13251325
def : ThreeOp_i32_Pats<xor, xor, V_XOR3_B32_e64>;
13261326

1327-
let Constraints = "$vdst = $vdst_in", DisableEncoding="$vdst_in" in {
1327+
let Constraints = "$vdst = $vdst_in", DisableEncoding="$vdst_in", isConvergent = 1 in {
13281328
defm V_PERMLANE16_B32 : VOP3Inst<"v_permlane16_b32", VOP3_PERMLANE_Profile>;
13291329
defm V_PERMLANEX16_B32 : VOP3Inst<"v_permlanex16_b32", VOP3_PERMLANE_Profile>;
1330-
} // End $vdst = $vdst_in, DisableEncoding $vdst_in
1330+
} // End $vdst = $vdst_in, DisableEncoding $vdst_in, isConvergent = 1
13311331

13321332
foreach vt = Reg32Types.types in {
13331333
def : PermlanePat<int_amdgcn_permlane16, V_PERMLANE16_B32_e64, vt>;

llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll

Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10410,3 +10410,71 @@ define void @v_permlanex16_v8f64(ptr addrspace(1) %out, <8 x double> %src0, i32
1041010410
store <8 x double> %v, ptr addrspace(1) %out
1041110411
ret void
1041210412
}
10413+
10414+
define amdgpu_kernel void @v_permlanex16_convergent(ptr addrspace(1) %out, i32 %src0, i32 %pattern_lo, i32 %pattern_hi) {
10415+
; GFX10-LABEL: v_permlanex16_convergent:
10416+
; GFX10: ; %bb.0:
10417+
; GFX10-NEXT: s_clause 0x1
10418+
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c
10419+
; GFX10-NEXT: s_load_dword s2, s[4:5], 0x34
10420+
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
10421+
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
10422+
; GFX10-NEXT: v_mov_b32_e32 v1, s0
10423+
; GFX10-NEXT: v_permlanex16_b32 v1, v1, s1, s2
10424+
; GFX10-NEXT: s_and_saveexec_b32 s0, vcc_lo
10425+
; GFX10-NEXT: s_cbranch_execz .LBB142_2
10426+
; GFX10-NEXT: ; %bb.1: ; %t
10427+
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
10428+
; GFX10-NEXT: v_mov_b32_e32 v0, 0
10429+
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
10430+
; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
10431+
; GFX10-NEXT: .LBB142_2: ; %f
10432+
; GFX10-NEXT: s_endpgm
10433+
;
10434+
; GFX11-LABEL: v_permlanex16_convergent:
10435+
; GFX11: ; %bb.0:
10436+
; GFX11-NEXT: s_clause 0x1
10437+
; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x2c
10438+
; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x34
10439+
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
10440+
; GFX11-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_and_b32 v0, 0x3ff, v0
10441+
; GFX11-NEXT: s_mov_b32 s0, exec_lo
10442+
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
10443+
; GFX11-NEXT: v_permlanex16_b32 v1, v1, s1, s2
10444+
; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0
10445+
; GFX11-NEXT: s_cbranch_execz .LBB142_2
10446+
; GFX11-NEXT: ; %bb.1: ; %t
10447+
; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
10448+
; GFX11-NEXT: v_mov_b32_e32 v0, 0
10449+
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
10450+
; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
10451+
; GFX11-NEXT: .LBB142_2: ; %f
10452+
; GFX11-NEXT: s_endpgm
10453+
;
10454+
; GFX12-LABEL: v_permlanex16_convergent:
10455+
; GFX12: ; %bb.0:
10456+
; GFX12-NEXT: s_load_b96 s[0:2], s[4:5], 0x2c
10457+
; GFX12-NEXT: s_wait_kmcnt 0x0
10458+
; GFX12-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_and_b32 v0, 0x3ff, v0
10459+
; GFX12-NEXT: s_mov_b32 s0, exec_lo
10460+
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
10461+
; GFX12-NEXT: v_permlanex16_b32 v1, v1, s1, s2
10462+
; GFX12-NEXT: v_cmpx_eq_u32_e32 0, v0
10463+
; GFX12-NEXT: s_cbranch_execz .LBB142_2
10464+
; GFX12-NEXT: ; %bb.1: ; %t
10465+
; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
10466+
; GFX12-NEXT: v_mov_b32_e32 v0, 0
10467+
; GFX12-NEXT: s_wait_kmcnt 0x0
10468+
; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
10469+
; GFX12-NEXT: .LBB142_2: ; %f
10470+
; GFX12-NEXT: s_endpgm
10471+
%tidx = call i32 @llvm.amdgcn.workitem.id.x()
10472+
%v = call i32 @llvm.amdgcn.permlanex16.i32(i32 %src0, i32 %src0, i32 %pattern_lo, i32 %pattern_hi, i1 false, i1 false)
10473+
%select = icmp eq i32 %tidx, 0
10474+
br i1 %select, label %t, label %f
10475+
t:
10476+
store i32 %v, ptr addrspace(1) %out
10477+
br label %f
10478+
f:
10479+
ret void
10480+
}

0 commit comments

Comments
 (0)