Skip to content

Commit ecec7d1

Browse files
authored
DAG: Use phi in alloca constant case to create virtual registers (#130254)
This is a follow-up to 39bf765, covering the other case handled here. Previously, we would create a CopyToReg marked as uniform, even though the resulting phi would need to use VGPRs due to another divergent input. There's no directly observable change in the final output of the new test, but it does exercise this case.
1 parent 0db702a commit ecec7d1

File tree

2 files changed

+50
-1
lines changed

2 files changed

+50
-1
lines changed

llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12045,7 +12045,7 @@ SelectionDAGBuilder::HandlePHINodesInSuccessorBlocks(const BasicBlock *LLVMBB) {
1204512045
assert(isa<AllocaInst>(PHIOp) &&
1204612046
FuncInfo.StaticAllocaMap.count(cast<AllocaInst>(PHIOp)) &&
1204712047
"Didn't codegen value into a register!??");
12048-
Reg = FuncInfo.CreateRegs(PHIOp);
12048+
Reg = FuncInfo.CreateRegs(&PN);
1204912049
CopyValueToVirtualRegister(PHIOp, Reg);
1205012050
}
1205112051
}

llvm/test/CodeGen/AMDGPU/copy-to-reg-frameindex.ll

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,3 +35,52 @@ done:
3535
store i32 %extract.0, ptr addrspace(1) %out, align 4
3636
ret void
3737
}
38+
39+
; When creating registers for %divergent.alloca.phi, the CopyToReg should
40+
; be reported as divergent (not as uniform merely because the alloca is
41+
; uniform), since the phi's other incoming value is divergent.
42+
define void @phi_with_alloca_and_divergent_copy_to_reg(ptr addrspace(5) %divergent.private, ptr addrspace(1) %out, i32 %a, i32 %b, i32 %c) {
43+
; CHECK-LABEL: phi_with_alloca_and_divergent_copy_to_reg:
44+
; CHECK: ; %bb.0: ; %entry
45+
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
46+
; CHECK-NEXT: v_mov_b32_e32 v7, v2
47+
; CHECK-NEXT: v_mov_b32_e32 v6, v1
48+
; CHECK-NEXT: s_mov_b64 s[4:5], 0
49+
; CHECK-NEXT: v_lshrrev_b32_e64 v2, 6, s32
50+
; CHECK-NEXT: .LBB1_1: ; %loop
51+
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
52+
; CHECK-NEXT: v_mov_b32_e32 v1, v2
53+
; CHECK-NEXT: v_lshl_add_u32 v2, v3, 2, v1
54+
; CHECK-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen
55+
; CHECK-NEXT: v_add_u32_e32 v2, 1, v3
56+
; CHECK-NEXT: v_cmp_lt_u32_e32 vcc, 15, v2
57+
; CHECK-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
58+
; CHECK-NEXT: v_mov_b32_e32 v3, v4
59+
; CHECK-NEXT: v_mov_b32_e32 v2, v0
60+
; CHECK-NEXT: s_andn2_b64 exec, exec, s[4:5]
61+
; CHECK-NEXT: s_cbranch_execnz .LBB1_1
62+
; CHECK-NEXT: ; %bb.2: ; %done
63+
; CHECK-NEXT: s_or_b64 exec, exec, s[4:5]
64+
; CHECK-NEXT: buffer_load_dword v0, v1, s[0:3], 0 offen
65+
; CHECK-NEXT: s_waitcnt vmcnt(0)
66+
; CHECK-NEXT: global_store_dword v[6:7], v0, off
67+
; CHECK-NEXT: s_waitcnt vmcnt(0)
68+
; CHECK-NEXT: s_setpc_b64 s[30:31]
69+
entry:
70+
%alloca0 = alloca [16 x i32], addrspace(5)
71+
br label %loop
72+
73+
loop:
74+
%inc = phi i32 [%a, %entry], [%b, %loop]
75+
%divergent.alloca.phi = phi ptr addrspace(5) [ %alloca0, %entry ], [ %divergent.private, %loop ]
76+
%ptr = getelementptr [16 x i32], ptr addrspace(5) %divergent.alloca.phi, i32 0, i32 %inc
77+
store i32 %inc, ptr addrspace(5) %ptr
78+
%inc.i = add i32 %inc, 1
79+
%cnd = icmp uge i32 %inc.i, 16
80+
br i1 %cnd, label %done, label %loop
81+
82+
done:
83+
%tmp1 = load i32, ptr addrspace(5) %divergent.alloca.phi
84+
store i32 %tmp1, ptr addrspace(1) %out
85+
ret void
86+
}

0 commit comments

Comments
 (0)