
Commit 8c07f48

[AMDGPU] SIFixSGPRCopies should not process twice V2S copies created by PHI preprocessing
1 parent 29aa9d0 commit 8c07f48

9 files changed: +530 -214 lines
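
The idea of the change: the PHI-preprocessing step in SIFixSGPRCopies::run itself creates VGPR-to-SGPR (V2S) copies, and those must not be fed back into the generic V2S-copy analysis, which could otherwise visit (and later erase) them a second time. The commit records them in a PHISources set and has analyzeVGPRToSGPRCopy return early for them. Below is a minimal standalone C++ sketch of that skip-set pattern; the type and method names (V2SCopyAnalysis, noteCopyFromPHILowering) are invented for illustration and are not LLVM API.

#include <unordered_set>

struct MachineInstr {}; // stand-in for llvm::MachineInstr, only for this sketch

class V2SCopyAnalysis {
  // Copies materialized while lowering PHIs; they have already been handled,
  // so the generic analysis must skip them instead of processing them again.
  std::unordered_set<MachineInstr *> PHISources;

public:
  void noteCopyFromPHILowering(MachineInstr *MI) { PHISources.insert(MI); }

  void analyzeCopy(MachineInstr *MI) {
    if (PHISources.count(MI))
      return; // created by PHI preprocessing: do not process it a second time
    // ... regular VGPR-to-SGPR copy analysis would go here ...
  }

  // Cleared per function, mirroring the PHISources.clear() added in the patch.
  void reset() { PHISources.clear(); }
};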

llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp

Lines changed: 20 additions & 3 deletions
@@ -127,6 +127,7 @@ class SIFixSGPRCopies {
   unsigned NextVGPRToSGPRCopyID = 0;
   MapVector<unsigned, V2SCopyInfo> V2SCopies;
   DenseMap<MachineInstr *, SetVector<unsigned>> SiblingPenalty;
+  DenseSet<MachineInstr *> PHISources;
 
 public:
   MachineRegisterInfo *MRI;
@@ -692,6 +693,7 @@ bool SIFixSGPRCopies::run(MachineFunction &MF) {
             .addReg(MO.getReg());
         MO.setReg(NewDst);
         analyzeVGPRToSGPRCopy(NewCopy);
+        PHISources.insert(NewCopy);
       }
     }
   }
@@ -798,6 +800,7 @@ bool SIFixSGPRCopies::run(MachineFunction &MF) {
   RegSequences.clear();
   PHINodes.clear();
   S2VCopies.clear();
+  PHISources.clear();
 
   return true;
 }
@@ -923,6 +926,8 @@ bool SIFixSGPRCopies::lowerSpecialCase(MachineInstr &MI,
 }
 
 void SIFixSGPRCopies::analyzeVGPRToSGPRCopy(MachineInstr* MI) {
+  if (PHISources.contains(MI))
+    return;
   Register DstReg = MI->getOperand(0).getReg();
   const TargetRegisterClass *DstRC = MRI->getRegClass(DstReg);
 
@@ -966,9 +971,21 @@ void SIFixSGPRCopies::analyzeVGPRToSGPRCopy(MachineInstr* MI) {
     }
   } else if (Inst->getNumExplicitDefs() != 0) {
     Register Reg = Inst->getOperand(0).getReg();
-    if (TRI->isSGPRReg(*MRI, Reg) && !TII->isVALU(*Inst))
-      for (auto &U : MRI->use_instructions(Reg))
-        Users.push_back(&U);
+    if (TRI->isSGPRReg(*MRI, Reg) && !TII->isVALU(*Inst)) {
+      if (Reg.isVirtual()) {
+        for (auto &U : MRI->use_instructions(Reg))
+          Users.push_back(&U);
+      } else {
+        auto I = Inst->getIterator();
+        auto E = Inst->getParent()->end();
+        while (++I != E) {
+          if (I->readsRegister(Reg, TRI))
+            Users.push_back(&*I);
+          if (I->modifiesRegister(Reg, TRI))
+            break;
+        }
+      }
+    }
   }
   for (auto *U : Users) {
     if (TII->isSALU(*U))
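
The last hunk also changes how users of the defined register are collected: use_instructions(Reg) enumerates every use of Reg in the function, which is only meaningful as "users of this def" for virtual registers in SSA form. For a physical register the patch instead walks forward from the defining instruction, collecting readers until the register is written again. A rough standalone sketch of that forward scan, using invented stand-in types (Instr, collectPhysRegUsers) rather than the real MachineInstr/TargetRegisterInfo interfaces:

#include <algorithm>
#include <cstddef>
#include <vector>

// Stand-in for a machine instruction: plain lists of register ids it reads
// and writes. Real code would query MachineInstr::readsRegister /
// modifiesRegister with a TargetRegisterInfo.
struct Instr {
  std::vector<unsigned> Reads, Writes;
  bool reads(unsigned Reg) const {
    return std::find(Reads.begin(), Reads.end(), Reg) != Reads.end();
  }
  bool writes(unsigned Reg) const {
    return std::find(Writes.begin(), Writes.end(), Reg) != Writes.end();
  }
};

// Collect instructions after Block[DefIdx] that read physical register Reg,
// stopping at the first instruction that redefines it: later readers see a
// different value, so they are not users of this def.
std::vector<const Instr *> collectPhysRegUsers(const std::vector<Instr> &Block,
                                               std::size_t DefIdx,
                                               unsigned Reg) {
  std::vector<const Instr *> Users;
  for (std::size_t I = DefIdx + 1; I < Block.size(); ++I) {
    if (Block[I].reads(Reg))
      Users.push_back(&Block[I]);
    if (Block[I].writes(Reg))
      break;
  }
  return Users;
}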

llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.ll

Lines changed: 29 additions & 19 deletions
@@ -1114,19 +1114,23 @@ define amdgpu_kernel void @f64_to_v4i16(ptr addrspace(1) %out, ptr addrspace(1)
 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
 ; GCN-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0
 ; GCN-NEXT: s_mov_b32 s3, 0xf000
-; GCN-NEXT: s_mov_b32 s2, -1
 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
 ; GCN-NEXT: v_add_f64 v[0:1], s[4:5], 1.0
-; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v1
-; GCN-NEXT: v_add_i32_e32 v1, vcc, 2, v1
-; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
-; GCN-NEXT: v_add_i32_e32 v0, vcc, 2, v0
-; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GCN-NEXT: v_or_b32_e32 v1, v2, v1
-; GCN-NEXT: v_or_b32_e32 v0, v3, v0
-; GCN-NEXT: v_add_i32_e32 v1, vcc, 0x20000, v1
-; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x20000, v0
+; GCN-NEXT: v_readfirstlane_b32 s2, v0
+; GCN-NEXT: v_readfirstlane_b32 s4, v1
+; GCN-NEXT: s_and_b32 s5, s4, 0xffff0000
+; GCN-NEXT: s_add_i32 s4, s4, 2
+; GCN-NEXT: s_and_b32 s6, s2, 0xffff0000
+; GCN-NEXT: s_add_i32 s2, s2, 2
+; GCN-NEXT: s_and_b32 s4, s4, 0xffff
+; GCN-NEXT: s_and_b32 s2, s2, 0xffff
+; GCN-NEXT: s_or_b32 s4, s5, s4
+; GCN-NEXT: s_or_b32 s2, s6, s2
+; GCN-NEXT: s_add_i32 s4, s4, 0x20000
+; GCN-NEXT: s_add_i32 s5, s2, 0x20000
+; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: v_mov_b32_e32 v0, s5
+; GCN-NEXT: v_mov_b32_e32 v1, s4
 ; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; GCN-NEXT: s_endpgm
 ;
@@ -1139,14 +1143,20 @@ define amdgpu_kernel void @f64_to_v4i16(ptr addrspace(1) %out, ptr addrspace(1)
 ; VI-NEXT: v_mov_b32_e32 v3, s1
 ; VI-NEXT: s_waitcnt lgkmcnt(0)
 ; VI-NEXT: v_add_f64 v[0:1], s[2:3], 1.0
-; VI-NEXT: v_and_b32_e32 v4, 0xffff0000, v0
-; VI-NEXT: v_add_u32_e32 v0, vcc, 2, v0
-; VI-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
-; VI-NEXT: v_add_u32_e32 v1, vcc, 2, v1
-; VI-NEXT: v_or_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-; VI-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-; VI-NEXT: v_add_u32_e32 v1, vcc, 0x20000, v1
-; VI-NEXT: v_add_u32_e32 v0, vcc, 0x20000, v0
+; VI-NEXT: v_readfirstlane_b32 s0, v1
+; VI-NEXT: v_readfirstlane_b32 s1, v0
+; VI-NEXT: s_and_b32 s2, s1, 0xffff0000
+; VI-NEXT: s_add_i32 s1, s1, 2
+; VI-NEXT: s_and_b32 s3, s0, 0xffff0000
+; VI-NEXT: s_add_i32 s0, s0, 2
+; VI-NEXT: s_and_b32 s0, s0, 0xffff
+; VI-NEXT: s_and_b32 s1, s1, 0xffff
+; VI-NEXT: s_or_b32 s0, s3, s0
+; VI-NEXT: s_or_b32 s1, s2, s1
+; VI-NEXT: s_add_i32 s0, s0, 0x20000
+; VI-NEXT: s_add_i32 s1, s1, 0x20000
+; VI-NEXT: v_mov_b32_e32 v0, s1
+; VI-NEXT: v_mov_b32_e32 v1, s0
 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
 ; VI-NEXT: s_endpgm
 ;

llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll

Lines changed: 3 additions & 9 deletions
@@ -737,9 +737,7 @@ define double @optnone_atomicrmw_fadd_f64_expand(double %val) #1 {
 ; GFX908-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[4:5]
 ; GFX908-NEXT: s_mov_b64 s[4:5], -1
 ; GFX908-NEXT: s_mov_b32 s6, 1
-; GFX908-NEXT: v_readfirstlane_b32 s7, v2
-; GFX908-NEXT: s_cmp_lg_u32 s7, s6
-; GFX908-NEXT: s_cselect_b64 s[6:7], -1, 0
+; GFX908-NEXT: v_cmp_ne_u32_e64 s[6:7], v2, s6
 ; GFX908-NEXT: s_and_b64 vcc, exec, s[6:7]
 ; GFX908-NEXT: ; implicit-def: $vgpr3_vgpr4
 ; GFX908-NEXT: s_cbranch_vccnz .LBB5_2
@@ -808,9 +806,7 @@ define double @optnone_atomicrmw_fadd_f64_expand(double %val) #1 {
 ; GFX90A-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[4:5]
 ; GFX90A-NEXT: s_mov_b64 s[4:5], -1
 ; GFX90A-NEXT: s_mov_b32 s6, 1
-; GFX90A-NEXT: v_readfirstlane_b32 s7, v2
-; GFX90A-NEXT: s_cmp_lg_u32 s7, s6
-; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0
+; GFX90A-NEXT: v_cmp_ne_u32_e64 s[6:7], v2, s6
 ; GFX90A-NEXT: s_and_b64 vcc, exec, s[6:7]
 ; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
 ; GFX90A-NEXT: s_cbranch_vccnz .LBB5_2
@@ -877,9 +873,7 @@ define double @optnone_atomicrmw_fadd_f64_expand(double %val) #1 {
 ; GFX942-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1]
 ; GFX942-NEXT: s_mov_b64 s[0:1], -1
 ; GFX942-NEXT: s_mov_b32 s2, 1
-; GFX942-NEXT: v_readfirstlane_b32 s3, v2
-; GFX942-NEXT: s_cmp_lg_u32 s3, s2
-; GFX942-NEXT: s_cselect_b64 s[2:3], -1, 0
+; GFX942-NEXT: v_cmp_ne_u32_e64 s[2:3], v2, s2
 ; GFX942-NEXT: s_and_b64 vcc, exec, s[2:3]
 ; GFX942-NEXT: ; implicit-def: $vgpr2_vgpr3
 ; GFX942-NEXT: s_cbranch_vccnz .LBB5_2
Lines changed: 166 additions & 0 deletions
@@ -0,0 +1,166 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 < %s | FileCheck %s
+
+; SGPR phi ends up with VGPR inputs. Make sure we do not try to
+; process a copy which has already been erased (which was already
+; inserted by the pass).
+
+define double @issue130646(i64 %arg) {
+; CHECK-LABEL: issue130646:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: v_mov_b32_e32 v2, 0
+; CHECK-NEXT: v_mov_b32_e32 v3, 0
+; CHECK-NEXT: s_mov_b64 s[4:5], 0
+; CHECK-NEXT: s_branch .LBB0_2
+; CHECK-NEXT: .LBB0_1: ; %for.body.5
+; CHECK-NEXT: ; in Loop: Header=BB0_2 Depth=1
+; CHECK-NEXT: s_lshr_b64 s[6:7], s[4:5], 1
+; CHECK-NEXT: v_or_b32_e32 v3, s7, v3
+; CHECK-NEXT: v_or_b32_e32 v2, s6, v2
+; CHECK-NEXT: s_lshr_b64 s[6:7], s[4:5], 5
+; CHECK-NEXT: s_or_b32 s6, s6, 1
+; CHECK-NEXT: v_or3_b32 v3, v3, v1, s7
+; CHECK-NEXT: v_or3_b32 v2, v2, v0, s6
+; CHECK-NEXT: s_lshr_b64 s[4:5], s[4:5], 8
+; CHECK-NEXT: s_cbranch_execz .LBB0_4
+; CHECK-NEXT: .LBB0_2: ; %for.body
+; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: s_cmp_eq_u64 s[4:5], 0
+; CHECK-NEXT: v_readfirstlane_b32 s8, v0
+; CHECK-NEXT: v_readfirstlane_b32 s9, v1
+; CHECK-NEXT: s_cbranch_scc0 .LBB0_1
+; CHECK-NEXT: ; %bb.3:
+; CHECK-NEXT: ; implicit-def: $vgpr2_vgpr3
+; CHECK-NEXT: s_mov_b64 s[4:5], s[8:9]
+; CHECK-NEXT: .LBB0_4: ; %for.cond.cleanup
+; CHECK-NEXT: v_mov_b32_e32 v0, 0
+; CHECK-NEXT: v_mov_b32_e32 v1, 0
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+  br label %for.body
+
+for.cond.cleanup: ; preds = %for.body
+  %cmp3.not.i.i.i = icmp eq i64 %r.0108, 0
+  br i1 %cmp3.not.i.i.i, label %cleanup, label %if.end26.i.i
+
+if.end26.i.i: ; preds = %for.cond.cleanup
+  br label %cleanup
+
+for.body: ; preds = %for.body.5, %entry
+  %current_bit.01093 = phi i64 [ 0, %entry ], [ %shr.3.7, %for.body.5 ]
+  %r.0108 = phi i64 [ 0, %entry ], [ %shl28.3.7, %for.body.5 ]
+  %shr.3 = lshr i64 %current_bit.01093, 1
+  %i = or i64 %r.0108, %shr.3
+  %i3 = or i64 %i, %arg
+  %tobool27.not.3.4 = icmp ult i64 %current_bit.01093, 1
+  br i1 %tobool27.not.3.4, label %for.cond.cleanup, label %for.body.5
+
+for.body.5: ; preds = %for.body
+  %shr.3.4 = lshr i64 %current_bit.01093, 5
+  %i6 = or i64 %shr.3.4, 1
+  %shl28.3.7 = or i64 %i6, %i3
+  %shr.3.7 = lshr i64 %current_bit.01093, 8
+  br label %for.body
+
+cleanup: ; preds = %if.end26.i.i, %for.cond.cleanup
+  ret double 0.000000e+00
+}
+
+define amdgpu_cs void @issue130119(i1 %arg) {
+; CHECK-LABEL: issue130119:
+; CHECK: ; %bb.0: ; %bb
+; CHECK-NEXT: v_and_b32_e32 v0, 1, v0
+; CHECK-NEXT: v_cmp_eq_u32_e64 s[0:1], 1, v0
+; CHECK-NEXT: s_mov_b32 s16, 0
+; CHECK-NEXT: s_mov_b64 s[4:5], 0
+; CHECK-NEXT: s_branch .LBB1_2
+; CHECK-NEXT: .LBB1_1: ; %Flow2
+; CHECK-NEXT: ; in Loop: Header=BB1_2 Depth=1
+; CHECK-NEXT: s_or_b64 exec, exec, s[6:7]
+; CHECK-NEXT: s_and_b64 s[2:3], exec, s[2:3]
+; CHECK-NEXT: s_or_b64 s[4:5], s[2:3], s[4:5]
+; CHECK-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; CHECK-NEXT: s_cbranch_execz .LBB1_10
+; CHECK-NEXT: .LBB1_2: ; %bb1
+; CHECK-NEXT: ; =>This Loop Header: Depth=1
+; CHECK-NEXT: ; Child Loop BB1_4 Depth 2
+; CHECK-NEXT: s_and_b32 s2, s16, 1
+; CHECK-NEXT: s_cmp_eq_u32 s2, 0
+; CHECK-NEXT: s_cselect_b64 s[6:7], -1, 0
+; CHECK-NEXT: s_cmp_eq_u32 s2, 1
+; CHECK-NEXT: s_cselect_b64 s[2:3], -1, 0
+; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[2:3]
+; CHECK-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, v0
+; CHECK-NEXT: s_mov_b64 s[10:11], 0
+; CHECK-NEXT: ; implicit-def: $sgpr8_sgpr9
+; CHECK-NEXT: s_branch .LBB1_4
+; CHECK-NEXT: .LBB1_3: ; %Flow1
+; CHECK-NEXT: ; in Loop: Header=BB1_4 Depth=2
+; CHECK-NEXT: s_xor_b64 s[14:15], s[14:15], -1
+; CHECK-NEXT: s_and_b64 s[12:13], exec, s[12:13]
+; CHECK-NEXT: s_or_b64 s[10:11], s[12:13], s[10:11]
+; CHECK-NEXT: s_andn2_b64 s[8:9], s[8:9], exec
+; CHECK-NEXT: s_and_b64 s[12:13], s[14:15], exec
+; CHECK-NEXT: s_or_b64 s[8:9], s[8:9], s[12:13]
+; CHECK-NEXT: s_andn2_b64 exec, exec, s[10:11]
+; CHECK-NEXT: s_cbranch_execz .LBB1_8
+; CHECK-NEXT: .LBB1_4: ; %bb3
+; CHECK-NEXT: ; Parent Loop BB1_2 Depth=1
+; CHECK-NEXT: ; => This Inner Loop Header: Depth=2
+; CHECK-NEXT: s_and_b64 vcc, exec, s[2:3]
+; CHECK-NEXT: s_mov_b64 s[14:15], s[6:7]
+; CHECK-NEXT: s_cbranch_vccnz .LBB1_6
+; CHECK-NEXT: ; %bb.5: ; %bb7
+; CHECK-NEXT: ; in Loop: Header=BB1_4 Depth=2
+; CHECK-NEXT: s_mov_b64 s[14:15], -1
+; CHECK-NEXT: .LBB1_6: ; %Flow
+; CHECK-NEXT: ; in Loop: Header=BB1_4 Depth=2
+; CHECK-NEXT: s_mov_b64 s[12:13], -1
+; CHECK-NEXT: s_andn2_b64 vcc, exec, s[14:15]
+; CHECK-NEXT: s_mov_b64 s[14:15], -1
+; CHECK-NEXT: s_cbranch_vccnz .LBB1_3
+; CHECK-NEXT: ; %bb.7: ; %bb8
+; CHECK-NEXT: ; in Loop: Header=BB1_4 Depth=2
+; CHECK-NEXT: s_mov_b64 s[14:15], 0
+; CHECK-NEXT: s_orn2_b64 s[12:13], s[0:1], exec
+; CHECK-NEXT: s_branch .LBB1_3
+; CHECK-NEXT: .LBB1_8: ; %loop.exit.guard
+; CHECK-NEXT: ; in Loop: Header=BB1_2 Depth=1
+; CHECK-NEXT: s_or_b64 exec, exec, s[10:11]
+; CHECK-NEXT: s_mov_b64 s[2:3], -1
+; CHECK-NEXT: s_and_saveexec_b64 s[6:7], s[8:9]
+; CHECK-NEXT: s_xor_b64 s[6:7], exec, s[6:7]
+; CHECK-NEXT: s_cbranch_execz .LBB1_1
+; CHECK-NEXT: ; %bb.9: ; %bb10
+; CHECK-NEXT: ; in Loop: Header=BB1_2 Depth=1
+; CHECK-NEXT: s_or_b32 s16, s16, 1
+; CHECK-NEXT: s_xor_b64 s[2:3], exec, -1
+; CHECK-NEXT: s_branch .LBB1_1
+; CHECK-NEXT: .LBB1_10: ; %DummyReturnBlock
+; CHECK-NEXT: s_endpgm
+bb:
+  br label %bb1
+
+bb1: ; preds = %bb10, %bb
+  %i = phi i32 [ 0, %bb ], [ %i11, %bb10 ]
+  %i2 = phi i32 [ 0, %bb ], [ %i4, %bb10 ]
+  br label %bb3
+
+bb3: ; preds = %bb8, %bb1
+  %i4 = phi i32 [ %i2, %bb1 ], [ %i9, %bb8 ]
+  %i5 = and i32 %i, 1
+  %i6 = icmp eq i32 %i5, 0
+  br i1 %i6, label %bb8, label %bb7
+
+bb7: ; preds = %bb3
+  br label %bb8
+
+bb8: ; preds = %bb7, %bb3
+  %i9 = phi i32 [ %i2, %bb3 ], [ 0, %bb7 ]
+  br i1 %arg, label %bb10, label %bb3
+
+bb10: ; preds = %bb8
+  %i11 = or i32 %i, 1
+  br label %bb1
+}
