Skip to content

Commit 4383079

Browse files
committed
[AMDGPU] Remove dubious logic in bidirectional list scheduler
Summary:
pickNodeBidirectional tried to compare the best top candidate and the best bottom candidate by examining TopCand.Reason and BotCand.Reason. This is unsound because, after calling pickNodeFromQueue, Cand.Reason does not reflect the most important reason why Cand was chosen. Rather it reflects the most recent reason why it beat some other potential candidate, which could have been for some low priority tie breaker reason.

I have seen this cause problems where TopCand is a good candidate, but because TopCand.Reason is ORDER (which is very low priority) it is repeatedly ignored in favour of a mediocre BotCand. This is not how bidirectional scheduling is supposed to work.

To fix this I changed the code to always compare TopCand and BotCand directly, like the generic implementation of pickNodeBidirectional does. This removes some uncommented AMDGPU-specific logic; if this logic turns out to be important then perhaps it could be moved into an override of tryCandidate instead.

Graphics shader benchmarking on gfx10 shows a lot more positive than negative effects from this change.

Reviewers: arsenm, tstellar, rampitec, kzhuravl, vpykhtin, dstuttard, tpr, atrick, MatzeB

Subscribers: jvesely, wdng, nhaehnle, yaxunl, t-tye, hiraditya, llvm-commits

Tags: #llvm

Differential Revision: https://reviews.llvm.org/D68338
1 parent 0a2d415 commit 4383079

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

56 files changed

+4413
-4404
lines changed

llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp

Lines changed: 5 additions & 27 deletions
Original file line number | Diff line number | Diff line change
@@ -233,33 +233,11 @@ SUnit *GCNMaxOccupancySchedStrategy::pickNodeBidirectional(bool &IsTopNode) {
233233
// Pick best from BotCand and TopCand.
234234
LLVM_DEBUG(dbgs() << "Top Cand: "; traceCandidate(TopCand);
235235
dbgs() << "Bot Cand: "; traceCandidate(BotCand););
236-
SchedCandidate Cand;
237-
if (TopCand.Reason == BotCand.Reason) {
238-
Cand = BotCand;
239-
GenericSchedulerBase::CandReason TopReason = TopCand.Reason;
240-
TopCand.Reason = NoCand;
241-
GenericScheduler::tryCandidate(Cand, TopCand, nullptr);
242-
if (TopCand.Reason != NoCand) {
243-
Cand.setBest(TopCand);
244-
} else {
245-
TopCand.Reason = TopReason;
246-
}
247-
} else {
248-
if (TopCand.Reason == RegExcess && TopCand.RPDelta.Excess.getUnitInc() <= 0) {
249-
Cand = TopCand;
250-
} else if (BotCand.Reason == RegExcess && BotCand.RPDelta.Excess.getUnitInc() <= 0) {
251-
Cand = BotCand;
252-
} else if (TopCand.Reason == RegCritical && TopCand.RPDelta.CriticalMax.getUnitInc() <= 0) {
253-
Cand = TopCand;
254-
} else if (BotCand.Reason == RegCritical && BotCand.RPDelta.CriticalMax.getUnitInc() <= 0) {
255-
Cand = BotCand;
256-
} else {
257-
if (BotCand.Reason > TopCand.Reason) {
258-
Cand = TopCand;
259-
} else {
260-
Cand = BotCand;
261-
}
262-
}
236+
SchedCandidate Cand = BotCand;
237+
TopCand.Reason = NoCand;
238+
GenericScheduler::tryCandidate(Cand, TopCand, nullptr);
239+
if (TopCand.Reason != NoCand) {
240+
Cand.setBest(TopCand);
263241
}
264242
LLVM_DEBUG(dbgs() << "Picking: "; traceCandidate(Cand););
265243

llvm/test/CodeGen/AMDGPU/GlobalISel/bswap.ll

Lines changed: 12 additions & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -374,10 +374,10 @@ define i16 @v_bswap_i16(i16 %src) {
374374
; GFX7-LABEL: v_bswap_i16:
375375
; GFX7: ; %bb.0:
376376
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
377-
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v0
378-
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 8, v0
379-
; GFX7-NEXT: v_lshrrev_b32_e32 v1, 8, v1
380-
; GFX7-NEXT: v_or_b32_e32 v0, v1, v0
377+
; GFX7-NEXT: v_lshlrev_b32_e32 v1, 8, v0
378+
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
379+
; GFX7-NEXT: v_lshrrev_b32_e32 v0, 8, v0
380+
; GFX7-NEXT: v_or_b32_e32 v0, v0, v1
381381
; GFX7-NEXT: s_setpc_b64 s[30:31]
382382
;
383383
; GFX8-LABEL: v_bswap_i16:
@@ -440,10 +440,10 @@ define i32 @v_bswap_i16_zext_to_i32(i16 %src) {
440440
; GFX7-LABEL: v_bswap_i16_zext_to_i32:
441441
; GFX7: ; %bb.0:
442442
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
443-
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v0
444-
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 8, v0
445-
; GFX7-NEXT: v_lshrrev_b32_e32 v1, 8, v1
446-
; GFX7-NEXT: v_or_b32_e32 v0, v1, v0
443+
; GFX7-NEXT: v_lshlrev_b32_e32 v1, 8, v0
444+
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
445+
; GFX7-NEXT: v_lshrrev_b32_e32 v0, 8, v0
446+
; GFX7-NEXT: v_or_b32_e32 v0, v0, v1
447447
; GFX7-NEXT: v_bfe_u32 v0, v0, 0, 16
448448
; GFX7-NEXT: s_setpc_b64 s[30:31]
449449
;
@@ -469,10 +469,10 @@ define i32 @v_bswap_i16_sext_to_i32(i16 %src) {
469469
; GFX7-LABEL: v_bswap_i16_sext_to_i32:
470470
; GFX7: ; %bb.0:
471471
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
472-
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v0
473-
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 8, v0
474-
; GFX7-NEXT: v_lshrrev_b32_e32 v1, 8, v1
475-
; GFX7-NEXT: v_or_b32_e32 v0, v1, v0
472+
; GFX7-NEXT: v_lshlrev_b32_e32 v1, 8, v0
473+
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
474+
; GFX7-NEXT: v_lshrrev_b32_e32 v0, 8, v0
475+
; GFX7-NEXT: v_or_b32_e32 v0, v0, v1
476476
; GFX7-NEXT: v_bfe_i32 v0, v0, 0, 16
477477
; GFX7-NEXT: s_setpc_b64 s[30:31]
478478
;

llvm/test/CodeGen/AMDGPU/GlobalISel/fpow.ll

Lines changed: 47 additions & 47 deletions
Original file line number | Diff line number | Diff line change
@@ -126,21 +126,21 @@ define <2 x half> @v_pow_v2f16(<2 x half> %x, <2 x half> %y) {
126126
; GFX8-LABEL: v_pow_v2f16:
127127
; GFX8: ; %bb.0:
128128
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
129-
; GFX8-NEXT: v_log_f16_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
130-
; GFX8-NEXT: v_log_f16_e32 v0, v0
131-
; GFX8-NEXT: v_cvt_f32_f16_sdwa v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
132-
; GFX8-NEXT: v_cvt_f32_f16_e32 v1, v1
129+
; GFX8-NEXT: v_log_f16_e32 v2, v0
130+
; GFX8-NEXT: v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
131+
; GFX8-NEXT: v_cvt_f32_f16_e32 v3, v1
132+
; GFX8-NEXT: v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
133133
; GFX8-NEXT: v_cvt_f32_f16_e32 v2, v2
134134
; GFX8-NEXT: v_cvt_f32_f16_e32 v0, v0
135135
; GFX8-NEXT: v_mul_legacy_f32_e32 v2, v2, v3
136136
; GFX8-NEXT: v_mul_legacy_f32_e32 v0, v0, v1
137-
; GFX8-NEXT: v_cvt_f16_f32_e32 v1, v2
138137
; GFX8-NEXT: v_cvt_f16_f32_e32 v0, v0
138+
; GFX8-NEXT: v_cvt_f16_f32_e32 v1, v2
139139
; GFX8-NEXT: v_mov_b32_e32 v2, 16
140-
; GFX8-NEXT: v_exp_f16_e32 v1, v1
141140
; GFX8-NEXT: v_exp_f16_e32 v0, v0
142-
; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
143-
; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
141+
; GFX8-NEXT: v_exp_f16_e32 v1, v1
142+
; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
143+
; GFX8-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
144144
; GFX8-NEXT: s_setpc_b64 s[30:31]
145145
;
146146
; GFX9-LABEL: v_pow_v2f16:
@@ -154,11 +154,11 @@ define <2 x half> @v_pow_v2f16(<2 x half> %x, <2 x half> %y) {
154154
; GFX9-NEXT: v_cvt_f32_f16_e32 v0, v0
155155
; GFX9-NEXT: v_mul_legacy_f32_e32 v2, v2, v3
156156
; GFX9-NEXT: v_mul_legacy_f32_e32 v0, v0, v1
157-
; GFX9-NEXT: v_cvt_f16_f32_e32 v1, v2
158157
; GFX9-NEXT: v_cvt_f16_f32_e32 v0, v0
159-
; GFX9-NEXT: v_mov_b32_e32 v2, 0xffff
160-
; GFX9-NEXT: v_exp_f16_e32 v1, v1
158+
; GFX9-NEXT: v_cvt_f16_f32_e32 v2, v2
161159
; GFX9-NEXT: v_exp_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
160+
; GFX9-NEXT: v_exp_f16_e32 v1, v2
161+
; GFX9-NEXT: v_mov_b32_e32 v2, 0xffff
162162
; GFX9-NEXT: v_and_or_b32 v0, v1, v2, v0
163163
; GFX9-NEXT: s_setpc_b64 s[30:31]
164164
%pow = call <2 x half> @llvm.pow.v2f16(<2 x half> %x, <2 x half> %y)
@@ -173,40 +173,40 @@ define <2 x half> @v_pow_v2f16_fneg_lhs(<2 x half> %x, <2 x half> %y) {
173173
; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0
174174
; GFX6-NEXT: v_or_b32_e32 v0, v1, v0
175175
; GFX6-NEXT: v_xor_b32_e32 v0, 0x80008000, v0
176-
; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v0
176+
; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v0
177+
; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0
177178
; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
178-
; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1
179179
; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2
180+
; GFX6-NEXT: v_log_f32_e32 v1, v1
180181
; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3
181182
; GFX6-NEXT: v_log_f32_e32 v0, v0
182-
; GFX6-NEXT: v_log_f32_e32 v1, v1
183-
; GFX6-NEXT: v_mul_legacy_f32_e32 v0, v0, v2
184-
; GFX6-NEXT: v_mul_legacy_f32_e32 v1, v1, v3
185-
; GFX6-NEXT: v_exp_f32_e32 v0, v0
183+
; GFX6-NEXT: v_mul_legacy_f32_e32 v1, v1, v2
186184
; GFX6-NEXT: v_exp_f32_e32 v1, v1
187-
; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
188-
; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1
185+
; GFX6-NEXT: v_mul_legacy_f32_e32 v0, v0, v3
186+
; GFX6-NEXT: v_exp_f32_e32 v2, v0
187+
; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v1
188+
; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v2
189189
; GFX6-NEXT: s_setpc_b64 s[30:31]
190190
;
191191
; GFX8-LABEL: v_pow_v2f16_fneg_lhs:
192192
; GFX8: ; %bb.0:
193193
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
194194
; GFX8-NEXT: v_xor_b32_e32 v0, 0x80008000, v0
195-
; GFX8-NEXT: v_log_f16_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
196-
; GFX8-NEXT: v_log_f16_e32 v0, v0
197-
; GFX8-NEXT: v_cvt_f32_f16_sdwa v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
198-
; GFX8-NEXT: v_cvt_f32_f16_e32 v1, v1
195+
; GFX8-NEXT: v_log_f16_e32 v2, v0
196+
; GFX8-NEXT: v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
197+
; GFX8-NEXT: v_cvt_f32_f16_e32 v3, v1
198+
; GFX8-NEXT: v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
199199
; GFX8-NEXT: v_cvt_f32_f16_e32 v2, v2
200200
; GFX8-NEXT: v_cvt_f32_f16_e32 v0, v0
201201
; GFX8-NEXT: v_mul_legacy_f32_e32 v2, v2, v3
202202
; GFX8-NEXT: v_mul_legacy_f32_e32 v0, v0, v1
203-
; GFX8-NEXT: v_cvt_f16_f32_e32 v1, v2
204203
; GFX8-NEXT: v_cvt_f16_f32_e32 v0, v0
204+
; GFX8-NEXT: v_cvt_f16_f32_e32 v1, v2
205205
; GFX8-NEXT: v_mov_b32_e32 v2, 16
206-
; GFX8-NEXT: v_exp_f16_e32 v1, v1
207206
; GFX8-NEXT: v_exp_f16_e32 v0, v0
208-
; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
209-
; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
207+
; GFX8-NEXT: v_exp_f16_e32 v1, v1
208+
; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
209+
; GFX8-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
210210
; GFX8-NEXT: s_setpc_b64 s[30:31]
211211
;
212212
; GFX9-LABEL: v_pow_v2f16_fneg_lhs:
@@ -259,22 +259,22 @@ define <2 x half> @v_pow_v2f16_fneg_rhs(<2 x half> %x, <2 x half> %y) {
259259
; GFX8-LABEL: v_pow_v2f16_fneg_rhs:
260260
; GFX8: ; %bb.0:
261261
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
262-
; GFX8-NEXT: v_log_f16_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
263-
; GFX8-NEXT: v_log_f16_e32 v0, v0
262+
; GFX8-NEXT: v_log_f16_e32 v2, v0
263+
; GFX8-NEXT: v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
264264
; GFX8-NEXT: v_xor_b32_e32 v1, 0x80008000, v1
265-
; GFX8-NEXT: v_cvt_f32_f16_sdwa v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
266-
; GFX8-NEXT: v_cvt_f32_f16_e32 v2, v2
265+
; GFX8-NEXT: v_cvt_f32_f16_e32 v3, v1
266+
; GFX8-NEXT: v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
267267
; GFX8-NEXT: v_cvt_f32_f16_e32 v0, v0
268-
; GFX8-NEXT: v_cvt_f32_f16_e32 v1, v1
269-
; GFX8-NEXT: v_mul_legacy_f32_e32 v2, v2, v3
270-
; GFX8-NEXT: v_cvt_f16_f32_e32 v2, v2
268+
; GFX8-NEXT: v_cvt_f32_f16_e32 v2, v2
271269
; GFX8-NEXT: v_mul_legacy_f32_e32 v0, v0, v1
272270
; GFX8-NEXT: v_cvt_f16_f32_e32 v0, v0
273-
; GFX8-NEXT: v_exp_f16_e32 v1, v2
271+
; GFX8-NEXT: v_mul_legacy_f32_e32 v2, v2, v3
272+
; GFX8-NEXT: v_cvt_f16_f32_e32 v1, v2
274273
; GFX8-NEXT: v_mov_b32_e32 v2, 16
275274
; GFX8-NEXT: v_exp_f16_e32 v0, v0
276-
; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
277-
; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
275+
; GFX8-NEXT: v_exp_f16_e32 v1, v1
276+
; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
277+
; GFX8-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
278278
; GFX8-NEXT: s_setpc_b64 s[30:31]
279279
;
280280
; GFX9-LABEL: v_pow_v2f16_fneg_rhs:
@@ -336,22 +336,22 @@ define <2 x half> @v_pow_v2f16_fneg_lhs_rhs(<2 x half> %x, <2 x half> %y) {
336336
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
337337
; GFX8-NEXT: s_mov_b32 s4, 0x80008000
338338
; GFX8-NEXT: v_xor_b32_e32 v0, s4, v0
339-
; GFX8-NEXT: v_log_f16_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
340-
; GFX8-NEXT: v_log_f16_e32 v0, v0
339+
; GFX8-NEXT: v_log_f16_e32 v2, v0
340+
; GFX8-NEXT: v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
341341
; GFX8-NEXT: v_xor_b32_e32 v1, s4, v1
342-
; GFX8-NEXT: v_cvt_f32_f16_sdwa v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
343-
; GFX8-NEXT: v_cvt_f32_f16_e32 v2, v2
342+
; GFX8-NEXT: v_cvt_f32_f16_e32 v3, v1
343+
; GFX8-NEXT: v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
344344
; GFX8-NEXT: v_cvt_f32_f16_e32 v0, v0
345-
; GFX8-NEXT: v_cvt_f32_f16_e32 v1, v1
346-
; GFX8-NEXT: v_mul_legacy_f32_e32 v2, v2, v3
347-
; GFX8-NEXT: v_cvt_f16_f32_e32 v2, v2
345+
; GFX8-NEXT: v_cvt_f32_f16_e32 v2, v2
348346
; GFX8-NEXT: v_mul_legacy_f32_e32 v0, v0, v1
349347
; GFX8-NEXT: v_cvt_f16_f32_e32 v0, v0
350-
; GFX8-NEXT: v_exp_f16_e32 v1, v2
348+
; GFX8-NEXT: v_mul_legacy_f32_e32 v2, v2, v3
349+
; GFX8-NEXT: v_cvt_f16_f32_e32 v1, v2
351350
; GFX8-NEXT: v_mov_b32_e32 v2, 16
352351
; GFX8-NEXT: v_exp_f16_e32 v0, v0
353-
; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
354-
; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
352+
; GFX8-NEXT: v_exp_f16_e32 v1, v1
353+
; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
354+
; GFX8-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
355355
; GFX8-NEXT: s_setpc_b64 s[30:31]
356356
;
357357
; GFX9-LABEL: v_pow_v2f16_fneg_lhs_rhs:

llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll

Lines changed: 41 additions & 28 deletions
Original file line number | Diff line number | Diff line change
@@ -233,34 +233,41 @@ define amdgpu_ps <8 x float> @dyn_insertelement_v8f32_s_s_v(<8 x float> inreg %v
233233
; MOVREL-NEXT: s_mov_b32 s4, s6
234234
; MOVREL-NEXT: s_mov_b32 s6, s8
235235
; MOVREL-NEXT: v_mov_b32_e32 v16, s7
236-
; MOVREL-NEXT: v_mov_b32_e32 v8, v0
237236
; MOVREL-NEXT: v_mov_b32_e32 v14, s5
237+
; MOVREL-NEXT: v_mov_b32_e32 v12, s3
238238
; MOVREL-NEXT: v_mov_b32_e32 v13, s4
239239
; MOVREL-NEXT: v_mov_b32_e32 v15, s6
240-
; MOVREL-NEXT: v_mov_b32_e32 v12, s3
241240
; MOVREL-NEXT: v_mov_b32_e32 v11, s2
242241
; MOVREL-NEXT: v_mov_b32_e32 v10, s1
243242
; MOVREL-NEXT: v_mov_b32_e32 v9, s0
244243
; MOVREL-NEXT: s_mov_b32 s0, exec_lo
245244
; MOVREL-NEXT: ; implicit-def: $vcc_hi
246245
; MOVREL-NEXT: BB3_1: ; =>This Inner Loop Header: Depth=1
247-
; MOVREL-NEXT: v_readfirstlane_b32 s1, v8
248-
; MOVREL-NEXT: v_mov_b32_e32 v0, v9
249-
; MOVREL-NEXT: v_mov_b32_e32 v1, v10
250-
; MOVREL-NEXT: v_mov_b32_e32 v2, v11
251-
; MOVREL-NEXT: v_mov_b32_e32 v3, v12
252-
; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, s1, v8
246+
; MOVREL-NEXT: v_readfirstlane_b32 s1, v0
247+
; MOVREL-NEXT: v_mov_b32_e32 v1, v9
248+
; MOVREL-NEXT: v_mov_b32_e32 v2, v10
249+
; MOVREL-NEXT: v_mov_b32_e32 v3, v11
250+
; MOVREL-NEXT: v_mov_b32_e32 v4, v12
251+
; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, s1, v0
253252
; MOVREL-NEXT: s_mov_b32 m0, s1
254-
; MOVREL-NEXT: v_mov_b32_e32 v4, v13
255-
; MOVREL-NEXT: v_mov_b32_e32 v5, v14
256-
; MOVREL-NEXT: v_mov_b32_e32 v6, v15
257-
; MOVREL-NEXT: v_mov_b32_e32 v7, v16
258-
; MOVREL-NEXT: v_movreld_b32_e32 v0, s10
253+
; MOVREL-NEXT: v_mov_b32_e32 v5, v13
254+
; MOVREL-NEXT: v_mov_b32_e32 v6, v14
255+
; MOVREL-NEXT: v_mov_b32_e32 v7, v15
256+
; MOVREL-NEXT: v_mov_b32_e32 v8, v16
257+
; MOVREL-NEXT: v_movreld_b32_e32 v1, s10
259258
; MOVREL-NEXT: s_and_saveexec_b32 vcc_lo, vcc_lo
260259
; MOVREL-NEXT: s_xor_b32 exec_lo, exec_lo, vcc_lo
261260
; MOVREL-NEXT: s_cbranch_execnz BB3_1
262261
; MOVREL-NEXT: ; %bb.2:
263262
; MOVREL-NEXT: s_mov_b32 exec_lo, s0
263+
; MOVREL-NEXT: v_mov_b32_e32 v0, v1
264+
; MOVREL-NEXT: v_mov_b32_e32 v1, v2
265+
; MOVREL-NEXT: v_mov_b32_e32 v2, v3
266+
; MOVREL-NEXT: v_mov_b32_e32 v3, v4
267+
; MOVREL-NEXT: v_mov_b32_e32 v4, v5
268+
; MOVREL-NEXT: v_mov_b32_e32 v5, v6
269+
; MOVREL-NEXT: v_mov_b32_e32 v6, v7
270+
; MOVREL-NEXT: v_mov_b32_e32 v7, v8
264271
; MOVREL-NEXT: ; return to shader part epilog
265272
entry:
266273
%insert = insertelement <8 x float> %vec, float %val, i32 %idx
@@ -393,35 +400,41 @@ define amdgpu_ps <8 x float> @dyn_insertelement_v8f32_s_v_v(<8 x float> inreg %v
393400
; MOVREL-NEXT: s_mov_b32 s4, s6
394401
; MOVREL-NEXT: s_mov_b32 s6, s8
395402
; MOVREL-NEXT: v_mov_b32_e32 v17, s7
396-
; MOVREL-NEXT: v_mov_b32_e32 v8, v0
397-
; MOVREL-NEXT: v_mov_b32_e32 v9, v1
398403
; MOVREL-NEXT: v_mov_b32_e32 v15, s5
399-
; MOVREL-NEXT: v_mov_b32_e32 v16, s6
400-
; MOVREL-NEXT: v_mov_b32_e32 v14, s4
401404
; MOVREL-NEXT: v_mov_b32_e32 v13, s3
405+
; MOVREL-NEXT: v_mov_b32_e32 v14, s4
406+
; MOVREL-NEXT: v_mov_b32_e32 v16, s6
402407
; MOVREL-NEXT: v_mov_b32_e32 v12, s2
403408
; MOVREL-NEXT: v_mov_b32_e32 v11, s1
404409
; MOVREL-NEXT: v_mov_b32_e32 v10, s0
405410
; MOVREL-NEXT: s_mov_b32 s0, exec_lo
406411
; MOVREL-NEXT: ; implicit-def: $vcc_hi
407412
; MOVREL-NEXT: BB6_1: ; =>This Inner Loop Header: Depth=1
408-
; MOVREL-NEXT: v_readfirstlane_b32 s1, v9
409-
; MOVREL-NEXT: v_mov_b32_e32 v0, v10
410-
; MOVREL-NEXT: v_mov_b32_e32 v1, v11
411-
; MOVREL-NEXT: v_mov_b32_e32 v2, v12
412-
; MOVREL-NEXT: v_mov_b32_e32 v3, v13
413-
; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, s1, v9
413+
; MOVREL-NEXT: v_readfirstlane_b32 s1, v1
414+
; MOVREL-NEXT: v_mov_b32_e32 v2, v10
415+
; MOVREL-NEXT: v_mov_b32_e32 v3, v11
416+
; MOVREL-NEXT: v_mov_b32_e32 v4, v12
417+
; MOVREL-NEXT: v_mov_b32_e32 v5, v13
418+
; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, s1, v1
414419
; MOVREL-NEXT: s_mov_b32 m0, s1
415-
; MOVREL-NEXT: v_mov_b32_e32 v4, v14
416-
; MOVREL-NEXT: v_mov_b32_e32 v5, v15
417-
; MOVREL-NEXT: v_mov_b32_e32 v6, v16
418-
; MOVREL-NEXT: v_mov_b32_e32 v7, v17
419-
; MOVREL-NEXT: v_movreld_b32_e32 v0, v8
420+
; MOVREL-NEXT: v_mov_b32_e32 v6, v14
421+
; MOVREL-NEXT: v_mov_b32_e32 v7, v15
422+
; MOVREL-NEXT: v_mov_b32_e32 v8, v16
423+
; MOVREL-NEXT: v_mov_b32_e32 v9, v17
424+
; MOVREL-NEXT: v_movreld_b32_e32 v2, v0
420425
; MOVREL-NEXT: s_and_saveexec_b32 vcc_lo, vcc_lo
421426
; MOVREL-NEXT: s_xor_b32 exec_lo, exec_lo, vcc_lo
422427
; MOVREL-NEXT: s_cbranch_execnz BB6_1
423428
; MOVREL-NEXT: ; %bb.2:
424429
; MOVREL-NEXT: s_mov_b32 exec_lo, s0
430+
; MOVREL-NEXT: v_mov_b32_e32 v0, v2
431+
; MOVREL-NEXT: v_mov_b32_e32 v1, v3
432+
; MOVREL-NEXT: v_mov_b32_e32 v2, v4
433+
; MOVREL-NEXT: v_mov_b32_e32 v3, v5
434+
; MOVREL-NEXT: v_mov_b32_e32 v4, v6
435+
; MOVREL-NEXT: v_mov_b32_e32 v5, v7
436+
; MOVREL-NEXT: v_mov_b32_e32 v6, v8
437+
; MOVREL-NEXT: v_mov_b32_e32 v7, v9
425438
; MOVREL-NEXT: ; return to shader part epilog
426439
entry:
427440
%insert = insertelement <8 x float> %vec, float %val, i32 %idx

llvm/test/CodeGen/AMDGPU/GlobalISel/lds-global-value.ll

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -10,9 +10,9 @@ define amdgpu_kernel void @use_lds_globals(i32 addrspace(1)* %out, i32 addrspace
1010
; CHECK: ; %bb.0: ; %entry
1111
; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
1212
; CHECK-NEXT: s_add_u32 s2, 4, 4
13-
; CHECK-NEXT: v_mov_b32_e32 v2, s2
13+
; CHECK-NEXT: v_mov_b32_e32 v0, s2
1414
; CHECK-NEXT: s_mov_b32 m0, -1
15-
; CHECK-NEXT: ds_read_b32 v2, v2
15+
; CHECK-NEXT: ds_read_b32 v2, v0
1616
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
1717
; CHECK-NEXT: s_add_u32 s0, s0, 4
1818
; CHECK-NEXT: s_addc_u32 s1, s1, 0

0 commit comments

Comments
 (0)