Skip to content

Commit bbbe8ec

Browse files
authored
[GlobalISel][Localizer] Allow localization of a small number of repeated phi uses. (llvm#77566)
We previously had a heuristic that if a value V was used multiple times in a single PHI, then we bailed out to avoid potentially rematerializing into many predecessors. The phi uses only counted as a single use in the shouldLocalize() hook because it counted the PHI as a single instruction use, not factoring in that it may have many incoming edges.

It turns out this heuristic is slightly too pessimistic, and allowing a small number of these uses to be localized can improve code size due to shortening live ranges, especially if those ranges span a call.

This change results in some improvements in size on CTMark -Os:
```
Program                              size.__text
                                     before      after       diff
kimwitu++/kc                         451676.00   451860.00    0.0%
mafft/pairlocalalign                 241460.00   241540.00    0.0%
tramp3d-v4/tramp3d-v4                389216.00   389208.00   -0.0%
7zip/7zip-benchmark                  587528.00   587464.00   -0.0%
Bullet/bullet                        457424.00   457348.00   -0.0%
consumer-typeset/consumer-typeset    405472.00   405376.00   -0.0%
SPASS/SPASS                          410288.00   410120.00   -0.0%
lencod/lencod                        426396.00   426108.00   -0.1%
ClamAV/clamscan                      380108.00   379756.00   -0.1%
sqlite3/sqlite3                      283664.00   283372.00   -0.1%
Geomean difference                                           -0.0%
```

I experimented with different variations and thresholds. Using 3 instead of 2 resulted in a further 0.1% improvement on ClamAV but also regressed sqlite3 by the same percentage.
1 parent e4e0b65 commit bbbe8ec

File tree

12 files changed

+142
-78
lines changed

12 files changed

+142
-78
lines changed

llvm/include/llvm/CodeGen/GlobalISel/GenericMachineInstrs.h

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -558,6 +558,24 @@ class GVecReduce : public GenericMachineInstr {
558558
}
559559
};
560560

561+
/// Represents a G_PHI. Operand 0 is skipped by the accessors below; the remaining operands are read as (incoming vreg, incoming block) pairs.
562+
class GPhi : public GenericMachineInstr {
563+
public:
564+
/// Returns the number of incoming values, i.e. the number of (vreg, block) operand pairs after operand 0.
565+
unsigned getNumIncomingValues() const { return (getNumOperands() - 1) / 2; }
566+
/// Returns the I'th incoming vreg (operand 2*I + 1). Const so it is usable through a const GPhi, like getNumIncomingValues().
567+
Register getIncomingValue(unsigned I) const {
568+
return getOperand(I * 2 + 1).getReg();
569+
}
570+
/// Returns the I'th incoming basic block (operand 2*I + 2). Const so it is usable through a const GPhi.
571+
MachineBasicBlock *getIncomingBlock(unsigned I) const {
572+
return getOperand(I * 2 + 2).getMBB();
573+
}
574+
575+
static bool classof(const MachineInstr *MI) {
576+
return MI->getOpcode() == TargetOpcode::G_PHI;
577+
}
578+
};
561579

562580
} // namespace llvm
563581

llvm/include/llvm/CodeGen/GlobalISel/Localizer.h

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -67,10 +67,9 @@ class Localizer : public MachineFunctionPass {
6767

6868
typedef SmallSetVector<MachineInstr *, 32> LocalizedSetVecT;
6969

70-
/// If \p Op is a phi operand and not unique in that phi, that is,
71-
/// there are other operands in the phi with the same register,
72-
/// return true.
73-
bool isNonUniquePhiValue(MachineOperand &Op) const;
70+
/// If \p Op is a reg operand of a PHI, return the number of total
71+
/// operands in the PHI that are the same as \p Op, including itself.
72+
unsigned getNumPhiUses(MachineOperand &Op) const;
7473

7574
/// Do inter-block localization from the entry block.
7675
bool localizeInterBlock(MachineFunction &MF,

llvm/lib/CodeGen/GlobalISel/Localizer.cpp

Lines changed: 30 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
#include "llvm/ADT/DenseMap.h"
1414
#include "llvm/ADT/STLExtras.h"
1515
#include "llvm/Analysis/TargetTransformInfo.h"
16+
#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
1617
#include "llvm/CodeGen/GlobalISel/Utils.h"
1718
#include "llvm/CodeGen/MachineRegisterInfo.h"
1819
#include "llvm/CodeGen/TargetLowering.h"
@@ -58,18 +59,18 @@ bool Localizer::isLocalUse(MachineOperand &MOUse, const MachineInstr &Def,
5859
return InsertMBB == Def.getParent();
5960
}
6061

61-
bool Localizer::isNonUniquePhiValue(MachineOperand &Op) const {
62-
MachineInstr *MI = Op.getParent();
63-
if (!MI->isPHI())
64-
return false;
62+
unsigned Localizer::getNumPhiUses(MachineOperand &Op) const {
63+
auto *MI = dyn_cast<GPhi>(&*Op.getParent());
64+
if (!MI)
65+
return 0;
6566

6667
Register SrcReg = Op.getReg();
67-
for (unsigned Idx = 1; Idx < MI->getNumOperands(); Idx += 2) {
68-
auto &MO = MI->getOperand(Idx);
69-
if (&MO != &Op && MO.isReg() && MO.getReg() == SrcReg)
70-
return true;
68+
unsigned NumUses = 0;
69+
for (unsigned I = 0, NumVals = MI->getNumIncomingValues(); I < NumVals; ++I) {
70+
if (MI->getIncomingValue(I) == SrcReg)
71+
++NumUses;
7172
}
72-
return false;
73+
return NumUses;
7374
}
7475

7576
bool Localizer::localizeInterBlock(MachineFunction &MF,
@@ -108,11 +109,12 @@ bool Localizer::localizeInterBlock(MachineFunction &MF,
108109
continue;
109110
}
110111

111-
// If the use is a phi operand that's not unique, don't try to localize.
112-
// If we do, we can cause unnecessary instruction bloat by duplicating
113-
// into each predecessor block, when the existing one is sufficient and
114-
// allows for easier optimization later.
115-
if (isNonUniquePhiValue(MOUse))
112+
// PHIs look like a single user but can use the same register in multiple
113+
// edges, causing remat into each predecessor. Allow this to a certain
114+
// extent.
115+
unsigned NumPhiUses = getNumPhiUses(MOUse);
116+
const unsigned PhiThreshold = 2; // FIXME: Tune this more.
117+
if (NumPhiUses > PhiThreshold)
116118
continue;
117119

118120
LLVM_DEBUG(dbgs() << "Fixing non-local use\n");
@@ -164,19 +166,22 @@ bool Localizer::localizeIntraBlock(LocalizedSetVecT &LocalizedInstrs) {
164166
if (!UseMI.isPHI())
165167
Users.insert(&UseMI);
166168
}
167-
// If all the users were PHIs then they're not going to be in our block,
168-
// don't try to move this instruction.
169-
if (Users.empty())
170-
continue;
171-
172169
MachineBasicBlock::iterator II(MI);
173-
++II;
174-
while (II != MBB.end() && !Users.count(&*II))
170+
// If all the users were PHIs then they're not going to be in our block, but we
171+
// may still benefit from sinking, especially since the value might be live
172+
// across a call.
173+
if (Users.empty()) {
174+
// Make sure we don't sink in between two terminator sequences by scanning
175+
// forward, not backward.
176+
II = MBB.getFirstTerminatorForward();
177+
LLVM_DEBUG(dbgs() << "Only phi users: moving inst to end: " << *MI);
178+
} else {
175179
++II;
176-
177-
assert(II != MBB.end() && "Didn't find the user in the MBB");
178-
LLVM_DEBUG(dbgs() << "Intra-block: moving " << *MI << " before " << *II
179-
<< '\n');
180+
while (II != MBB.end() && !Users.count(&*II))
181+
++II;
182+
assert(II != MBB.end() && "Didn't find the user in the MBB");
183+
LLVM_DEBUG(dbgs() << "Intra-block: moving " << *MI << " before " << *II);
184+
}
180185

181186
MI->removeFromParent();
182187
MBB.insert(II, MI);

llvm/test/CodeGen/AArch64/GlobalISel/invoke-region.ll

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -12,10 +12,10 @@ define i1 @test_lpad_phi_widen_into_pred() personality ptr @__gxx_personality_v0
1212
; CHECK: bb.1 (%ir-block.0):
1313
; CHECK-NEXT: successors: %bb.3(0x40000000), %bb.2(0x40000000)
1414
; CHECK-NEXT: {{ $}}
15-
; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 11
1615
; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @global_var
17-
; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 42
18-
; CHECK-NEXT: G_STORE [[C1]](s32), [[GV]](p0) :: (store (s32) into @global_var)
16+
; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 42
17+
; CHECK-NEXT: G_STORE [[C]](s32), [[GV]](p0) :: (store (s32) into @global_var)
18+
; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 11
1919
; CHECK-NEXT: [[C2:%[0-9]+]]:_(s16) = G_CONSTANT i16 1
2020
; CHECK-NEXT: G_INVOKE_REGION_START
2121
; CHECK-NEXT: EH_LABEL <mcsymbol >
@@ -29,7 +29,7 @@ define i1 @test_lpad_phi_widen_into_pred() personality ptr @__gxx_personality_v0
2929
; CHECK-NEXT: successors: %bb.3(0x80000000)
3030
; CHECK-NEXT: liveins: $x0, $x1
3131
; CHECK-NEXT: {{ $}}
32-
; CHECK-NEXT: [[PHI:%[0-9]+]]:_(s32) = G_PHI [[C]](s32), %bb.1
32+
; CHECK-NEXT: [[PHI:%[0-9]+]]:_(s32) = G_PHI [[C1]](s32), %bb.1
3333
; CHECK-NEXT: EH_LABEL <mcsymbol >
3434
; CHECK-NEXT: [[GV1:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @global_var
3535
; CHECK-NEXT: G_STORE [[PHI]](s32), [[GV1]](p0) :: (store (s32) into @global_var)
@@ -67,12 +67,12 @@ define i1 @test_lpad_phi_widen_into_pred_ext(ptr %ptr) personality ptr @__gxx_pe
6767
; CHECK-NEXT: liveins: $x0
6868
; CHECK-NEXT: {{ $}}
6969
; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x0
70-
; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 11
7170
; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @global_var
72-
; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 42
73-
; CHECK-NEXT: G_STORE [[C1]](s32), [[GV]](p0) :: (store (s32) into @global_var)
71+
; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 42
72+
; CHECK-NEXT: G_STORE [[C]](s32), [[GV]](p0) :: (store (s32) into @global_var)
7473
; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(s8) = G_LOAD [[COPY]](p0) :: (load (s8) from %ir.ptr)
7574
; CHECK-NEXT: [[ASSERT_ZEXT:%[0-9]+]]:_(s8) = G_ASSERT_ZEXT [[LOAD]], 1
75+
; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 11
7676
; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s16) = G_ANYEXT [[ASSERT_ZEXT]](s8)
7777
; CHECK-NEXT: G_INVOKE_REGION_START
7878
; CHECK-NEXT: EH_LABEL <mcsymbol >
@@ -86,7 +86,7 @@ define i1 @test_lpad_phi_widen_into_pred_ext(ptr %ptr) personality ptr @__gxx_pe
8686
; CHECK-NEXT: successors: %bb.3(0x80000000)
8787
; CHECK-NEXT: liveins: $x0, $x1
8888
; CHECK-NEXT: {{ $}}
89-
; CHECK-NEXT: [[PHI:%[0-9]+]]:_(s32) = G_PHI [[C]](s32), %bb.1
89+
; CHECK-NEXT: [[PHI:%[0-9]+]]:_(s32) = G_PHI [[C1]](s32), %bb.1
9090
; CHECK-NEXT: EH_LABEL <mcsymbol >
9191
; CHECK-NEXT: [[GV1:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @global_var
9292
; CHECK-NEXT: G_STORE [[PHI]](s32), [[GV1]](p0) :: (store (s32) into @global_var)

llvm/test/CodeGen/AArch64/GlobalISel/irtranslator-hoisted-constants.ll

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -46,16 +46,16 @@ define i32 @test(i32 %a, i1 %c) {
4646
; PRESELECTION-NEXT: {{ $}}
4747
; PRESELECTION-NEXT: [[COPY:%[0-9]+]]:gpr(s32) = COPY $w0
4848
; PRESELECTION-NEXT: [[COPY1:%[0-9]+]]:gpr(s32) = COPY $w1
49-
; PRESELECTION-NEXT: [[C:%[0-9]+]]:gpr(s32) = G_CONSTANT i32 0
50-
; PRESELECTION-NEXT: [[C1:%[0-9]+]]:gpr(s32) = G_CONSTANT i32 100000
51-
; PRESELECTION-NEXT: [[CONSTANT_FOLD_BARRIER:%[0-9]+]]:gpr(s32) = G_CONSTANT_FOLD_BARRIER [[C1]]
49+
; PRESELECTION-NEXT: [[C:%[0-9]+]]:gpr(s32) = G_CONSTANT i32 100000
50+
; PRESELECTION-NEXT: [[CONSTANT_FOLD_BARRIER:%[0-9]+]]:gpr(s32) = G_CONSTANT_FOLD_BARRIER [[C]]
51+
; PRESELECTION-NEXT: [[C1:%[0-9]+]]:gpr(s32) = G_CONSTANT i32 0
5252
; PRESELECTION-NEXT: [[C2:%[0-9]+]]:gpr(s32) = G_CONSTANT i32 1
5353
; PRESELECTION-NEXT: [[AND:%[0-9]+]]:gpr(s32) = G_AND [[COPY1]], [[C2]]
5454
; PRESELECTION-NEXT: G_BRCOND [[AND]](s32), %bb.3
5555
; PRESELECTION-NEXT: G_BR %bb.2
5656
; PRESELECTION-NEXT: {{ $}}
5757
; PRESELECTION-NEXT: bb.2.common.ret:
58-
; PRESELECTION-NEXT: [[PHI:%[0-9]+]]:gpr(s32) = G_PHI %7(s32), %bb.3, [[C]](s32), %bb.1
58+
; PRESELECTION-NEXT: [[PHI:%[0-9]+]]:gpr(s32) = G_PHI %7(s32), %bb.3, [[C1]](s32), %bb.1
5959
; PRESELECTION-NEXT: $w0 = COPY [[PHI]](s32)
6060
; PRESELECTION-NEXT: RET_ReallyLR implicit $w0
6161
; PRESELECTION-NEXT: {{ $}}
@@ -75,8 +75,8 @@ define i32 @test(i32 %a, i1 %c) {
7575
; POSTSELECTION-NEXT: {{ $}}
7676
; POSTSELECTION-NEXT: [[COPY:%[0-9]+]]:gpr32 = COPY $w0
7777
; POSTSELECTION-NEXT: [[COPY1:%[0-9]+]]:gpr32 = COPY $w1
78-
; POSTSELECTION-NEXT: [[COPY2:%[0-9]+]]:gpr32 = COPY $wzr
7978
; POSTSELECTION-NEXT: [[MOVi32imm:%[0-9]+]]:gpr32 = MOVi32imm 100000
79+
; POSTSELECTION-NEXT: [[COPY2:%[0-9]+]]:gpr32 = COPY $wzr
8080
; POSTSELECTION-NEXT: TBNZW [[COPY1]], 0, %bb.3
8181
; POSTSELECTION-NEXT: B %bb.2
8282
; POSTSELECTION-NEXT: {{ $}}

llvm/test/CodeGen/AArch64/GlobalISel/localizer.mir

Lines changed: 49 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -56,7 +56,9 @@
5656

5757
define void @test_inttoptr() { ret void }
5858
define void @many_local_use_intra_block() { ret void }
59-
define void @non_local_phi_use_nonunique() { ret void }
59+
define void @non_local_phi_single_use() { ret void }
60+
define void @non_local_phi_three_uses() { ret void }
61+
6062
...
6163

6264
---
@@ -285,8 +287,8 @@ body: |
285287
; CHECK: bb.1:
286288
; CHECK: successors: %bb.1(0x80000000)
287289
; CHECK: [[PHI:%[0-9]+]]:fpr(s32) = PHI [[FADD]](s32), %bb.0, %4(s32), %bb.1
288-
; CHECK: [[C1:%[0-9]+]]:fpr(s32) = G_FCONSTANT float 1.000000e+00
289290
; CHECK: [[FADD1:%[0-9]+]]:fpr(s32) = G_FADD [[PHI]], [[FADD]]
291+
; CHECK: [[C1:%[0-9]+]]:fpr(s32) = G_FCONSTANT float 1.000000e+00
290292
; CHECK: G_BR %bb.1
291293
292294
; Existing registers should be left untouched
@@ -566,12 +568,12 @@ body: |
566568
...
567569

568570
---
569-
name: non_local_phi_use_nonunique
571+
name: non_local_phi_single_use
570572
legalized: true
571573
regBankSelected: true
572574
tracksRegLiveness: true
573575
body: |
574-
; CHECK-LABEL: name: non_local_phi_use_nonunique
576+
; CHECK-LABEL: name: non_local_phi_single_use
575577
; CHECK: bb.0:
576578
; CHECK: successors: %bb.1(0x40000000), %bb.2(0x40000000)
577579
; CHECK: [[C:%[0-9]+]]:gpr(s32) = G_CONSTANT i32 1
@@ -582,12 +584,12 @@ body: |
582584
; CHECK: G_BR %bb.2
583585
; CHECK: bb.1:
584586
; CHECK: successors: %bb.2(0x80000000)
587+
; CHECK: [[C2:%[0-9]+]]:gpr(s32) = G_CONSTANT i32 1
585588
; CHECK: bb.2:
586-
; CHECK: [[PHI:%[0-9]+]]:gpr(s32) = G_PHI [[C]](s32), %bb.1
589+
; CHECK: [[PHI:%[0-9]+]]:gpr(s32) = G_PHI [[C2]](s32), %bb.1
587590
; CHECK: [[ADD1:%[0-9]+]]:gpr(s32) = G_ADD [[PHI]], [[PHI]]
588591
589-
; Don't localize the 1 into bb.1, because there are multiple edges
590-
; using that register.
592+
; Localize the 1 into bb.1, since the number of uses is under the threshold.
591593
592594
bb.0:
593595
successors: %bb.1, %bb.2
@@ -606,3 +608,43 @@ body: |
606608
%3:gpr(s32) = G_PHI %0(s32), %bb.1, %0(s32), %bb.0
607609
%2:gpr(s32) = G_ADD %3, %3
608610
...
611+
---
612+
name: non_local_phi_three_uses
613+
legalized: true
614+
regBankSelected: true
615+
tracksRegLiveness: true
616+
body: |
617+
; CHECK-LABEL: name: non_local_phi_three_uses
618+
; CHECK: bb.0:
619+
; CHECK: successors: %bb.1(0x40000000), %bb.2(0x40000000)
620+
; CHECK: [[C:%[0-9]+]]:gpr(s32) = G_CONSTANT i32 1
621+
; CHECK: [[ADD:%[0-9]+]]:gpr(s32) = G_ADD [[C]], [[C]]
622+
; CHECK: %cmp:gpr(s32) = G_ICMP intpred(eq), [[ADD]](s32), [[C]]
623+
; CHECK: %cond:gpr(s1) = G_TRUNC %cmp(s32)
624+
; CHECK: G_BRCOND %cond(s1), %bb.1
625+
; CHECK: G_BR %bb.2
626+
; CHECK: bb.1:
627+
; CHECK: successors: %bb.2(0x80000000)
628+
; CHECK: bb.2:
629+
; CHECK: [[PHI:%[0-9]+]]:gpr(s32) = G_PHI [[C]](s32), %bb.1
630+
; CHECK: [[ADD1:%[0-9]+]]:gpr(s32) = G_ADD [[PHI]], [[PHI]]
631+
632+
; Don't localize the 1 into bb.1: the number of uses in the phi is above the threshold.
633+
634+
bb.0:
635+
successors: %bb.1, %bb.2
636+
637+
%0:gpr(s32) = G_CONSTANT i32 1
638+
%1:gpr(s32) = G_ADD %0, %0
639+
%cmp:gpr(s32) = G_ICMP intpred(eq), %1(s32), %0
640+
%cond:gpr(s1) = G_TRUNC %cmp(s32)
641+
G_BRCOND %cond(s1), %bb.1
642+
G_BR %bb.2
643+
644+
bb.1:
645+
successors: %bb.2
646+
647+
bb.2:
648+
%3:gpr(s32) = G_PHI %0(s32), %bb.1, %0(s32), %bb.0, %0(s32), %bb.0, %0(s32), %bb.0
649+
%2:gpr(s32) = G_ADD %3, %3
650+
...

llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.ll

Lines changed: 12 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -230,32 +230,32 @@ define amdgpu_cs void @single_lane_execution_attribute(i32 inreg %.userdata0, <3
230230
; GFX10-NEXT: s_or_b64 s[2:3], s[4:5], s[2:3]
231231
; GFX10-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x0
232232
; GFX10-NEXT: s_mov_b32 s2, 1
233-
; GFX10-NEXT: v_mbcnt_hi_u32_b32 v2, -1, v1
234-
; GFX10-NEXT: v_lshlrev_b32_e32 v1, 2, v2
235-
; GFX10-NEXT: v_and_b32_e32 v3, 1, v2
233+
; GFX10-NEXT: v_mbcnt_hi_u32_b32 v1, -1, v1
234+
; GFX10-NEXT: v_lshlrev_b32_e32 v2, 2, v1
235+
; GFX10-NEXT: v_and_b32_e32 v3, 1, v1
236236
; GFX10-NEXT: v_xor_b32_e32 v3, 1, v3
237237
; GFX10-NEXT: v_and_b32_e32 v3, 1, v3
238238
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
239-
; GFX10-NEXT: buffer_load_dword v1, v1, s[4:7], 0 offen
239+
; GFX10-NEXT: buffer_load_dword v2, v2, s[4:7], 0 offen
240240
; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v3
241241
; GFX10-NEXT: ; implicit-def: $vgpr3
242242
; GFX10-NEXT: s_waitcnt vmcnt(0)
243-
; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 0, v1
243+
; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 0, v2
244244
; GFX10-NEXT: s_cbranch_vccnz .LBB4_4
245245
; GFX10-NEXT: ; %bb.1: ; %.preheader.preheader
246-
; GFX10-NEXT: v_mov_b32_e32 v4, s12
247246
; GFX10-NEXT: v_mov_b32_e32 v3, s12
247+
; GFX10-NEXT: v_mov_b32_e32 v4, s12
248248
; GFX10-NEXT: .LBB4_2: ; %.preheader
249249
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
250-
; GFX10-NEXT: buffer_load_dword v5, v4, s[4:7], 0 offen
251-
; GFX10-NEXT: v_add_nc_u32_e32 v2, -1, v2
252-
; GFX10-NEXT: v_add_nc_u32_e32 v4, 4, v4
253-
; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2
250+
; GFX10-NEXT: buffer_load_dword v5, v3, s[4:7], 0 offen
251+
; GFX10-NEXT: v_add_nc_u32_e32 v1, -1, v1
252+
; GFX10-NEXT: v_add_nc_u32_e32 v3, 4, v3
253+
; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v1
254254
; GFX10-NEXT: s_waitcnt vmcnt(0)
255-
; GFX10-NEXT: v_add_nc_u32_e32 v3, v5, v3
255+
; GFX10-NEXT: v_add_nc_u32_e32 v4, v5, v4
256256
; GFX10-NEXT: s_cbranch_vccnz .LBB4_2
257257
; GFX10-NEXT: ; %bb.3: ; %.preheader._crit_edge
258-
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v1
258+
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2
259259
; GFX10-NEXT: s_or_b32 s2, s0, vcc_lo
260260
; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s2
261261
; GFX10-NEXT: s_mov_b32 s2, 0

llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -48,24 +48,24 @@ define amdgpu_kernel void @set_inactive_scc(ptr addrspace(1) %out, i32 %in, <4 x
4848
; GCN-NEXT: s_buffer_load_dword s2, s[4:7], 0x0
4949
; GCN-NEXT: s_load_dword s3, s[0:1], 0x2c
5050
; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
51-
; GCN-NEXT: s_mov_b32 s4, 1
5251
; GCN-NEXT: s_waitcnt lgkmcnt(0)
5352
; GCN-NEXT: s_cmp_lg_u32 s2, 56
54-
; GCN-NEXT: s_cselect_b32 s2, 1, 0
53+
; GCN-NEXT: s_cselect_b32 s4, 1, 0
5554
; GCN-NEXT: v_mov_b32_e32 v0, s3
5655
; GCN-NEXT: s_not_b64 exec, exec
5756
; GCN-NEXT: v_mov_b32_e32 v0, 42
5857
; GCN-NEXT: s_not_b64 exec, exec
59-
; GCN-NEXT: s_cmp_lg_u32 s2, 0
58+
; GCN-NEXT: s_mov_b32 s2, 1
59+
; GCN-NEXT: s_cmp_lg_u32 s4, 0
6060
; GCN-NEXT: s_cbranch_scc0 .LBB2_2
6161
; GCN-NEXT: ; %bb.1: ; %.one
6262
; GCN-NEXT: v_add_u32_e32 v1, vcc, 1, v0
6363
; GCN-NEXT: s_mov_b32 s2, -1
6464
; GCN-NEXT: s_mov_b32 s3, 0xf000
65-
; GCN-NEXT: s_mov_b32 s4, 0
6665
; GCN-NEXT: buffer_store_dword v1, off, s[0:3], 0
66+
; GCN-NEXT: s_mov_b32 s2, 0
6767
; GCN-NEXT: .LBB2_2: ; %Flow
68-
; GCN-NEXT: s_xor_b32 s2, s4, 1
68+
; GCN-NEXT: s_xor_b32 s2, s2, 1
6969
; GCN-NEXT: s_and_b32 s2, s2, 1
7070
; GCN-NEXT: s_cmp_lg_u32 s2, 0
7171
; GCN-NEXT: s_cbranch_scc1 .LBB2_4

llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memmove.ll

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -36,11 +36,11 @@ define amdgpu_cs void @memmove_p1i8(ptr addrspace(1) %dst, ptr addrspace(1) %src
3636
; LOOP-NEXT: s_andn2_saveexec_b64 s[0:1], s[4:5]
3737
; LOOP-NEXT: s_cbranch_execz .LBB0_6
3838
; LOOP-NEXT: ; %bb.4: ; %copy_backwards
39-
; LOOP-NEXT: s_mov_b32 s0, -4
4039
; LOOP-NEXT: v_add_i32_e32 v0, vcc, 3, v0
4140
; LOOP-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
4241
; LOOP-NEXT: v_add_i32_e32 v2, vcc, 3, v2
4342
; LOOP-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
43+
; LOOP-NEXT: s_mov_b32 s0, -4
4444
; LOOP-NEXT: s_mov_b32 s6, 0
4545
; LOOP-NEXT: s_mov_b32 s7, 0xf000
4646
; LOOP-NEXT: s_mov_b64 s[4:5], 0

0 commit comments

Comments
 (0)