Skip to content

[GlobalISel][Localizer] Allow localization of a small number of repeated phi uses. #77566

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 3 commits into from
Jan 11, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 18 additions & 0 deletions llvm/include/llvm/CodeGen/GlobalISel/GenericMachineInstrs.h
Original file line number Diff line number Diff line change
Expand Up @@ -558,6 +558,24 @@ class GVecReduce : public GenericMachineInstr {
}
};

/// Represents a G_PHI.
///
/// The accessors below index operands as a leading operand 0 (presumably the
/// def -- confirm against the G_PHI definition) followed by one
/// (incoming vreg, incoming block) pair per incoming edge: the value for edge
/// I is operand 2*I + 1 and its block is operand 2*I + 2.
class GPhi : public GenericMachineInstr {
public:
  /// Returns the number of incoming values, i.e. the number of
  /// (vreg, block) operand pairs after operand 0.
  unsigned getNumIncomingValues() const { return (getNumOperands() - 1) / 2; }

  /// Returns the I'th incoming vreg.
  /// \p I is expected to be less than getNumIncomingValues().
  Register getIncomingValue(unsigned I) const {
    return getOperand(I * 2 + 1).getReg();
  }

  /// Returns the I'th incoming basic block.
  /// \p I is expected to be less than getNumIncomingValues().
  MachineBasicBlock *getIncomingBlock(unsigned I) const {
    return getOperand(I * 2 + 2).getMBB();
  }

  /// Support for isa<>/cast<>/dyn_cast<>: matches instructions whose opcode
  /// is TargetOpcode::G_PHI.
  static bool classof(const MachineInstr *MI) {
    return MI->getOpcode() == TargetOpcode::G_PHI;
  }
};

} // namespace llvm

Expand Down
7 changes: 3 additions & 4 deletions llvm/include/llvm/CodeGen/GlobalISel/Localizer.h
Original file line number Diff line number Diff line change
Expand Up @@ -67,10 +67,9 @@ class Localizer : public MachineFunctionPass {

typedef SmallSetVector<MachineInstr *, 32> LocalizedSetVecT;

/// If \p Op is a phi operand and not unique in that phi, that is,
/// there are other operands in the phi with the same register,
/// return true.
bool isNonUniquePhiValue(MachineOperand &Op) const;
/// If \p Op is a reg operand of a PHI, return the number of total
/// operands in the PHI that are the same as \p Op, including itself.
unsigned getNumPhiUses(MachineOperand &Op) const;

/// Do inter-block localization from the entry block.
bool localizeInterBlock(MachineFunction &MF,
Expand Down
55 changes: 30 additions & 25 deletions llvm/lib/CodeGen/GlobalISel/Localizer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
#include "llvm/CodeGen/GlobalISel/Utils.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/TargetLowering.h"
Expand Down Expand Up @@ -58,18 +59,18 @@ bool Localizer::isLocalUse(MachineOperand &MOUse, const MachineInstr &Def,
return InsertMBB == Def.getParent();
}

bool Localizer::isNonUniquePhiValue(MachineOperand &Op) const {
MachineInstr *MI = Op.getParent();
if (!MI->isPHI())
return false;
unsigned Localizer::getNumPhiUses(MachineOperand &Op) const {
auto *MI = dyn_cast<GPhi>(&*Op.getParent());
if (!MI)
return 0;

Register SrcReg = Op.getReg();
for (unsigned Idx = 1; Idx < MI->getNumOperands(); Idx += 2) {
auto &MO = MI->getOperand(Idx);
if (&MO != &Op && MO.isReg() && MO.getReg() == SrcReg)
return true;
unsigned NumUses = 0;
for (unsigned I = 0, NumVals = MI->getNumIncomingValues(); I < NumVals; ++I) {
if (MI->getIncomingValue(I) == SrcReg)
++NumUses;
}
return false;
return NumUses;
}

bool Localizer::localizeInterBlock(MachineFunction &MF,
Expand Down Expand Up @@ -108,11 +109,12 @@ bool Localizer::localizeInterBlock(MachineFunction &MF,
continue;
}

// If the use is a phi operand that's not unique, don't try to localize.
// If we do, we can cause unnecessary instruction bloat by duplicating
// into each predecessor block, when the existing one is sufficient and
// allows for easier optimization later.
if (isNonUniquePhiValue(MOUse))
// PHIs look like a single user but can use the same register in multiple
// edges, causing remat into each predecessor. Allow this to a certain
// extent.
unsigned NumPhiUses = getNumPhiUses(MOUse);
const unsigned PhiThreshold = 2; // FIXME: Tune this more.
if (NumPhiUses > PhiThreshold)
continue;

LLVM_DEBUG(dbgs() << "Fixing non-local use\n");
Expand Down Expand Up @@ -164,19 +166,22 @@ bool Localizer::localizeIntraBlock(LocalizedSetVecT &LocalizedInstrs) {
if (!UseMI.isPHI())
Users.insert(&UseMI);
}
// If all the users were PHIs then they're not going to be in our block,
// don't try to move this instruction.
if (Users.empty())
continue;

MachineBasicBlock::iterator II(MI);
++II;
while (II != MBB.end() && !Users.count(&*II))
// If all the users were PHIs then they're not going to be in our block, we
// may still benefit from sinking, especially since the value might be live
// across a call.
if (Users.empty()) {
// Make sure we don't sink in between two terminator sequences by scanning
// forward, not backward.
II = MBB.getFirstTerminatorForward();
LLVM_DEBUG(dbgs() << "Only phi users: moving inst to end: " << *MI);
} else {
++II;

assert(II != MBB.end() && "Didn't find the user in the MBB");
LLVM_DEBUG(dbgs() << "Intra-block: moving " << *MI << " before " << *II
<< '\n');
while (II != MBB.end() && !Users.count(&*II))
++II;
assert(II != MBB.end() && "Didn't find the user in the MBB");
LLVM_DEBUG(dbgs() << "Intra-block: moving " << *MI << " before " << *II);
}

MI->removeFromParent();
MBB.insert(II, MI);
Expand Down
16 changes: 8 additions & 8 deletions llvm/test/CodeGen/AArch64/GlobalISel/invoke-region.ll
Original file line number Diff line number Diff line change
Expand Up @@ -12,10 +12,10 @@ define i1 @test_lpad_phi_widen_into_pred() personality ptr @__gxx_personality_v0
; CHECK: bb.1 (%ir-block.0):
; CHECK-NEXT: successors: %bb.3(0x40000000), %bb.2(0x40000000)
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 11
; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @global_var
; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 42
; CHECK-NEXT: G_STORE [[C1]](s32), [[GV]](p0) :: (store (s32) into @global_var)
; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 42
; CHECK-NEXT: G_STORE [[C]](s32), [[GV]](p0) :: (store (s32) into @global_var)
; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 11
; CHECK-NEXT: [[C2:%[0-9]+]]:_(s16) = G_CONSTANT i16 1
; CHECK-NEXT: G_INVOKE_REGION_START
; CHECK-NEXT: EH_LABEL <mcsymbol >
Expand All @@ -29,7 +29,7 @@ define i1 @test_lpad_phi_widen_into_pred() personality ptr @__gxx_personality_v0
; CHECK-NEXT: successors: %bb.3(0x80000000)
; CHECK-NEXT: liveins: $x0, $x1
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[PHI:%[0-9]+]]:_(s32) = G_PHI [[C]](s32), %bb.1
; CHECK-NEXT: [[PHI:%[0-9]+]]:_(s32) = G_PHI [[C1]](s32), %bb.1
; CHECK-NEXT: EH_LABEL <mcsymbol >
; CHECK-NEXT: [[GV1:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @global_var
; CHECK-NEXT: G_STORE [[PHI]](s32), [[GV1]](p0) :: (store (s32) into @global_var)
Expand Down Expand Up @@ -67,12 +67,12 @@ define i1 @test_lpad_phi_widen_into_pred_ext(ptr %ptr) personality ptr @__gxx_pe
; CHECK-NEXT: liveins: $x0
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x0
; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 11
; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @global_var
; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 42
; CHECK-NEXT: G_STORE [[C1]](s32), [[GV]](p0) :: (store (s32) into @global_var)
; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 42
; CHECK-NEXT: G_STORE [[C]](s32), [[GV]](p0) :: (store (s32) into @global_var)
; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(s8) = G_LOAD [[COPY]](p0) :: (load (s8) from %ir.ptr)
; CHECK-NEXT: [[ASSERT_ZEXT:%[0-9]+]]:_(s8) = G_ASSERT_ZEXT [[LOAD]], 1
; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 11
; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s16) = G_ANYEXT [[ASSERT_ZEXT]](s8)
; CHECK-NEXT: G_INVOKE_REGION_START
; CHECK-NEXT: EH_LABEL <mcsymbol >
Expand All @@ -86,7 +86,7 @@ define i1 @test_lpad_phi_widen_into_pred_ext(ptr %ptr) personality ptr @__gxx_pe
; CHECK-NEXT: successors: %bb.3(0x80000000)
; CHECK-NEXT: liveins: $x0, $x1
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[PHI:%[0-9]+]]:_(s32) = G_PHI [[C]](s32), %bb.1
; CHECK-NEXT: [[PHI:%[0-9]+]]:_(s32) = G_PHI [[C1]](s32), %bb.1
; CHECK-NEXT: EH_LABEL <mcsymbol >
; CHECK-NEXT: [[GV1:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @global_var
; CHECK-NEXT: G_STORE [[PHI]](s32), [[GV1]](p0) :: (store (s32) into @global_var)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -46,16 +46,16 @@ define i32 @test(i32 %a, i1 %c) {
; PRESELECTION-NEXT: {{ $}}
; PRESELECTION-NEXT: [[COPY:%[0-9]+]]:gpr(s32) = COPY $w0
; PRESELECTION-NEXT: [[COPY1:%[0-9]+]]:gpr(s32) = COPY $w1
; PRESELECTION-NEXT: [[C:%[0-9]+]]:gpr(s32) = G_CONSTANT i32 0
; PRESELECTION-NEXT: [[C1:%[0-9]+]]:gpr(s32) = G_CONSTANT i32 100000
; PRESELECTION-NEXT: [[CONSTANT_FOLD_BARRIER:%[0-9]+]]:gpr(s32) = G_CONSTANT_FOLD_BARRIER [[C1]]
; PRESELECTION-NEXT: [[C:%[0-9]+]]:gpr(s32) = G_CONSTANT i32 100000
; PRESELECTION-NEXT: [[CONSTANT_FOLD_BARRIER:%[0-9]+]]:gpr(s32) = G_CONSTANT_FOLD_BARRIER [[C]]
; PRESELECTION-NEXT: [[C1:%[0-9]+]]:gpr(s32) = G_CONSTANT i32 0
; PRESELECTION-NEXT: [[C2:%[0-9]+]]:gpr(s32) = G_CONSTANT i32 1
; PRESELECTION-NEXT: [[AND:%[0-9]+]]:gpr(s32) = G_AND [[COPY1]], [[C2]]
; PRESELECTION-NEXT: G_BRCOND [[AND]](s32), %bb.3
; PRESELECTION-NEXT: G_BR %bb.2
; PRESELECTION-NEXT: {{ $}}
; PRESELECTION-NEXT: bb.2.common.ret:
; PRESELECTION-NEXT: [[PHI:%[0-9]+]]:gpr(s32) = G_PHI %7(s32), %bb.3, [[C]](s32), %bb.1
; PRESELECTION-NEXT: [[PHI:%[0-9]+]]:gpr(s32) = G_PHI %7(s32), %bb.3, [[C1]](s32), %bb.1
; PRESELECTION-NEXT: $w0 = COPY [[PHI]](s32)
; PRESELECTION-NEXT: RET_ReallyLR implicit $w0
; PRESELECTION-NEXT: {{ $}}
Expand All @@ -75,8 +75,8 @@ define i32 @test(i32 %a, i1 %c) {
; POSTSELECTION-NEXT: {{ $}}
; POSTSELECTION-NEXT: [[COPY:%[0-9]+]]:gpr32 = COPY $w0
; POSTSELECTION-NEXT: [[COPY1:%[0-9]+]]:gpr32 = COPY $w1
; POSTSELECTION-NEXT: [[COPY2:%[0-9]+]]:gpr32 = COPY $wzr
; POSTSELECTION-NEXT: [[MOVi32imm:%[0-9]+]]:gpr32 = MOVi32imm 100000
; POSTSELECTION-NEXT: [[COPY2:%[0-9]+]]:gpr32 = COPY $wzr
; POSTSELECTION-NEXT: TBNZW [[COPY1]], 0, %bb.3
; POSTSELECTION-NEXT: B %bb.2
; POSTSELECTION-NEXT: {{ $}}
Expand Down
56 changes: 49 additions & 7 deletions llvm/test/CodeGen/AArch64/GlobalISel/localizer.mir
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,9 @@

define void @test_inttoptr() { ret void }
define void @many_local_use_intra_block() { ret void }
define void @non_local_phi_use_nonunique() { ret void }
define void @non_local_phi_single_use() { ret void }
define void @non_local_phi_three_uses() { ret void }

...

---
Expand Down Expand Up @@ -285,8 +287,8 @@ body: |
; CHECK: bb.1:
; CHECK: successors: %bb.1(0x80000000)
; CHECK: [[PHI:%[0-9]+]]:fpr(s32) = PHI [[FADD]](s32), %bb.0, %4(s32), %bb.1
; CHECK: [[C1:%[0-9]+]]:fpr(s32) = G_FCONSTANT float 1.000000e+00
; CHECK: [[FADD1:%[0-9]+]]:fpr(s32) = G_FADD [[PHI]], [[FADD]]
; CHECK: [[C1:%[0-9]+]]:fpr(s32) = G_FCONSTANT float 1.000000e+00
; CHECK: G_BR %bb.1

; Existing registers should be left untouched
Expand Down Expand Up @@ -566,12 +568,12 @@ body: |
...

---
name: non_local_phi_use_nonunique
name: non_local_phi_single_use
legalized: true
regBankSelected: true
tracksRegLiveness: true
body: |
; CHECK-LABEL: name: non_local_phi_use_nonunique
; CHECK-LABEL: name: non_local_phi_single_use
; CHECK: bb.0:
; CHECK: successors: %bb.1(0x40000000), %bb.2(0x40000000)
; CHECK: [[C:%[0-9]+]]:gpr(s32) = G_CONSTANT i32 1
Expand All @@ -582,12 +584,12 @@ body: |
; CHECK: G_BR %bb.2
; CHECK: bb.1:
; CHECK: successors: %bb.2(0x80000000)
; CHECK: [[C2:%[0-9]+]]:gpr(s32) = G_CONSTANT i32 1
; CHECK: bb.2:
; CHECK: [[PHI:%[0-9]+]]:gpr(s32) = G_PHI [[C]](s32), %bb.1
; CHECK: [[PHI:%[0-9]+]]:gpr(s32) = G_PHI [[C2]](s32), %bb.1
; CHECK: [[ADD1:%[0-9]+]]:gpr(s32) = G_ADD [[PHI]], [[PHI]]

; Don't localize the 1 into bb.1, because there are multiple edges
; using that register.
; Localize the 1 into bb.1, since the number of uses is under the threshold.

bb.0:
successors: %bb.1, %bb.2
Expand All @@ -606,3 +608,43 @@ body: |
%3:gpr(s32) = G_PHI %0(s32), %bb.1, %0(s32), %bb.0
%2:gpr(s32) = G_ADD %3, %3
...
---
name: non_local_phi_three_uses
legalized: true
regBankSelected: true
tracksRegLiveness: true
body: |
; CHECK-LABEL: name: non_local_phi_three_uses
; CHECK: bb.0:
; CHECK: successors: %bb.1(0x40000000), %bb.2(0x40000000)
; CHECK: [[C:%[0-9]+]]:gpr(s32) = G_CONSTANT i32 1
; CHECK: [[ADD:%[0-9]+]]:gpr(s32) = G_ADD [[C]], [[C]]
; CHECK: %cmp:gpr(s32) = G_ICMP intpred(eq), [[ADD]](s32), [[C]]
; CHECK: %cond:gpr(s1) = G_TRUNC %cmp(s32)
; CHECK: G_BRCOND %cond(s1), %bb.1
; CHECK: G_BR %bb.2
; CHECK: bb.1:
; CHECK: successors: %bb.2(0x80000000)
; CHECK: bb.2:
; CHECK: [[PHI:%[0-9]+]]:gpr(s32) = G_PHI [[C]](s32), %bb.1
; CHECK: [[ADD1:%[0-9]+]]:gpr(s32) = G_ADD [[PHI]], [[PHI]]

; Don't localize the 1 into bb.1: the number of uses in the phi is above the threshold.

bb.0:
successors: %bb.1, %bb.2

%0:gpr(s32) = G_CONSTANT i32 1
%1:gpr(s32) = G_ADD %0, %0
%cmp:gpr(s32) = G_ICMP intpred(eq), %1(s32), %0
%cond:gpr(s1) = G_TRUNC %cmp(s32)
G_BRCOND %cond(s1), %bb.1
G_BR %bb.2

bb.1:
successors: %bb.2

bb.2:
%3:gpr(s32) = G_PHI %0(s32), %bb.1, %0(s32), %bb.0, %0(s32), %bb.0, %0(s32), %bb.0
%2:gpr(s32) = G_ADD %3, %3
...
Original file line number Diff line number Diff line change
Expand Up @@ -230,32 +230,32 @@ define amdgpu_cs void @single_lane_execution_attribute(i32 inreg %.userdata0, <3
; GFX10-NEXT: s_or_b64 s[2:3], s[4:5], s[2:3]
; GFX10-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x0
; GFX10-NEXT: s_mov_b32 s2, 1
; GFX10-NEXT: v_mbcnt_hi_u32_b32 v2, -1, v1
; GFX10-NEXT: v_lshlrev_b32_e32 v1, 2, v2
; GFX10-NEXT: v_and_b32_e32 v3, 1, v2
; GFX10-NEXT: v_mbcnt_hi_u32_b32 v1, -1, v1
; GFX10-NEXT: v_lshlrev_b32_e32 v2, 2, v1
; GFX10-NEXT: v_and_b32_e32 v3, 1, v1
; GFX10-NEXT: v_xor_b32_e32 v3, 1, v3
; GFX10-NEXT: v_and_b32_e32 v3, 1, v3
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: buffer_load_dword v1, v1, s[4:7], 0 offen
; GFX10-NEXT: buffer_load_dword v2, v2, s[4:7], 0 offen
; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v3
; GFX10-NEXT: ; implicit-def: $vgpr3
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 0, v1
; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 0, v2
; GFX10-NEXT: s_cbranch_vccnz .LBB4_4
; GFX10-NEXT: ; %bb.1: ; %.preheader.preheader
; GFX10-NEXT: v_mov_b32_e32 v4, s12
; GFX10-NEXT: v_mov_b32_e32 v3, s12
; GFX10-NEXT: v_mov_b32_e32 v4, s12
; GFX10-NEXT: .LBB4_2: ; %.preheader
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: buffer_load_dword v5, v4, s[4:7], 0 offen
; GFX10-NEXT: v_add_nc_u32_e32 v2, -1, v2
; GFX10-NEXT: v_add_nc_u32_e32 v4, 4, v4
; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2
; GFX10-NEXT: buffer_load_dword v5, v3, s[4:7], 0 offen
; GFX10-NEXT: v_add_nc_u32_e32 v1, -1, v1
; GFX10-NEXT: v_add_nc_u32_e32 v3, 4, v3
; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v1
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_add_nc_u32_e32 v3, v5, v3
; GFX10-NEXT: v_add_nc_u32_e32 v4, v5, v4
; GFX10-NEXT: s_cbranch_vccnz .LBB4_2
; GFX10-NEXT: ; %bb.3: ; %.preheader._crit_edge
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v1
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2
; GFX10-NEXT: s_or_b32 s2, s0, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s2
; GFX10-NEXT: s_mov_b32 s2, 0
Expand Down
10 changes: 5 additions & 5 deletions llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll
Original file line number Diff line number Diff line change
Expand Up @@ -48,24 +48,24 @@ define amdgpu_kernel void @set_inactive_scc(ptr addrspace(1) %out, i32 %in, <4 x
; GCN-NEXT: s_buffer_load_dword s2, s[4:7], 0x0
; GCN-NEXT: s_load_dword s3, s[0:1], 0x2c
; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GCN-NEXT: s_mov_b32 s4, 1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_cmp_lg_u32 s2, 56
; GCN-NEXT: s_cselect_b32 s2, 1, 0
; GCN-NEXT: s_cselect_b32 s4, 1, 0
; GCN-NEXT: v_mov_b32_e32 v0, s3
; GCN-NEXT: s_not_b64 exec, exec
; GCN-NEXT: v_mov_b32_e32 v0, 42
; GCN-NEXT: s_not_b64 exec, exec
; GCN-NEXT: s_cmp_lg_u32 s2, 0
; GCN-NEXT: s_mov_b32 s2, 1
; GCN-NEXT: s_cmp_lg_u32 s4, 0
; GCN-NEXT: s_cbranch_scc0 .LBB2_2
; GCN-NEXT: ; %bb.1: ; %.one
; GCN-NEXT: v_add_u32_e32 v1, vcc, 1, v0
; GCN-NEXT: s_mov_b32 s2, -1
; GCN-NEXT: s_mov_b32 s3, 0xf000
; GCN-NEXT: s_mov_b32 s4, 0
; GCN-NEXT: buffer_store_dword v1, off, s[0:3], 0
; GCN-NEXT: s_mov_b32 s2, 0
; GCN-NEXT: .LBB2_2: ; %Flow
; GCN-NEXT: s_xor_b32 s2, s4, 1
; GCN-NEXT: s_xor_b32 s2, s2, 1
; GCN-NEXT: s_and_b32 s2, s2, 1
; GCN-NEXT: s_cmp_lg_u32 s2, 0
; GCN-NEXT: s_cbranch_scc1 .LBB2_4
Expand Down
2 changes: 1 addition & 1 deletion llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memmove.ll
Original file line number Diff line number Diff line change
Expand Up @@ -36,11 +36,11 @@ define amdgpu_cs void @memmove_p1i8(ptr addrspace(1) %dst, ptr addrspace(1) %src
; LOOP-NEXT: s_andn2_saveexec_b64 s[0:1], s[4:5]
; LOOP-NEXT: s_cbranch_execz .LBB0_6
; LOOP-NEXT: ; %bb.4: ; %copy_backwards
; LOOP-NEXT: s_mov_b32 s0, -4
; LOOP-NEXT: v_add_i32_e32 v0, vcc, 3, v0
; LOOP-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; LOOP-NEXT: v_add_i32_e32 v2, vcc, 3, v2
; LOOP-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; LOOP-NEXT: s_mov_b32 s0, -4
; LOOP-NEXT: s_mov_b32 s6, 0
; LOOP-NEXT: s_mov_b32 s7, 0xf000
; LOOP-NEXT: s_mov_b64 s[4:5], 0
Expand Down
Loading