AMDGPU: Handle v_add* in eliminateFrameIndex #102346
Conversation
This stack of pull requests is managed by Graphite. Learn more about stacking.
@llvm/pr-subscribers-llvm-globalisel @llvm/pr-subscribers-backend-amdgpu
Author: Matt Arsenault (arsenm)
Changes
Patch is 297.50 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/102346.diff
13 Files Affected:
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
index 76da1f0eb4f7d..81337c62ffe17 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -2086,7 +2086,7 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
assert(MF->getRegInfo().isReserved(MFI->getScratchRSrcReg()) &&
"unreserved scratch RSRC register");
- MachineOperand &FIOp = MI->getOperand(FIOperandNum);
+ MachineOperand *FIOp = &MI->getOperand(FIOperandNum);
int Index = MI->getOperand(FIOperandNum).getIndex();
Register FrameReg = FrameInfo.isFixedObjectIndex(Index) && hasBasePointer(*MF)
@@ -2268,6 +2268,208 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
MI->eraseFromParent();
return true;
}
+ case AMDGPU::V_ADD_U32_e32:
+ case AMDGPU::V_ADD_U32_e64:
+ case AMDGPU::V_ADD_CO_U32_e32:
+ case AMDGPU::V_ADD_CO_U32_e64: {
+ // TODO: Handle sub, and, or.
+ unsigned NumDefs = MI->getNumExplicitDefs();
+ unsigned Src0Idx = NumDefs;
+
+ bool HasClamp = false;
+ MachineOperand *VCCOp = nullptr;
+
+ switch (MI->getOpcode()) {
+ case AMDGPU::V_ADD_U32_e32:
+ break;
+ case AMDGPU::V_ADD_U32_e64:
+ HasClamp = MI->getOperand(3).getImm();
+ break;
+ case AMDGPU::V_ADD_CO_U32_e32:
+ VCCOp = &MI->getOperand(3);
+ break;
+ case AMDGPU::V_ADD_CO_U32_e64:
+ VCCOp = &MI->getOperand(1);
+ HasClamp = MI->getOperand(4).getImm();
+ break;
+ default:
+ break;
+ }
+ bool DeadVCC = !VCCOp || VCCOp->isDead();
+ MachineOperand &DstOp = MI->getOperand(0);
+ Register DstReg = DstOp.getReg();
+
+ unsigned OtherOpIdx =
+ FIOperandNum == Src0Idx ? FIOperandNum + 1 : Src0Idx;
+ MachineOperand *OtherOp = &MI->getOperand(OtherOpIdx);
+
+ unsigned Src1Idx = Src0Idx + 1;
+ Register MaterializedReg = FrameReg;
+ Register ScavengedVGPR;
+
+ if (FrameReg && !ST.enableFlatScratch()) {
+ // We should just do an in-place update of the result register. However,
+ // the value there may also be used by the add, in which case we need a
+ // temporary register.
+ //
+ // FIXME: The scavenger is not finding the result register in the
+ // common case where the add does not read the register.
+
+ ScavengedVGPR = RS->scavengeRegisterBackwards(
+ AMDGPU::VGPR_32RegClass, MI, /*RestoreAfter=*/false, /*SPAdj=*/0);
+
+ // TODO: If we have a free SGPR, it's sometimes better to use a scalar
+ // shift.
+ BuildMI(*MBB, *MI, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64))
+ .addDef(ScavengedVGPR, RegState::Renamable)
+ .addImm(ST.getWavefrontSizeLog2())
+ .addReg(FrameReg);
+ MaterializedReg = ScavengedVGPR;
+ }
+
+ int64_t Offset = FrameInfo.getObjectOffset(Index);
+ // For the non-immediate case, we could fall through to the default
+ // handling, but we do an in-place update of the result register here to
+ // avoid scavenging another register.
+ if (OtherOp->isImm()) {
+ OtherOp->setImm(OtherOp->getImm() + Offset);
+ Offset = 0;
+ }
+
+ if ((!OtherOp->isImm() || OtherOp->getImm() != 0) && MaterializedReg) {
+ if (ST.enableFlatScratch() &&
+ !TII->isOperandLegal(*MI, Src1Idx, OtherOp)) {
+ // We didn't need the shift above, so we have an SGPR for the frame
+ // register, but may have a VGPR only operand.
+ //
+ // TODO: On gfx10+, we can easily change the opcode to the e64 version
+ // and use the higher constant bus restriction to avoid this copy.
+
+ if (!ScavengedVGPR) {
+ ScavengedVGPR = RS->scavengeRegisterBackwards(
+ AMDGPU::VGPR_32RegClass, MI, /*RestoreAfter=*/false,
+ /*SPAdj=*/0);
+ }
+
+ assert(ScavengedVGPR != DstReg);
+
+ BuildMI(*MBB, *MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), ScavengedVGPR)
+ .addReg(MaterializedReg,
+ MaterializedReg != FrameReg ? RegState::Kill : 0);
+ MaterializedReg = ScavengedVGPR;
+ }
+
+ auto AddI32 = BuildMI(*MBB, *MI, DL, TII->get(MI->getOpcode()))
+ .addDef(DstReg, RegState::Renamable);
+ if (NumDefs == 2)
+ AddI32.add(MI->getOperand(1));
+
+ unsigned MaterializedRegFlags =
+ MaterializedReg != FrameReg ? RegState::Kill : 0;
+
+ if (isVGPRClass(getPhysRegBaseClass(MaterializedReg))) {
+ // If we know we have a VGPR already, it's more likely the other
+ // operand is a legal vsrc0.
+ AddI32
+ .add(*OtherOp)
+ .addReg(MaterializedReg, MaterializedRegFlags);
+ } else {
+ // Commute operands to avoid violating VOP2 restrictions. This will
+ // typically happen when using scratch.
+ AddI32
+ .addReg(MaterializedReg, MaterializedRegFlags)
+ .add(*OtherOp);
+ }
+
+ if (MI->getOpcode() == AMDGPU::V_ADD_CO_U32_e64 ||
+ MI->getOpcode() == AMDGPU::V_ADD_U32_e64)
+ AddI32.addImm(0); // clamp
+
+ if (MI->getOpcode() == AMDGPU::V_ADD_CO_U32_e32)
+ AddI32.setOperandDead(3); // Dead vcc
+
+ MaterializedReg = DstReg;
+
+ OtherOp->ChangeToRegister(MaterializedReg, false);
+ OtherOp->setIsKill(true);
+ FIOp->ChangeToImmediate(Offset);
+ Offset = 0;
+ } else if (Offset != 0) {
+ assert(!MaterializedReg);
+ FIOp->ChangeToImmediate(Offset);
+ Offset = 0;
+ } else {
+ if (DeadVCC && !HasClamp) {
+ assert(Offset == 0);
+
+ // TODO: Losing kills and implicit operands. Just mutate to copy and
+ // let lowerCopy deal with it?
+ if (OtherOp->isReg() && OtherOp->getReg() == DstReg) {
+ // Folded to an identity copy.
+ MI->eraseFromParent();
+ return true;
+ }
+
+ // The immediate value should be in OtherOp
+ MI->setDesc(TII->get(AMDGPU::V_MOV_B32_e32));
+ MI->removeOperand(FIOperandNum);
+
+ unsigned NumOps = MI->getNumOperands();
+ for (unsigned I = NumOps - 2; I >= 2; --I)
+ MI->removeOperand(I);
+
+ if (NumDefs == 2)
+ MI->removeOperand(1);
+
+ // The code below can't deal with a mov.
+ return true;
+ }
+
+ // This folded to a constant, but we have to keep the add around for
+ // pointless implicit defs or clamp modifier.
+ FIOp->ChangeToImmediate(0);
+ }
+
+ // Try to improve legality by commuting.
+ if (!TII->isOperandLegal(*MI, Src1Idx) && TII->commuteInstruction(*MI)) {
+ std::swap(FIOp, OtherOp);
+ std::swap(FIOperandNum, OtherOpIdx);
+ }
+
+ for (unsigned SrcIdx : {Src1Idx, Src0Idx}) {
+ // Depending on operand constraints we may need to insert another copy.
+ if (!TII->isOperandLegal(*MI, SrcIdx)) {
+ // If commuting didn't make the operands legal, we need to materialize
+ // in a register.
+ // TODO: Can use SGPR on gfx10+ in some cases.
+ if (!ScavengedVGPR) {
+ ScavengedVGPR = RS->scavengeRegisterBackwards(
+ AMDGPU::VGPR_32RegClass, MI, /*RestoreAfter=*/false,
+ /*SPAdj=*/0);
+ }
+
+ assert(ScavengedVGPR != DstReg);
+
+ MachineOperand &Src = MI->getOperand(SrcIdx);
+ BuildMI(*MBB, *MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), ScavengedVGPR)
+ .add(Src);
+
+ Src.ChangeToRegister(ScavengedVGPR, false);
+ Src.setIsKill(true);
+ }
+ }
+
+ // Fold out add of 0 case that can appear in kernels.
+ if (FIOp->isImm() && FIOp->getImm() == 0 && DeadVCC && !HasClamp) {
+ if (OtherOp->isReg() && OtherOp->getReg() != DstReg) {
+ BuildMI(*MBB, *MI, DL, TII->get(AMDGPU::COPY), DstReg).add(*OtherOp);
+ }
+
+ MI->eraseFromParent();
+ }
+
+ return true;
+ }
case AMDGPU::S_ADD_I32:
case AMDGPU::S_OR_B32:
case AMDGPU::S_AND_B32: {
@@ -2336,7 +2538,7 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
} else {
if (MaterializedReg)
OtherOp.ChangeToRegister(MaterializedReg, false);
- FIOp.ChangeToImmediate(NewOffset);
+ FIOp->ChangeToImmediate(NewOffset);
}
return true;
@@ -2354,7 +2556,7 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
// The offset is always swizzled, just replace it
if (FrameReg)
- FIOp.ChangeToRegister(FrameReg, false);
+ FIOp->ChangeToRegister(FrameReg, false);
MachineOperand *OffsetOp =
TII->getNamedOperand(*MI, AMDGPU::OpName::offset);
@@ -2407,18 +2609,18 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
}
if (!FrameReg) {
- FIOp.ChangeToImmediate(Offset);
- if (TII->isImmOperandLegal(*MI, FIOperandNum, FIOp))
+ FIOp->ChangeToImmediate(Offset);
+ if (TII->isImmOperandLegal(*MI, FIOperandNum, *FIOp))
return false;
}
// We need to use register here. Check if we can use an SGPR or need
// a VGPR.
- FIOp.ChangeToRegister(AMDGPU::M0, false);
- bool UseSGPR = TII->isOperandLegal(*MI, FIOperandNum, &FIOp);
+ FIOp->ChangeToRegister(AMDGPU::M0, false);
+ bool UseSGPR = TII->isOperandLegal(*MI, FIOperandNum, FIOp);
if (!Offset && FrameReg && UseSGPR) {
- FIOp.setReg(FrameReg);
+ FIOp->setReg(FrameReg);
return false;
}
@@ -2427,8 +2629,8 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
Register TmpReg =
RS->scavengeRegisterBackwards(*RC, MI, false, 0, !UseSGPR);
- FIOp.setReg(TmpReg);
- FIOp.setIsKill();
+ FIOp->setReg(TmpReg);
+ FIOp->setIsKill();
if ((!FrameReg || !Offset) && TmpReg) {
unsigned Opc = UseSGPR ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
@@ -2457,8 +2659,8 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
if (!TmpSReg) {
// Use frame register and restore it after.
TmpSReg = FrameReg;
- FIOp.setReg(FrameReg);
- FIOp.setIsKill(false);
+ FIOp->setReg(FrameReg);
+ FIOp->setIsKill(false);
}
if (NeedSaveSCC) {
@@ -2706,7 +2908,7 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
MI->eraseFromParent();
return true;
}
- FIOp.ChangeToRegister(ResultReg, false, false, true);
+ FIOp->ChangeToRegister(ResultReg, false, false, true);
return false;
}
@@ -2737,13 +2939,13 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
// If the offset is simply too big, don't convert to a scratch wave offset
// relative index.
- FIOp.ChangeToImmediate(Offset);
- if (!TII->isImmOperandLegal(*MI, FIOperandNum, FIOp)) {
+ FIOp->ChangeToImmediate(Offset);
+ if (!TII->isImmOperandLegal(*MI, FIOperandNum, *FIOp)) {
Register TmpReg = RS->scavengeRegisterBackwards(AMDGPU::VGPR_32RegClass,
MI, false, 0);
BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpReg)
.addImm(Offset);
- FIOp.ChangeToRegister(TmpReg, false, false, true);
+ FIOp->ChangeToRegister(TmpReg, false, false, true);
}
}
}
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll
index 83f2329feb8f2..e2eac156ea787 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll
@@ -105,15 +105,13 @@ define amdgpu_kernel void @store_load_vindex_kernel() {
; GFX9-LABEL: store_load_vindex_kernel:
; GFX9: ; %bb.0: ; %bb
; GFX9-NEXT: s_add_u32 flat_scratch_lo, s6, s11
-; GFX9-NEXT: v_lshlrev_b32_e32 v1, 2, v0
-; GFX9-NEXT: v_sub_u32_e32 v0, 0, v0
; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s7, 0
-; GFX9-NEXT: v_add_u32_e32 v1, 0, v1
+; GFX9-NEXT: v_lshlrev_b32_e32 v1, 2, v0
; GFX9-NEXT: v_mov_b32_e32 v2, 15
-; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-NEXT: v_sub_u32_e32 v0, 0, v0
; GFX9-NEXT: scratch_store_dword v1, v2, off
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_add_u32_e32 v0, 0, v0
+; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: scratch_load_dword v0, v0, off offset:124 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_endpgm
@@ -128,8 +126,6 @@ define amdgpu_kernel void @store_load_vindex_kernel() {
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: v_mov_b32_e32 v2, 15
; GFX10-NEXT: v_lshlrev_b32_e32 v1, 2, v1
-; GFX10-NEXT: v_add_nc_u32_e32 v0, 0, v0
-; GFX10-NEXT: v_add_nc_u32_e32 v1, 0, v1
; GFX10-NEXT: scratch_store_dword v0, v2, off
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: scratch_load_dword v0, v1, off offset:124 glc dlc
@@ -140,12 +136,11 @@ define amdgpu_kernel void @store_load_vindex_kernel() {
; GFX940: ; %bb.0: ; %bb
; GFX940-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX940-NEXT: v_lshlrev_b32_e32 v1, 2, v0
-; GFX940-NEXT: v_sub_u32_e32 v0, 0, v0
; GFX940-NEXT: v_mov_b32_e32 v2, 15
-; GFX940-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX940-NEXT: v_sub_u32_e32 v0, 0, v0
; GFX940-NEXT: scratch_store_dword v1, v2, off sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: v_add_u32_e32 v0, 0, v0
+; GFX940-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX940-NEXT: scratch_load_dword v0, v0, off offset:124 sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_endpgm
@@ -160,7 +155,6 @@ define amdgpu_kernel void @store_load_vindex_kernel() {
; GFX11-NEXT: v_lshlrev_b32_e32 v1, 2, v1
; GFX11-NEXT: scratch_store_b32 v0, v2, off dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: v_add_nc_u32_e32 v1, 0, v1
; GFX11-NEXT: scratch_load_b32 v0, v1, off offset:124 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_endpgm
@@ -539,15 +533,15 @@ define void @store_load_vindex_small_offset_foo(i32 %idx) {
; GFX9-NEXT: scratch_load_dword v1, off, s32 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 2, v0
-; GFX9-NEXT: s_add_i32 s0, s32, 0x100
; GFX9-NEXT: v_and_b32_e32 v0, 15, v0
-; GFX9-NEXT: v_add_u32_e32 v1, s0, v1
-; GFX9-NEXT: v_mov_b32_e32 v2, 15
+; GFX9-NEXT: v_add_u32_e32 v1, s32, v1
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX9-NEXT: s_add_i32 s0, s32, 0x100
+; GFX9-NEXT: v_add_u32_e32 v1, 0x100, v1
+; GFX9-NEXT: v_mov_b32_e32 v2, 15
+; GFX9-NEXT: v_add_u32_e32 v0, s32, v0
; GFX9-NEXT: scratch_store_dword v1, v2, off
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_add_u32_e32 v0, s0, v0
+; GFX9-NEXT: v_add_u32_e32 v0, 0x100, v0
; GFX9-NEXT: scratch_load_dword v0, v0, off glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -557,14 +551,14 @@ define void @store_load_vindex_small_offset_foo(i32 %idx) {
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_and_b32_e32 v1, 15, v0
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-NEXT: s_add_i32 s0, s32, 0x100
; GFX10-NEXT: v_mov_b32_e32 v2, 15
; GFX10-NEXT: scratch_load_dword v3, off, s32 glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_lshlrev_b32_e32 v1, 2, v1
-; GFX10-NEXT: v_add_nc_u32_e32 v0, s0, v0
-; GFX10-NEXT: s_add_i32 s0, s32, 0x100
-; GFX10-NEXT: v_add_nc_u32_e32 v1, s0, v1
+; GFX10-NEXT: v_add_nc_u32_e32 v0, s32, v0
+; GFX10-NEXT: v_add_nc_u32_e32 v1, s32, v1
+; GFX10-NEXT: v_add_nc_u32_e32 v0, 0x100, v0
+; GFX10-NEXT: v_add_nc_u32_e32 v1, 0x100, v1
; GFX10-NEXT: scratch_store_dword v0, v2, off
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: scratch_load_dword v0, v1, off glc dlc
@@ -577,8 +571,8 @@ define void @store_load_vindex_small_offset_foo(i32 %idx) {
; GFX940-NEXT: scratch_load_dword v1, off, s32 sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: v_lshlrev_b32_e32 v1, 2, v0
-; GFX940-NEXT: s_add_i32 s0, s32, 0x100
-; GFX940-NEXT: v_add_u32_e32 v1, s0, v1
+; GFX940-NEXT: v_add_u32_e32 v1, s32, v1
+; GFX940-NEXT: v_add_u32_e32 v1, 0x100, v1
; GFX940-NEXT: v_mov_b32_e32 v2, 15
; GFX940-NEXT: v_and_b32_e32 v0, 15, v0
; GFX940-NEXT: scratch_store_dword v1, v2, off sc0 sc1
@@ -593,11 +587,12 @@ define void @store_load_vindex_small_offset_foo(i32 %idx) {
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_dual_mov_b32 v2, 15 :: v_dual_lshlrev_b32 v1, 2, v0
; GFX11-NEXT: v_and_b32_e32 v0, 15, v0
-; GFX11-NEXT: s_add_i32 s0, s32, 0x100
; GFX11-NEXT: scratch_load_b32 v3, off, s32 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_add_nc_u32_e32 v1, s0, v1
+; GFX11-NEXT: v_add_nc_u32_e32 v1, s32, v1
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x100, v1
; GFX11-NEXT: scratch_store_b32 v1, v2, off dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: scratch_load_b32 v0, v0, s32 offset:256 glc dlc
@@ -855,15 +850,15 @@ define void @store_load_vindex_large_offset_foo(i32 %idx) {
; GFX9-NEXT: scratch_load_dword v1, off, s32 offset:4 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 2, v0
-; GFX9-NEXT: s_add_i32 s0, s32, 0x4004
; GFX9-NEXT: v_and_b32_e32 v0, 15, v0
-; GFX9-NEXT: v_add_u32_e32 v1, s0, v1
-; GFX9-NEXT: v_mov_b32_e32 v2, 15
+; GFX9-NEXT: v_add_u32_e32 v1, s32, v1
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX9-NEXT: s_add_i32 s0, s32, 0x4004
+; GFX9-NEXT: v_add_u32_e32 v1, 0x4004, v1
+; GFX9-NEXT: v_mov_b32_e32 v2, 15
+; GFX9-NEXT: v_add_u32_e32 v0, s32, v0
; GFX9-NEXT: scratch_store_dword v1, v2, off
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_add_u32_e32 v0, s0, v0
+; GFX9-NEXT: v_add_u32_e32 v0, 0x4004, v0
; GFX9-NEXT: scratch_load_dword v0, v0, off glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -873,14 +868,14 @@ define void @store_load_vindex_large_offset_foo(i32 %idx) {
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_and_b32_e32 v1, 15, v0
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-NEXT: s_add_i32 s0, s32, 0x4004
; GFX10-NEXT: v_mov_b32_e32 v2, 15
; GFX10-NEXT: scratch_load_dword v3, off, s32 offset:4 glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_lshlrev_b32_e32 v1, 2, v1
-; GFX10-NEXT: v_add_nc_u32_e32 v0, s0, v0
-; GFX10-NEXT: s_add_i32 s0, s32, 0x4004
-; GFX10-NEXT: v_add_nc_u32_e32 v1, s0, v1
+; GFX10-NEXT: v_add_nc_u32_e32 v0, s32, v0
+; GFX10-NEXT: v_add_nc_u32_e32 v1, s32, v1
+; GFX10-NEXT: v_add_nc_u32_e32 v0, 0x4004, v0
+; GFX10-NEXT: v_add_nc_u32_e32 v1, 0x4004, v1
; GFX10-NEXT: scratch_store_dword v0, v2, off
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: scratch_load_dword v0, v1, off glc dlc
@@ -893,8 +888,8 @@ define void @store_load_vindex_large_offset_foo(i32 %idx) {
; GFX940-NEXT: scratch_load_dword v1, off, s32 offset:4 sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: v_lshlrev_b32_e32 v1, 2, v0
-; GFX940-NEXT: s_add_i32 s0, s32, 0x4004
-; GFX940-NEXT: v_add_u32_e32 v1, s0, v1
+; GFX940-NEXT: v_add_u32_e32 v1, s32, v1
+; GFX940-NEXT: v_add_u32_e32 v1, 0x4004, v1
; GFX940-NEXT: v_mov_b32_e32 v2, 15
; GFX940-NEXT: v_and_b32_e32 v0, 15, v0
; GFX940-NEXT: scratch_store_dword v1, v2, off sc0 sc1
@@ -913,9 +908,10 @@ define void @store_load_vindex_large_offset_foo(i32 %idx) {
; GFX11-NEXT: s_add_i32 s0, s32, 0x4004
; GFX11-NEXT: scratch_load_b32 v3, off, s32 offset:4 glc dlc
; GFX11-NEXT: ...
[truncated]
You can test this locally with the following command:
git-clang-format --diff 09ba83be0ac178851e3c9c9c8fefddbdd4d8353f bdff9b143087ba60098816501f219654589a3bf9 --extensions cpp -- llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
View the diff from clang-format here.
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
index 9e1c4941db..c21dc88ea7 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -2557,15 +2557,11 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
if (isVGPRClass(getPhysRegBaseClass(MaterializedReg))) {
// If we know we have a VGPR already, it's more likely the other
// operand is a legal vsrc0.
- AddI32
- .add(*OtherOp)
- .addReg(MaterializedReg, MaterializedRegFlags);
+ AddI32.add(*OtherOp).addReg(MaterializedReg, MaterializedRegFlags);
} else {
// Commute operands to avoid violating VOP2 restrictions. This will
// typically happen when using scratch.
- AddI32
- .addReg(MaterializedReg, MaterializedRegFlags)
- .add(*OtherOp);
+ AddI32.addReg(MaterializedReg, MaterializedRegFlags).add(*OtherOp);
}
if (MI->getOpcode() == AMDGPU::V_ADD_CO_U32_e64 ||
PR: #102346 v_add_u32_e64 test cases for wave32
ping
case AMDGPU::V_ADD_U32_e32:
  break;
case AMDGPU::V_ADD_U32_e64:
  HasClamp = MI->getOperand(3).getImm();
Why not to use named operands?
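For reference, a named-operand lookup in the same style the patch already uses for the offset operand further down (TII->getNamedOperand(*MI, AMDGPU::OpName::offset)) might look roughly like the sketch below. This is only an illustration, not part of the patch, and it assumes the e64 forms of these V_ADD opcodes expose clamp and src0 as named operands:

// Sketch only: resolve operands by name instead of hard-coded indices.
// Assumes a named clamp operand on the e64 forms and a named src0 operand.
if (const MachineOperand *Clamp =
        TII->getNamedOperand(*MI, AMDGPU::OpName::clamp))
  HasClamp = Clamp->getImm();

int Src0Idx =
    AMDGPU::getNamedOperandIdx(MI->getOpcode(), AMDGPU::OpName::src0);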
LGTM