-
Notifications
You must be signed in to change notification settings - Fork 14.3k
[AMDGPU] Extend readlane, writelane and readfirstlane intrinsic lowering for generic types #89217
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
@llvm/pr-subscribers-clang @llvm/pr-subscribers-llvm-globalisel Author: Vikram Hegde (vikramRH) Changes: This patch is intended to be the first of a series with the end goal of adapting the atomic optimizer pass to support i64 and f64 operations (along with removing all unnecessary bitcasts). This patch introduces pseudos for 64-bit readlane, writelane and readfirstlane ops, which are lowered post-ISel. Requesting comments on the approach here; still TODO in this patch:
Patch is 107.80 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/89217.diff 15 Files Affected:
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index ee9a5d7a343980..34feee1c56be82 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -2177,14 +2177,14 @@ def int_amdgcn_wave_reduce_umax : AMDGPUWaveReduce;
def int_amdgcn_readfirstlane :
ClangBuiltin<"__builtin_amdgcn_readfirstlane">,
- Intrinsic<[llvm_i32_ty], [llvm_i32_ty],
+ Intrinsic<[llvm_any_ty], [LLVMMatchType<0>],
[IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>;
// The lane argument must be uniform across the currently active threads of the
// current wave. Otherwise, the result is undefined.
def int_amdgcn_readlane :
ClangBuiltin<"__builtin_amdgcn_readlane">,
- Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty],
+ Intrinsic<[llvm_any_ty], [LLVMMatchType<0>, llvm_i32_ty],
[IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>;
// The value to write and lane select arguments must be uniform across the
@@ -2192,10 +2192,10 @@ def int_amdgcn_readlane :
// undefined.
def int_amdgcn_writelane :
ClangBuiltin<"__builtin_amdgcn_writelane">,
- Intrinsic<[llvm_i32_ty], [
- llvm_i32_ty, // uniform value to write: returned by the selected lane
- llvm_i32_ty, // uniform lane select
- llvm_i32_ty // returned by all lanes other than the selected one
+ Intrinsic<[llvm_any_ty], [
+ LLVMMatchType<0>, // uniform value to write: returned by the selected lane
+ llvm_i32_ty, // uniform lane select
+ LLVMMatchType<0> // returned by all lanes other than the selected one
],
[IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]
>;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
index dbb3de76b4ddae..0ec77d66e596da 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
@@ -433,7 +433,7 @@ Value *AMDGPUAtomicOptimizerImpl::buildReduction(IRBuilder<> &B,
// Pick an arbitrary lane from 0..31 and an arbitrary lane from 32..63 and
// combine them with a scalar operation.
Function *ReadLane =
- Intrinsic::getDeclaration(M, Intrinsic::amdgcn_readlane, {});
+ Intrinsic::getDeclaration(M, Intrinsic::amdgcn_readlane, B.getInt32Ty());
V = B.CreateBitCast(V, IntNTy);
Value *Lane0 = B.CreateCall(ReadLane, {V, B.getInt32(0)});
Value *Lane32 = B.CreateCall(ReadLane, {V, B.getInt32(32)});
@@ -493,8 +493,8 @@ Value *AMDGPUAtomicOptimizerImpl::buildScan(IRBuilder<> &B,
if (!ST->isWave32()) {
// Combine lane 31 into lanes 32..63.
V = B.CreateBitCast(V, IntNTy);
- Value *const Lane31 = B.CreateIntrinsic(Intrinsic::amdgcn_readlane, {},
- {V, B.getInt32(31)});
+ Value *const Lane31 = B.CreateIntrinsic(
+ Intrinsic::amdgcn_readlane, B.getInt32Ty(), {V, B.getInt32(31)});
Value *UpdateDPPCall = B.CreateCall(
UpdateDPP, {Identity, Lane31, B.getInt32(DPP::QUAD_PERM_ID),
@@ -523,10 +523,10 @@ Value *AMDGPUAtomicOptimizerImpl::buildShiftRight(IRBuilder<> &B, Value *V,
{Identity, V, B.getInt32(DPP::WAVE_SHR1), B.getInt32(0xf),
B.getInt32(0xf), B.getFalse()});
} else {
- Function *ReadLane =
- Intrinsic::getDeclaration(M, Intrinsic::amdgcn_readlane, {});
- Function *WriteLane =
- Intrinsic::getDeclaration(M, Intrinsic::amdgcn_writelane, {});
+ Function *ReadLane = Intrinsic::getDeclaration(
+ M, Intrinsic::amdgcn_readlane, B.getInt32Ty());
+ Function *WriteLane = Intrinsic::getDeclaration(
+ M, Intrinsic::amdgcn_writelane, B.getInt32Ty());
// On GFX10 all DPP operations are confined to a single row. To get cross-
// row operations we have to use permlane or readlane.
@@ -598,8 +598,8 @@ std::pair<Value *, Value *> AMDGPUAtomicOptimizerImpl::buildScanIteratively(
// Get the value required for atomic operation
V = B.CreateBitCast(V, IntNTy);
- Value *LaneValue =
- B.CreateIntrinsic(Intrinsic::amdgcn_readlane, {}, {V, LaneIdxInt});
+ Value *LaneValue = B.CreateIntrinsic(Intrinsic::amdgcn_readlane,
+ B.getInt32Ty(), {V, LaneIdxInt});
LaneValue = B.CreateBitCast(LaneValue, Ty);
// Perform writelane if intermediate scan results are required later in the
@@ -607,7 +607,7 @@ std::pair<Value *, Value *> AMDGPUAtomicOptimizerImpl::buildScanIteratively(
Value *OldValue = nullptr;
if (NeedResult) {
OldValue =
- B.CreateIntrinsic(Intrinsic::amdgcn_writelane, {},
+ B.CreateIntrinsic(Intrinsic::amdgcn_writelane, B.getInt32Ty(),
{B.CreateBitCast(Accumulator, IntNTy), LaneIdxInt,
B.CreateBitCast(OldValuePhi, IntNTy)});
OldValue = B.CreateBitCast(OldValue, Ty);
@@ -789,7 +789,7 @@ void AMDGPUAtomicOptimizerImpl::optimizeAtomic(Instruction &I,
Value *const LastLaneIdx = B.getInt32(ST->getWavefrontSize() - 1);
assert(TyBitWidth == 32);
NewV = B.CreateBitCast(NewV, IntNTy);
- NewV = B.CreateIntrinsic(Intrinsic::amdgcn_readlane, {},
+ NewV = B.CreateIntrinsic(Intrinsic::amdgcn_readlane, B.getInt32Ty(),
{NewV, LastLaneIdx});
NewV = B.CreateBitCast(NewV, Ty);
}
@@ -925,10 +925,10 @@ void AMDGPUAtomicOptimizerImpl::optimizeAtomic(Instruction &I,
Value *const ExtractLo = B.CreateTrunc(CastedPhi, Int32Ty);
Value *const ExtractHi =
B.CreateTrunc(B.CreateLShr(CastedPhi, 32), Int32Ty);
- CallInst *const ReadFirstLaneLo =
- B.CreateIntrinsic(Intrinsic::amdgcn_readfirstlane, {}, ExtractLo);
- CallInst *const ReadFirstLaneHi =
- B.CreateIntrinsic(Intrinsic::amdgcn_readfirstlane, {}, ExtractHi);
+ CallInst *const ReadFirstLaneLo = B.CreateIntrinsic(
+ Intrinsic::amdgcn_readfirstlane, Int32Ty, ExtractLo);
+ CallInst *const ReadFirstLaneHi = B.CreateIntrinsic(
+ Intrinsic::amdgcn_readfirstlane, Int32Ty, ExtractHi);
Value *const PartialInsert = B.CreateInsertElement(
PoisonValue::get(VecTy), ReadFirstLaneLo, B.getInt32(0));
Value *const Insert =
@@ -936,8 +936,8 @@ void AMDGPUAtomicOptimizerImpl::optimizeAtomic(Instruction &I,
BroadcastI = B.CreateBitCast(Insert, Ty);
} else if (TyBitWidth == 32) {
Value *CastedPhi = B.CreateBitCast(PHI, IntNTy);
- BroadcastI =
- B.CreateIntrinsic(Intrinsic::amdgcn_readfirstlane, {}, CastedPhi);
+ BroadcastI = B.CreateIntrinsic(Intrinsic::amdgcn_readfirstlane, Int32Ty,
+ CastedPhi);
BroadcastI = B.CreateBitCast(BroadcastI, Ty);
} else {
diff --git a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
index 8b21c22b449710..d722b6fb56bccb 100644
--- a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
@@ -691,7 +691,8 @@ bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) {
break;
}
- case AMDGPU::V_WRITELANE_B32: {
+ case AMDGPU::V_WRITELANE_B32:
+ case AMDGPU::V_WRITELANE_PSEUDO_B64: {
// Some architectures allow more than one constant bus access without
// SGPR restriction
if (ST.getConstantBusLimit(MI.getOpcode()) != 1)
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 0c706d51cb665f..79a9f451589b16 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -4822,6 +4822,109 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
return RetBB;
}
+static MachineBasicBlock *lowerPseudoLaneOp(MachineInstr &MI, // Expands a 64-bit lane-op pseudo into two 32-bit lane ops plus a REG_SEQUENCE.
+ MachineBasicBlock *BB,
+ const GCNSubtarget &ST,
+ unsigned Opc) { // Opc must be one of the three V_*LANE_PSEUDO_B64 opcodes handled by the switch below.
+ MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
+ const SIRegisterInfo *TRI = ST.getRegisterInfo();
+ const DebugLoc &DL = MI.getDebugLoc();
+ const SIInstrInfo *TII = ST.getInstrInfo();
+
+ MachineOperand &Dest = MI.getOperand(0); // Operand 0: the 64-bit result.
+ MachineOperand &Src0 = MI.getOperand(1); // Operand 1: the first 64-bit source.
+
+ const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0.getReg());
+ const TargetRegisterClass *Src0SubRC =
+ TRI->getSubRegisterClass(Src0RC, AMDGPU::sub0);
+
+ Register DestSub0 = MRI.createVirtualRegister( // Low half of the result: VGPR for writelane, SGPR for the read variants.
+ (Opc == AMDGPU::V_WRITELANE_PSEUDO_B64) ? &AMDGPU::VGPR_32RegClass
+ : &AMDGPU::SGPR_32RegClass);
+ Register DestSub1 = MRI.createVirtualRegister( // High half of the result, same register-class choice.
+ (Opc == AMDGPU::V_WRITELANE_PSEUDO_B64) ? &AMDGPU::VGPR_32RegClass
+ : &AMDGPU::SGPR_32RegClass);
+
+ MachineOperand SrcReg0Sub0 = TII->buildExtractSubRegOrImm( // sub0 half of Src0.
+ MI, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
+
+ MachineOperand SrcReg0Sub1 = TII->buildExtractSubRegOrImm( // sub1 half of Src0.
+ MI, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
+
+ MachineInstr *LoHalf, *HighHalf;
+ switch (Opc) {
+ case AMDGPU::V_READLANE_PSEUDO_B64: {
+ MachineOperand &Src1 = MI.getOperand(2); // Operand 2 is passed unchanged to both 32-bit halves.
+ auto IsKill = (Src1.isReg() && Src1.isKill()); // Remember the kill flag so that only the last use of Src1 carries it.
+ if (IsKill)
+ Src1.setIsKill(false); // Src1 is used again by the high half below.
+ LoHalf = BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_READLANE_B32), DestSub0)
+ .add(SrcReg0Sub0)
+ .add(Src1);
+
+ if (IsKill)
+ Src1.setIsKill(true); // Restore the flag for the final use.
+ HighHalf = BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_READLANE_B32), DestSub1)
+ .add(SrcReg0Sub1)
+ .add(Src1);
+ break;
+ }
+ case AMDGPU::V_READFIRSTLANE_PSEUDO_B64: {
+ LoHalf =
+ BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), DestSub0)
+ .add(SrcReg0Sub0);
+ HighHalf =
+ BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), DestSub1)
+ .add(SrcReg0Sub1);
+ break;
+ }
+ case AMDGPU::V_WRITELANE_PSEUDO_B64: {
+ MachineOperand &Src1 = MI.getOperand(2); // Operand 2 is passed unchanged to both 32-bit halves.
+ MachineOperand &Src2 = MI.getOperand(3); // Operand 3 is a 64-bit value that is split like Src0.
+ auto IsKill = (Src1.isReg() && Src1.isKill()); // Same kill-flag handling as in the readlane case.
+
+ const TargetRegisterClass *Src2RC = MRI.getRegClass(Src2.getReg());
+ const TargetRegisterClass *Src2SubRC =
+ TRI->getSubRegisterClass(Src2RC, AMDGPU::sub0);
+
+ MachineOperand SrcReg2Sub0 = TII->buildExtractSubRegOrImm( // sub0 half of Src2.
+ MI, MRI, Src2, Src2RC, AMDGPU::sub0, Src2SubRC);
+
+ MachineOperand SrcReg2Sub1 = TII->buildExtractSubRegOrImm( // sub1 half of Src2.
+ MI, MRI, Src2, Src2RC, AMDGPU::sub1, Src2SubRC);
+
+ if (IsKill)
+ Src1.setIsKill(false); // Src1 is used again by the high half below.
+ LoHalf = BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_WRITELANE_B32), DestSub0)
+ .add(SrcReg0Sub0)
+ .add(Src1)
+ .add(SrcReg2Sub0);
+
+ if (IsKill)
+ Src1.setIsKill(true); // Restore the flag for the final use.
+ HighHalf = BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_WRITELANE_B32), DestSub1)
+ .add(SrcReg0Sub1)
+ .add(Src1)
+ .add(SrcReg2Sub1);
+ break;
+ }
+ default:
+ llvm_unreachable("should not occur");
+ }
+
+ BuildMI(*BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), Dest.getReg()) // Recombine the two 32-bit halves into the 64-bit destination.
+ .addReg(DestSub0)
+ .addImm(AMDGPU::sub0)
+ .addReg(DestSub1)
+ .addImm(AMDGPU::sub1);
+
+ TII->legalizeOperands(*LoHalf); // Both new instructions are legalized after being built.
+ TII->legalizeOperands(*HighHalf);
+
+ MI.eraseFromParent(); // The pseudo has been fully replaced by the instructions above.
+ return BB;
+}
+
MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter(
MachineInstr &MI, MachineBasicBlock *BB) const {
@@ -5065,6 +5168,10 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter(
MI.eraseFromParent();
return BB;
}
+ case AMDGPU::V_READLANE_PSEUDO_B64:
+ case AMDGPU::V_READFIRSTLANE_PSEUDO_B64:
+ case AMDGPU::V_WRITELANE_PSEUDO_B64:
+ return lowerPseudoLaneOp(MI, BB, *getSubtarget(), MI.getOpcode());
case AMDGPU::SI_INIT_M0: {
BuildMI(*BB, MI.getIterator(), MI.getDebugLoc(),
TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index d6d49889656bbc..e8ece71fe07b7b 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -288,6 +288,41 @@ def V_SUB_U64_PSEUDO : VPseudoInstSI <
>;
} // End usesCustomInserter = 1, Defs = [VCC]
+
+let usesCustomInserter = 1 in { // These pseudos are expanded by lowerPseudoLaneOp via EmitInstrWithCustomInserter.
+ def V_READLANE_PSEUDO_B64 : VPseudoInstSI < // 64-bit readlane: 64-bit VGPR source, 32-bit scalar source, 64-bit SGPR result.
+ (outs SReg_64:$sdst), (ins VReg_64:$src0, SSrc_b32:$src1)>;
+
+ def V_READFIRSTLANE_PSEUDO_B64 : VPseudoInstSI < // 64-bit readfirstlane: 64-bit VGPR source, 64-bit SGPR result.
+ (outs SReg_64:$sdst), (ins VReg_64:$src0)>;
+
+ def V_WRITELANE_PSEUDO_B64 : VPseudoInstSI < // 64-bit writelane: 64-bit VGPR result.
+ (outs VReg_64:$sdst), (ins SReg_64:$src0, SSrc_b32:$src1, VReg_64:$src2)> { // NOTE(review): the result is a VReg_64 but is named $sdst like the SGPR-result pseudos; confirm the name is intended.
+ let UseNamedOperandTable = 1;
+ }
+} // End usesCustomInserter = 1
+
+class ReadLanePseudoPat <ValueType vt> : GCNPat <
+ (vt (int_amdgcn_readlane vt:$src0, i32:$src1)),
+ (V_READLANE_PSEUDO_B64 VReg_64:$src0, SSrc_b32:$src1)>;
+
+def : ReadLanePseudoPat<i64>;
+def : ReadLanePseudoPat<f64>;
+
+class WriteLanePseudoPat <ValueType vt> : GCNPat <
+ (vt (int_amdgcn_writelane vt:$src0, i32:$src1, vt:$src2)),
+ (V_WRITELANE_PSEUDO_B64 SReg_64:$src0, SSrc_b32:$src1, VReg_64:$src2)>;
+
+def : WriteLanePseudoPat<i64>;
+def : WriteLanePseudoPat<f64>;
+
+class ReadFirstLanePseudoPat <ValueType vt> : GCNPat <
+ (vt (int_amdgcn_readfirstlane vt:$src0)),
+ (V_READFIRSTLANE_PSEUDO_B64 VReg_64:$src0)>;
+
+def : ReadFirstLanePseudoPat<i64>;
+def : ReadFirstLanePseudoPat<f64>;
+
let usesCustomInserter = 1, Defs = [SCC] in {
def S_ADD_U64_PSEUDO : SPseudoInstSI <
(outs SReg_64:$sdst), (ins SSrc_b64:$src0, SSrc_b64:$src1),
@@ -3405,7 +3440,7 @@ def : GCNPat<
// FIXME: Should also do this for readlane, but tablegen crashes on
// the ignored src1.
def : GCNPat<
- (int_amdgcn_readfirstlane (i32 imm:$src)),
+ (i32 (int_amdgcn_readfirstlane (i32 imm:$src))),
(S_MOV_B32 SReg_32:$src)
>;
diff --git a/llvm/lib/Target/AMDGPU/VOP1Instructions.td b/llvm/lib/Target/AMDGPU/VOP1Instructions.td
index 2341e0d9d32bb4..0ee80f45c91603 100644
--- a/llvm/lib/Target/AMDGPU/VOP1Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP1Instructions.td
@@ -112,7 +112,7 @@ class getVOP1Pat <SDPatternOperator node, VOPProfile P> : LetDummies {
!if(P.HasOMod,
[(set P.DstVT:$vdst, (node (P.Src0VT (VOP3OMods P.Src0VT:$src0,
i1:$clamp, i32:$omod))))],
- [(set P.DstVT:$vdst, (node P.Src0RC32:$src0))]
+ [(set P.DstVT:$vdst, (node (P.Src0VT P.Src0RC32:$src0)))]
)
);
}
diff --git a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll
index 26c85e83b53adc..74d2f53d7b317c 100644
--- a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll
+++ b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll
@@ -56,9 +56,9 @@ define amdgpu_kernel void @mov_dpp8(ptr addrspace(1) %out, i32 %in) #0 {
ret void
}
-; CHECK: DIVERGENT: %tmp0 = call i32 @llvm.amdgcn.writelane(i32 0, i32 1, i32 2)
+; CHECK: DIVERGENT: %tmp0 = call i32 @llvm.amdgcn.writelane.i32(i32 0, i32 1, i32 2)
define amdgpu_kernel void @writelane(ptr addrspace(1) %out) #0 {
- %tmp0 = call i32 @llvm.amdgcn.writelane(i32 0, i32 1, i32 2)
+ %tmp0 = call i32 @llvm.amdgcn.writelane.i32(i32 0, i32 1, i32 2)
store i32 %tmp0, ptr addrspace(1) %out
ret void
}
@@ -237,7 +237,7 @@ declare i32 @llvm.amdgcn.permlanex16.var(i32, i32, i32, i1, i1) #1
declare i32 @llvm.amdgcn.mov.dpp.i32(i32, i32, i32, i32, i1) #1
declare i32 @llvm.amdgcn.mov.dpp8.i32(i32, i32) #1
declare i32 @llvm.amdgcn.update.dpp.i32(i32, i32, i32, i32, i32, i1) #1
-declare i32 @llvm.amdgcn.writelane(i32, i32, i32) #1
+declare i32 @llvm.amdgcn.writelane.i32(i32, i32, i32) #1
declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v16f16(<16 x half>, <16 x half> , <8 x float>) #1
declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32.v16i16(<16 x i16>, <16 x i16> , <8 x float>) #1
declare <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v16f16.v16f16(<16 x half>, <16 x half> , <16 x half>, i1 immarg) #1
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomic_optimizations_mul_one.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomic_optimizations_mul_one.ll
index 220dc70165e87c..bdfafa89cd0477 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomic_optimizations_mul_one.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomic_optimizations_mul_one.ll
@@ -1,5 +1,4 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: opt -S -mtriple=amdgcn-- -passes=amdgpu-atomic-optimizer %s | FileCheck -check-prefix=IR %s
; RUN: llc -global-isel -mtriple=amdgcn-- -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
@@ -74,7 +73,7 @@ define amdgpu_cs void @atomic_add_and_format(<4 x i32> inreg %arg) {
; IR-NEXT: br label [[TMP11]]
; IR: 11:
; IR-NEXT: [[TMP12:%.*]] = phi i32 [ poison, [[DOTENTRY:%.*]] ], [ [[TMP10]], [[TMP9]] ]
-; IR-NEXT: [[TMP13:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP12]])
+; IR-NEXT: [[TMP13:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP12]])
; IR-NEXT: [[TMP14:%.*]] = add i32 [[TMP13]], [[TMP5]]
; IR-NEXT: call void @llvm.amdgcn.struct.buffer.store.format.v4i32(<4 x i32> [[ARG]], <4 x i32> [[ARG]], i32 [[TMP14]], i32 0, i32 0, i32 0)
; IR-NEXT: ret void
@@ -172,7 +171,7 @@ define amdgpu_cs void @atomic_sub_and_format(<4 x i32> inreg %arg) {
; IR-NEXT: br label [[TMP11]]
; IR: 11:
; IR-NEXT: [[TMP12:%.*]] = phi i32 [ poison, [[DOTENTRY:%.*]] ], [ [[TMP10]], [[TMP9]] ]
-; IR-NEXT: [[TMP13:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP12]])
+; IR-NEXT: [[TMP13:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP12]])
; IR-NEXT: [[TMP14:%.*]] = sub i32 [[TMP13]], [[TMP5]]
; IR-NEXT: call void @llvm.amdgcn.struct.buffer.store.format.v4i32(<4 x i32> [[ARG]], <4 x i32> [[ARG]], i32 [[TMP14]], i32 0, i32 0, i32 0)
; IR-NEXT: ret void
@@ -273,7 +272,7 @@ define amdgpu_cs void @atomic_xor_and_format(<4 x i32> inreg %arg) {
; IR-NEXT: br label [[TMP12]]
; IR: 12:
; IR-NEXT: [[TMP13:%.*]] = phi i32 [ poison, [[DOTENTRY:%.*]] ], [ [[TMP11]], [[TMP10]] ]
-; IR-NEXT: [[TMP14:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP13]])
+; IR-NEXT: [[TMP14:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP13]])
; IR-NEXT: [[TMP15:%.*]] = and i32 [[TMP5]], 1
; IR-NEXT: [[TMP16:%.*]] = xor i32 [[TMP14]], [[TMP15]]
; IR-NEXT: call void @llvm.amdgcn.struct.buffer.store.format.v4i32(<4 x i32> [[ARG]], <4 x i32> [[ARG]], i32 [[TMP16]], i32 0, i32 0, i32 0)
@@ -374,7 +373,7 @@ define amdgpu_cs void @atomic_ptr_add_and_format(ptr addrspace(8) inreg %arg) {
; IR-NEXT: br label [[TMP11]]
; IR: 11:
; IR-NEXT: [[TMP12:%.*]] = phi i32 [ poison, [[DOTENTRY:%.*]] ], [ [[TMP10]], [[TMP9]] ]
-; IR-NEXT: [[TMP13:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP12]])
+; IR-NEXT: [[TMP13:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP12]])
; IR-NEXT: [[TMP14:%.*]] = add i32 [[TMP13]], [[TMP5]]
; IR-NEXT: [[ARG_INT:%.*]] = ptrtoint ptr addrspace(8) [[ARG]] to i128
; IR-NEXT: [[ARG_VEC:%.*]] = bitcast i128 [[ARG_INT]] to <4 x i32>
@@ -476,7 +475,7 @@ define amdgpu_cs void @atomic_ptr_sub_and_format(ptr addrspace(8) inreg %arg) {
; IR-NEXT: br label [[TMP11]]
; IR: 11:
; IR-NEXT: [[TMP12:%.*]] = phi i32 [ poison, [[DOTENTRY:%.*]] ], [ [[TMP10]], [[TMP9]] ]
-; IR-NEXT: [[TMP13:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP12]])
+; IR-NEXT: [[TMP13:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP12]])
; IR-NEXT: [[TMP14:%.*]] = sub i32 [[TMP13]], [[TMP5]]
; IR-NEXT: [[ARG_INT:%.*]] = ptrtoint ptr addrspace(8) [[ARG]] to i128
; IR-NEXT: [[ARG_VEC:%.*]] = bitcast i128 [[ARG_INT]] to <4 x i32>
@@ -581,7 +580,7 @@ define amdgpu_cs void @atomic_ptr_xor_and_format(ptr addrspace(8) inreg %arg) {
; IR-NEXT: br label [[TMP12]]
; IR: 12:
; IR-NEXT: [[TMP13:%.*]] = phi i3...
[truncated]
|
Added/updated tests for readfirstlane and writelane ops |
Gentle ping :) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
A separate patch should have AMDGPUInstCombineIntrinsic try to fold bitcasts into the intrinsic.
The new commit extends @jayfoad's implementation with GlobalISel support. Tests for half, float and some vector types are yet to be added. |
That's another option. The only real plus to the intermediate is it's slightly less annoying to write combines for. But there are limited combining opportunities for these |
We now legalize to intrinsics directly. The SDAG lowering uses a new helper to unroll vector cases while also handling convergence glue operands. |
…ing for generic types (llvm#89217) This patch is intended to be the first of a series with end goal to adapt atomic optimizer pass to support i64 and f64 operations (along with removing all unnecessary bitcasts). This legalizes 64 bit readlane, writelane and readfirstlane ops pre-ISel --------- Co-authored-by: vikramRH <[email protected]>
…ring for generic types (llvm#92725) These are incremental changes over llvm#89217 , with core logic being the same. This patch along with llvm#89217 and llvm#91190 should get us ready to enable 64 bit optimizations in atomic optimizer.
…#91190) Follow up patch to llvm#89217, before we make changes to atomic optimizer. Change-Id: I3857ef9314db77354875a3f3f2e1eb7f5fe8067a
…ing for generic types (llvm#89217) This patch is intended to be the first of a series with end goal to adapt atomic optimizer pass to support i64 and f64 operations (along with removing all unnecessary bitcasts). This legalizes 64 bit readlane, writelane and readfirstlane ops pre-ISel --------- Co-authored-by: vikramRH <[email protected]> Change-Id: Iab334da95db7aea86d9816e6e11fc609f9e7c533
…ring for generic types (llvm#92725) These are incremental changes over llvm#89217 , with core logic being the same. This patch along with llvm#89217 and llvm#91190 should get us ready to enable 64 bit optimizations in atomic optimizer. Change-Id: Ief70422a47461606c29134b217f40204ee4a198b
…#91190) Follow up patch to llvm#89217, before we make changes to atomic optimizer. Change-Id: If0e36752e59da11e596b9e008044b9cc1f69a36a
…ing for generic types (llvm#89217) This patch is intended to be the first of a series with end goal to adapt atomic optimizer pass to support i64 and f64 operations (along with removing all unnecessary bitcasts). This legalizes 64 bit readlane, writelane and readfirstlane ops pre-ISel Change-Id: I9d302867e39316767b2aabcf876e9ea7a9e484e0
…ring for generic types (llvm#92725) These are incremental changes over llvm#89217 , with core logic being the same. This patch along with llvm#89217 and llvm#91190 should get us ready to enable 64 bit optimizations in atomic optimizer. Change-Id: Ief70422a47461606c29134b217f40204ee4a198b
This patch is intended to be the first of a series with the end goal of adapting the atomic optimizer pass to support i64 and f64 operations (along with removing all unnecessary bitcasts). This patch introduces pseudos for 64-bit readlane, writelane and readfirstlane ops, which are lowered post-ISel. Requesting comments on the approach here,