Skip to content

Commit 69ff815

Browse files
easyonaadit and arsenm
authored and committed
[AMDGPU] Extend wave reduce intrinsics for i32 type (llvm#126469)
Currently, wave reduction intrinsics are supported for `umin` and `umax` operations for `i32` type only. This patch extends support for the following operations: `add`, `sub`, `min`, `max`, `and`, `or`, `xor` for `i32` type. --------- Co-authored-by: Matt Arsenault <[email protected]>
1 parent 20a1ddf commit 69ff815

15 files changed

+7927
-255
lines changed

llvm/include/llvm/IR/IntrinsicsAMDGPU.td

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2347,8 +2347,14 @@ class AMDGPUWaveReduce<LLVMType data_ty = llvm_anyint_ty> : Intrinsic<
23472347
],
23482348
[IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree, ImmArg<ArgIndex<1>>]>;
23492349

2350-
def int_amdgcn_wave_reduce_umin : AMDGPUWaveReduce;
2351-
def int_amdgcn_wave_reduce_umax : AMDGPUWaveReduce;
2350+
// Defines one AMDGPUWaveReduce intrinsic record per operation name listed in
// the foreach below. Each record is named by the operation string, which is
// combined with the prefix supplied at the defm site, and uses
// AMDGPUWaveReduce's default template argument.
multiclass AMDGPUWaveReduceOps {
2351+
foreach Op =
2352+
["umin", "min", "umax", "max", "add", "sub", "and", "or", "xor"] in {
2353+
def Op : AMDGPUWaveReduce; // one record per operation name
2354+
}
2355+
}
2356+
2357+
defm int_amdgcn_wave_reduce_ : AMDGPUWaveReduceOps;
23522358

23532359
def int_amdgcn_readfirstlane :
23542360
Intrinsic<[llvm_any_ty], [LLVMMatchType<0>],

llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5006,8 +5006,15 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
50065006
OpdsMapping[2] = AMDGPU::getValueMapping(MaskBank, MaskSize);
50075007
break;
50085008
}
5009+
case Intrinsic::amdgcn_wave_reduce_add:
5010+
case Intrinsic::amdgcn_wave_reduce_sub:
5011+
case Intrinsic::amdgcn_wave_reduce_min:
50095012
case Intrinsic::amdgcn_wave_reduce_umin:
5010-
case Intrinsic::amdgcn_wave_reduce_umax: {
5013+
case Intrinsic::amdgcn_wave_reduce_max:
5014+
case Intrinsic::amdgcn_wave_reduce_umax:
5015+
case Intrinsic::amdgcn_wave_reduce_and:
5016+
case Intrinsic::amdgcn_wave_reduce_or:
5017+
case Intrinsic::amdgcn_wave_reduce_xor: {
50115018
unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
50125019
OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, DstSize);
50135020
unsigned OpSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 111 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -5040,6 +5040,28 @@ static MachineBasicBlock *emitIndirectDst(MachineInstr &MI,
50405040
return LoopBB;
50415041
}
50425042

5043+
// Returns the 32-bit identity value for the wave reduction that is lowered
// with the scalar opcode Opc, i.e. the value x for which `x op v == v`.
// Signed limits are returned through the uint32_t return type, so they are
// converted to their 32-bit unsigned bit pattern. Any opcode not listed in
// the switch reaches llvm_unreachable.
static uint32_t getIdentityValueForWaveReduction(unsigned Opc) {
5044+
switch (Opc) {
5045+
case AMDGPU::S_MIN_U32:
5046+
return std::numeric_limits<uint32_t>::max(); // unsigned min: start from UINT32_MAX
5047+
case AMDGPU::S_MIN_I32:
5048+
return std::numeric_limits<int32_t>::max(); // signed min: start from INT32_MAX
5049+
case AMDGPU::S_MAX_U32:
5050+
return std::numeric_limits<uint32_t>::min(); // unsigned max: start from 0
5051+
case AMDGPU::S_MAX_I32:
5052+
return std::numeric_limits<int32_t>::min(); // signed max: INT32_MIN, 0x80000000 as uint32_t
5053+
case AMDGPU::S_ADD_I32:
5054+
case AMDGPU::S_SUB_I32:
5055+
case AMDGPU::S_OR_B32:
5056+
case AMDGPU::S_XOR_B32:
5057+
return std::numeric_limits<uint32_t>::min(); // add/sub/or/xor: 0 leaves the other operand unchanged
5058+
case AMDGPU::S_AND_B32:
5059+
return std::numeric_limits<uint32_t>::max(); // and: all bits set leaves the other operand unchanged
5060+
default:
5061+
llvm_unreachable("Unexpected opcode in getIdentityValueForWaveReduction");
5062+
}
5063+
}
5064+
50435065
static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
50445066
MachineBasicBlock &BB,
50455067
const GCNSubtarget &ST,
@@ -5055,13 +5077,78 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
50555077
Register DstReg = MI.getOperand(0).getReg();
50565078
MachineBasicBlock *RetBB = nullptr;
50575079
if (isSGPR) {
5058-
// These operations with a uniform value i.e. SGPR are idempotent.
5059-
// Reduced value will be same as given sgpr.
5060-
// clang-format off
5061-
BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MOV_B32), DstReg)
5062-
.addReg(SrcReg);
5063-
// clang-format on
5064-
RetBB = &BB;
5080+
switch (Opc) {
5081+
case AMDGPU::S_MIN_U32:
5082+
case AMDGPU::S_MIN_I32:
5083+
case AMDGPU::S_MAX_U32:
5084+
case AMDGPU::S_MAX_I32:
5085+
case AMDGPU::S_AND_B32:
5086+
case AMDGPU::S_OR_B32: {
5087+
// Idempotent operations.
5088+
BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MOV_B32), DstReg).addReg(SrcReg);
5089+
RetBB = &BB;
5090+
break;
5091+
}
5092+
case AMDGPU::S_XOR_B32:
5093+
case AMDGPU::S_ADD_I32:
5094+
case AMDGPU::S_SUB_I32: {
5095+
const TargetRegisterClass *WaveMaskRegClass = TRI->getWaveMaskRegClass();
5096+
const TargetRegisterClass *DstRegClass = MRI.getRegClass(DstReg);
5097+
Register ExecMask = MRI.createVirtualRegister(WaveMaskRegClass);
5098+
Register ActiveLanes = MRI.createVirtualRegister(DstRegClass);
5099+
5100+
bool IsWave32 = ST.isWave32();
5101+
unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
5102+
MCRegister ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
5103+
unsigned CountReg =
5104+
IsWave32 ? AMDGPU::S_BCNT1_I32_B32 : AMDGPU::S_BCNT1_I32_B64;
5105+
5106+
auto Exec =
5107+
BuildMI(BB, MI, DL, TII->get(MovOpc), ExecMask).addReg(ExecReg);
5108+
5109+
auto NewAccumulator = BuildMI(BB, MI, DL, TII->get(CountReg), ActiveLanes)
5110+
.addReg(Exec->getOperand(0).getReg());
5111+
5112+
switch (Opc) {
5113+
case AMDGPU::S_XOR_B32: {
5114+
// Performing an XOR operation on a uniform value
5115+
// depends on the parity of the number of active lanes.
5116+
// For even parity, the result will be 0, for odd
5117+
// parity the result will be the same as the input value.
5118+
Register ParityRegister = MRI.createVirtualRegister(DstRegClass);
5119+
5120+
auto ParityReg =
5121+
BuildMI(BB, MI, DL, TII->get(AMDGPU::S_AND_B32), ParityRegister)
5122+
.addReg(NewAccumulator->getOperand(0).getReg())
5123+
.addImm(1);
5124+
BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
5125+
.addReg(SrcReg)
5126+
.addReg(ParityReg->getOperand(0).getReg());
5127+
break;
5128+
}
5129+
case AMDGPU::S_SUB_I32: {
5130+
Register NegatedVal = MRI.createVirtualRegister(DstRegClass);
5131+
5132+
// Take the negation of the source operand.
5133+
auto InvertedValReg =
5134+
BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), NegatedVal)
5135+
.addImm(-1)
5136+
.addReg(SrcReg);
5137+
BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
5138+
.addReg(InvertedValReg->getOperand(0).getReg())
5139+
.addReg(NewAccumulator->getOperand(0).getReg());
5140+
break;
5141+
}
5142+
case AMDGPU::S_ADD_I32: {
5143+
BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
5144+
.addReg(SrcReg)
5145+
.addReg(NewAccumulator->getOperand(0).getReg());
5146+
break;
5147+
}
5148+
}
5149+
RetBB = &BB;
5150+
}
5151+
}
50655152
} else {
50665153
// TODO: Implement DPP Strategy and switch based on immediate strategy
50675154
// operand. For now, for all the cases (default, Iterative and DPP we use
@@ -5097,10 +5184,9 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
50975184
unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
50985185
unsigned ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
50995186

5100-
// Create initail values of induction variable from Exec, Accumulator and
5101-
// insert branch instr to newly created ComputeBlockk
5102-
uint32_t InitalValue =
5103-
(Opc == AMDGPU::S_MIN_U32) ? std::numeric_limits<uint32_t>::max() : 0;
5187+
// Create initial values of induction variable from Exec, Accumulator and
5188+
// insert branch instr to newly created ComputeBlock
5189+
uint32_t InitalValue = getIdentityValueForWaveReduction(Opc);
51045190
auto TmpSReg =
51055191
BuildMI(BB, I, DL, TII->get(MovOpc), LoopIterator).addReg(ExecReg);
51065192
BuildMI(BB, I, DL, TII->get(AMDGPU::S_MOV_B32), InitalValReg)
@@ -5172,8 +5258,22 @@ SITargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
51725258
switch (MI.getOpcode()) {
51735259
case AMDGPU::WAVE_REDUCE_UMIN_PSEUDO_U32:
51745260
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MIN_U32);
5261+
case AMDGPU::WAVE_REDUCE_MIN_PSEUDO_I32:
5262+
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MIN_I32);
51755263
case AMDGPU::WAVE_REDUCE_UMAX_PSEUDO_U32:
51765264
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MAX_U32);
5265+
case AMDGPU::WAVE_REDUCE_MAX_PSEUDO_I32:
5266+
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MAX_I32);
5267+
case AMDGPU::WAVE_REDUCE_ADD_PSEUDO_I32:
5268+
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_ADD_I32);
5269+
case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_I32:
5270+
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_SUB_I32);
5271+
case AMDGPU::WAVE_REDUCE_AND_PSEUDO_B32:
5272+
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_AND_B32);
5273+
case AMDGPU::WAVE_REDUCE_OR_PSEUDO_B32:
5274+
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_OR_B32);
5275+
case AMDGPU::WAVE_REDUCE_XOR_PSEUDO_B32:
5276+
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_XOR_B32);
51775277
case AMDGPU::S_UADDO_PSEUDO:
51785278
case AMDGPU::S_USUBO_PSEUDO: {
51795279
const DebugLoc &DL = MI.getDebugLoc();

llvm/lib/Target/AMDGPU/SIInstructions.td

Lines changed: 21 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -303,16 +303,29 @@ def : GCNPat <(vt (int_amdgcn_set_inactive vt:$src, vt:$inactive)),
303303
def : GCNPat<(i32 (int_amdgcn_set_inactive_chain_arg i32:$src, i32:$inactive)),
304304
(V_SET_INACTIVE_B32 0, VGPR_32:$src, 0, VGPR_32:$inactive, (IMPLICIT_DEF))>;
305305

306-
let usesCustomInserter = 1, hasSideEffects = 0, mayLoad = 0, mayStore = 0, Uses = [EXEC] in {
307-
def WAVE_REDUCE_UMIN_PSEUDO_U32 : VPseudoInstSI <(outs SGPR_32:$sdst),
308-
(ins VSrc_b32: $src, VSrc_b32:$strategy),
309-
[(set i32:$sdst, (int_amdgcn_wave_reduce_umin i32:$src, i32:$strategy))]> {
306+
// clang-format off
307+
defvar int_amdgcn_wave_reduce_ = "int_amdgcn_wave_reduce_";
308+
// Generates one wave-reduce pseudo instruction.
//   Op       - operation name; it is upper-cased for the record name and is
//              appended to the "int_amdgcn_wave_reduce_" string to look up
//              the intrinsic record matched by the selection pattern.
//   DataType - suffix appended after "_PSEUDO_" in the record name.
// The pseudo writes an SGPR_32 result, takes VSrc_b32 $src and $strategy
// operands, and is marked usesCustomInserter = 1 with Uses = [EXEC].
multiclass
309+
AMDGPUWaveReducePseudoGenerator<string Op, string DataType> {
310+
let usesCustomInserter = 1, hasSideEffects = 0, mayLoad = 0, mayStore = 0, Uses = [EXEC] in {
311+
def !toupper(Op) #"_PSEUDO_" #DataType
312+
: VPseudoInstSI<(outs SGPR_32 : $sdst),
313+
(ins VSrc_b32 : $src, VSrc_b32 : $strategy),
314+
[(set i32 : $sdst, (!cast<AMDGPUWaveReduce>(int_amdgcn_wave_reduce_ #Op) i32 : $src, i32 : $strategy))]> {}
310315
}
316+
}
317+
// clang-format on
311318

312-
def WAVE_REDUCE_UMAX_PSEUDO_U32 : VPseudoInstSI <(outs SGPR_32:$sdst),
313-
(ins VSrc_b32: $src, VSrc_b32:$strategy),
314-
[(set i32:$sdst, (int_amdgcn_wave_reduce_umax i32:$src, i32:$strategy))]> {
315-
}
319+
// Input list : [Operation_name,
320+
// type - Signed(I)/Unsigned(U)/Float(F)/Bitwise(B)]
321+
// Each entry is [intrinsic operation name, pseudo name suffix]; the foreach
// below instantiates one pseudo per entry.
defvar Operations = [
322+
["umin", "U32"], ["min", "I32"], ["umax", "U32"], ["max", "I32"],
323+
["add", "I32"], ["sub", "I32"], ["and", "B32"], ["or", "B32"],
324+
["xor", "B32"]
325+
];
326+
327+
foreach Op = Operations in {
328+
defm WAVE_REDUCE_ : AMDGPUWaveReducePseudoGenerator<Op[0], Op[1]>; // e.g. ["umin", "U32"] -> WAVE_REDUCE_UMIN_PSEUDO_U32
316329
}
317330

318331
let usesCustomInserter = 1, Defs = [VCC] in {

0 commit comments

Comments
 (0)