@@ -5040,6 +5040,28 @@ static MachineBasicBlock *emitIndirectDst(MachineInstr &MI,
   return LoopBB;
 }
 
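+// Identity (neutral) element for \p Opc: seeding a reduction accumulator
+// with this value leaves the reduction result unchanged.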
+static uint32_t getIdentityValueForWaveReduction(unsigned Opc) {
+  switch (Opc) {
+  case AMDGPU::S_MIN_U32:
+    return std::numeric_limits<uint32_t>::max();
+  case AMDGPU::S_MIN_I32:
+    return std::numeric_limits<int32_t>::max();
+  case AMDGPU::S_MAX_U32:
+    return std::numeric_limits<uint32_t>::min();
+  case AMDGPU::S_MAX_I32:
+    return std::numeric_limits<int32_t>::min();
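+  // 0 is the identity for add, or, and xor; it also seeds sub, whose
+  // iterative form subtracts each lane's value from the accumulator.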
+  case AMDGPU::S_ADD_I32:
+  case AMDGPU::S_SUB_I32:
+  case AMDGPU::S_OR_B32:
+  case AMDGPU::S_XOR_B32:
+    return std::numeric_limits<uint32_t>::min();
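+  // All-ones is the identity for bitwise and.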
+  case AMDGPU::S_AND_B32:
+    return std::numeric_limits<uint32_t>::max();
+  default:
+    llvm_unreachable("Unexpected opcode in getIdentityValueForWaveReduction");
+  }
+}
+
 static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
                                           MachineBasicBlock &BB,
                                           const GCNSubtarget &ST,
@@ -5055,13 +5077,78 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
   Register DstReg = MI.getOperand(0).getReg();
   MachineBasicBlock *RetBB = nullptr;
   if (isSGPR) {
-    // These operations with a uniform value i.e. SGPR are idempotent.
-    // Reduced value will be same as given sgpr.
-    // clang-format off
-    BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MOV_B32), DstReg)
-        .addReg(SrcReg);
-    // clang-format on
-    RetBB = &BB;
+    switch (Opc) {
+    case AMDGPU::S_MIN_U32:
+    case AMDGPU::S_MIN_I32:
+    case AMDGPU::S_MAX_U32:
+    case AMDGPU::S_MAX_I32:
+    case AMDGPU::S_AND_B32:
+    case AMDGPU::S_OR_B32: {
+      // Idempotent operations.
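+      // (x op x) == x for min, max, and, and or, so reducing a uniform
+      // value is a no-op and a single copy of SrcReg suffices.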
+      BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MOV_B32), DstReg).addReg(SrcReg);
+      RetBB = &BB;
+      break;
+    }
+    case AMDGPU::S_XOR_B32:
+    case AMDGPU::S_ADD_I32:
+    case AMDGPU::S_SUB_I32: {
+      const TargetRegisterClass *WaveMaskRegClass = TRI->getWaveMaskRegClass();
+      const TargetRegisterClass *DstRegClass = MRI.getRegClass(DstReg);
+      Register ExecMask = MRI.createVirtualRegister(WaveMaskRegClass);
+      Register ActiveLanes = MRI.createVirtualRegister(DstRegClass);
+
+      bool IsWave32 = ST.isWave32();
+      unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
+      MCRegister ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
+      unsigned CountReg =
+          IsWave32 ? AMDGPU::S_BCNT1_I32_B32 : AMDGPU::S_BCNT1_I32_B64;
+
+      auto Exec =
+          BuildMI(BB, MI, DL, TII->get(MovOpc), ExecMask).addReg(ExecReg);
+
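+      // S_BCNT1 of the saved EXEC mask yields the number of active lanes.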
+      auto NewAccumulator = BuildMI(BB, MI, DL, TII->get(CountReg), ActiveLanes)
+                                .addReg(Exec->getOperand(0).getReg());
+
+      switch (Opc) {
+      case AMDGPU::S_XOR_B32: {
+        // XOR-reducing a uniform value depends only on the parity of the
+        // number of active lanes: the result is 0 for even parity and the
+        // input value for odd parity.
+        Register ParityRegister = MRI.createVirtualRegister(DstRegClass);
+
+        auto ParityReg =
+            BuildMI(BB, MI, DL, TII->get(AMDGPU::S_AND_B32), ParityRegister)
+                .addReg(NewAccumulator->getOperand(0).getReg())
+                .addImm(1);
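+        // The parity bit then selects between 0 and SrcReg via a multiply.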
+        BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
+            .addReg(SrcReg)
+            .addReg(ParityReg->getOperand(0).getReg());
+        break;
+      }
+      case AMDGPU::S_SUB_I32: {
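+        // Each active lane subtracts the same uniform value once, so the
+        // reduction folds to -SrcReg * <number of active lanes>.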
+        Register NegatedVal = MRI.createVirtualRegister(DstRegClass);
+
+        // Take the negation of the source operand.
+        auto InvertedValReg =
+            BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), NegatedVal)
+                .addImm(-1)
+                .addReg(SrcReg);
+        BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
+            .addReg(InvertedValReg->getOperand(0).getReg())
+            .addReg(NewAccumulator->getOperand(0).getReg());
+        break;
+      }
+      case AMDGPU::S_ADD_I32: {
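+        // Adding the same uniform value once per active lane folds to
+        // SrcReg * <number of active lanes>.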
+        BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
+            .addReg(SrcReg)
+            .addReg(NewAccumulator->getOperand(0).getReg());
+        break;
+      }
+      }
+      RetBB = &BB;
+    }
+    }
   } else {
     // TODO: Implement DPP Strategy and switch based on immediate strategy
     // operand. For now, for all the cases (default, Iterative and DPP we use
@@ -5097,10 +5184,9 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
     unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
     unsigned ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
 
-    // Create initail values of induction variable from Exec, Accumulator and
-    // insert branch instr to newly created ComputeBlockk
-    uint32_t InitalValue =
-        (Opc == AMDGPU::S_MIN_U32) ? std::numeric_limits<uint32_t>::max() : 0;
+    // Create initial values of the induction variable from Exec and the
+    // accumulator, and insert a branch to the newly created ComputeBlock.
+    uint32_t InitalValue = getIdentityValueForWaveReduction(Opc);
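+    // Seeding the accumulator with the identity value makes the first loop
+    // iteration yield the first active lane's value unchanged.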
     auto TmpSReg =
         BuildMI(BB, I, DL, TII->get(MovOpc), LoopIterator).addReg(ExecReg);
     BuildMI(BB, I, DL, TII->get(AMDGPU::S_MOV_B32), InitalValReg)
@@ -5172,8 +5258,22 @@ SITargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
   switch (MI.getOpcode()) {
   case AMDGPU::WAVE_REDUCE_UMIN_PSEUDO_U32:
     return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MIN_U32);
+  case AMDGPU::WAVE_REDUCE_MIN_PSEUDO_I32:
+    return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MIN_I32);
   case AMDGPU::WAVE_REDUCE_UMAX_PSEUDO_U32:
     return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MAX_U32);
+  case AMDGPU::WAVE_REDUCE_MAX_PSEUDO_I32:
+    return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MAX_I32);
+  case AMDGPU::WAVE_REDUCE_ADD_PSEUDO_I32:
+    return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_ADD_I32);
+  case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_I32:
+    return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_SUB_I32);
+  case AMDGPU::WAVE_REDUCE_AND_PSEUDO_B32:
+    return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_AND_B32);
+  case AMDGPU::WAVE_REDUCE_OR_PSEUDO_B32:
+    return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_OR_B32);
+  case AMDGPU::WAVE_REDUCE_XOR_PSEUDO_B32:
+    return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_XOR_B32);
   case AMDGPU::S_UADDO_PSEUDO:
   case AMDGPU::S_USUBO_PSEUDO: {
     const DebugLoc &DL = MI.getDebugLoc();