Skip to content

Commit 21f7225

Browse files
vmustyaigcbot
authored and committed
Support vector reduce intrinsics lowering
VC lowers `@llvm.vector.reduce.*` intrinsics as follows: * If an input vector has more than N elements, emit a linear SIMD sequence of operations; N is target dependent. * Otherwise, emit a tree-like reduction sequence. The linear sequence produces fewer operations than the tree-like one for large vectors, and also lets us utilize accumulator registers.
1 parent 14c15f0 commit 21f7225

File tree

2 files changed

+599
-1
lines changed

2 files changed

+599
-1
lines changed

IGC/VectorCompiler/lib/GenXCodeGen/GenXLowering.cpp

Lines changed: 141 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
/*========================== begin_copyright_notice ============================
22
3-
Copyright (C) 2017-2023 Intel Corporation
3+
Copyright (C) 2017-2024 Intel Corporation
44
55
SPDX-License-Identifier: MIT
66
@@ -263,6 +263,13 @@ class GenXLowering : public FunctionPass {
263263
Value *swapLowHighHalves(IRBuilder<> &Builder, Value *Arg) const;
264264
bool lowerByteSwap(CallInst *CI);
265265

266+
template <typename BuilderOp>
267+
bool lowerReduction(CallInst *CI, Value *Src, Value *Start,
268+
BuilderOp Builder);
269+
270+
bool lowerReduction(CallInst *CI, Instruction::BinaryOps Opcode);
271+
bool lowerReduction(CallInst *CI, Intrinsic::ID);
272+
266273
bool generatePredicatedWrrForNewLoad(CallInst *CI);
267274
};
268275

@@ -2128,6 +2135,33 @@ bool GenXLowering::processInst(Instruction *Inst) {
21282135
return lowerStackSave(CI);
21292136
case Intrinsic::stackrestore:
21302137
return lowerStackRestore(CI);
2138+
#if LLVM_VERSION_MAJOR >= 12
2139+
case Intrinsic::vector_reduce_add:
2140+
return lowerReduction(CI, Instruction::Add);
2141+
case Intrinsic::vector_reduce_mul:
2142+
return lowerReduction(CI, Instruction::Mul);
2143+
case Intrinsic::vector_reduce_fadd:
2144+
return lowerReduction(CI, Instruction::FAdd);
2145+
case Intrinsic::vector_reduce_fmul:
2146+
return lowerReduction(CI, Instruction::FMul);
2147+
case Intrinsic::vector_reduce_fmax:
2148+
return lowerReduction(CI, Intrinsic::maxnum);
2149+
case Intrinsic::vector_reduce_fmin:
2150+
return lowerReduction(CI, Intrinsic::minnum);
2151+
#else // LLVM_VERSION_MAJOR >= 12
2152+
case Intrinsic::experimental_vector_reduce_add:
2153+
return lowerReduction(CI, Instruction::Add);
2154+
case Intrinsic::experimental_vector_reduce_mul:
2155+
return lowerReduction(CI, Instruction::Mul);
2156+
case Intrinsic::experimental_vector_reduce_v2_fadd:
2157+
return lowerReduction(CI, Instruction::FAdd);
2158+
case Intrinsic::experimental_vector_reduce_v2_fmul:
2159+
return lowerReduction(CI, Instruction::FMul);
2160+
case Intrinsic::experimental_vector_reduce_fmax:
2161+
return lowerReduction(CI, Intrinsic::maxnum);
2162+
case Intrinsic::experimental_vector_reduce_fmin:
2163+
return lowerReduction(CI, Intrinsic::minnum);
2164+
#endif // LLVM_VERSION_MAJOR >= 12
21312165
case GenXIntrinsic::genx_get_hwid:
21322166
return lowerHardwareThreadID(CI);
21332167
case vc::InternalIntrinsic::logical_thread_id:
@@ -4551,6 +4585,112 @@ bool GenXLowering::lowerLogicalThreadID(CallInst *CI) {
45514585
return true;
45524586
}
45534587

4588+
template <typename BuilderOp>
4589+
bool GenXLowering::lowerReduction(CallInst *CI, Value *Src, Value *Start,
4590+
BuilderOp Builder) {
4591+
const auto &DebugLoc = CI->getDebugLoc();
4592+
4593+
auto *Ty = CI->getType();
4594+
// VC doesn't support lowering of ordered floating-point reduction
4595+
if (Ty->isFloatingPointTy() && !CI->hasAllowReassoc())
4596+
return false;
4597+
4598+
auto *SrcVTy = cast<IGCLLVM::FixedVectorType>(Src->getType());
4599+
auto SrcWidth = SrcVTy->getNumElements();
4600+
4601+
const uint64_t MaxSimd = 2 * ST->getGRFByteSize() * genx::ByteBits /
4602+
DL->getTypeStoreSizeInBits(Ty);
4603+
const auto LinearGrain = std::min<uint64_t>(32, MaxSimd);
4604+
auto TailWidth = SrcWidth % LinearGrain;
4605+
const auto LinearWidth = SrcWidth - TailWidth;
4606+
auto TailIndex = LinearWidth;
4607+
4608+
auto *Acc = Src;
4609+
4610+
if (LinearWidth > LinearGrain) {
4611+
IGC_ASSERT(LinearWidth % LinearGrain == 0);
4612+
auto *AccTy = IGCLLVM::FixedVectorType::get(Ty, LinearGrain);
4613+
4614+
vc::CMRegion R(AccTy, DL);
4615+
R.Offset = 0;
4616+
4617+
Acc = R.createRdRegion(Src, "", CI, DebugLoc);
4618+
4619+
const auto GrainBytes = LinearGrain * R.ElementBytes;
4620+
R.Offset = GrainBytes;
4621+
for (; R.getOffsetInElements() < LinearWidth; R.Offset += GrainBytes) {
4622+
auto *NewRgn = R.createRdRegion(Src, "", CI, DebugLoc);
4623+
Acc = Builder(Acc, NewRgn);
4624+
}
4625+
SrcWidth = LinearGrain;
4626+
} else if (!isPowerOf2_32(SrcWidth)) {
4627+
TailIndex = PowerOf2Floor(SrcWidth);
4628+
TailWidth = SrcWidth % TailIndex;
4629+
SrcWidth = TailIndex;
4630+
}
4631+
4632+
for (SrcWidth /= 2; SrcWidth > 0; SrcWidth /= 2) {
4633+
auto *OpTy = IGCLLVM::FixedVectorType::get(Ty, SrcWidth);
4634+
vc::CMRegion R(OpTy, DL);
4635+
4636+
R.Offset = 0;
4637+
auto *Op0 = R.createRdRegion(Acc, "", CI, DebugLoc);
4638+
4639+
R.Offset = R.ElementBytes * SrcWidth;
4640+
auto *Op1 = R.createRdRegion(Acc, "", CI, DebugLoc);
4641+
4642+
Acc = Builder(Op0, Op1);
4643+
4644+
if ((TailWidth & SrcWidth) != 0) {
4645+
vc::CMRegion RTail(OpTy, DL);
4646+
R.Offset = TailIndex * R.ElementBytes;
4647+
auto *Tail = R.createRdRegion(Src, "", CI, DebugLoc);
4648+
4649+
Acc = Builder(Acc, Tail);
4650+
TailIndex += SrcWidth;
4651+
TailWidth -= SrcWidth;
4652+
}
4653+
}
4654+
4655+
IGC_ASSERT(TailWidth == 0);
4656+
4657+
IRBuilder<> IRB(CI);
4658+
auto *Res = IRB.CreateBitCast(Acc, Ty);
4659+
if (Start)
4660+
Res = Builder(Res, Start);
4661+
4662+
CI->replaceAllUsesWith(Res);
4663+
ToErase.push_back(CI);
4664+
return true;
4665+
}
4666+
4667+
// Lowers a vector reduction intrinsic that maps onto a plain binary operator
// (add/mul/fadd/fmul). The floating-point forms carry an explicit start value
// as their first operand and the vector as the second; the integer forms take
// only the vector.
bool GenXLowering::lowerReduction(CallInst *CI, Instruction::BinaryOps Opcode) {
  const bool HasStartValue =
      Opcode == Instruction::FAdd || Opcode == Instruction::FMul;

  Value *Start = HasStartValue ? CI->getArgOperand(0) : nullptr;
  Value *Src = CI->getArgOperand(HasStartValue ? 1 : 0);

  IRBuilder<> Builder(CI);

  auto CombineOp = [&](Value *A, Value *B) {
    return Builder.CreateBinOp(Opcode, A, B);
  };
  return lowerReduction(CI, Src, Start, CombineOp);
}
4682+
4683+
// Lowers a vector reduction intrinsic whose combining operation is itself an
// intrinsic (e.g. maxnum/minnum for fmax/fmin reductions). These forms take
// only the vector operand and have no start value.
bool GenXLowering::lowerReduction(CallInst *CI, Intrinsic::ID IID) {
  IRBuilder<> Builder(CI);

  auto CombineOp = [&](Value *A, Value *B) {
    return Builder.CreateBinaryIntrinsic(IID, A, B);
  };
  return lowerReduction(CI, CI->getArgOperand(0), /*Start=*/nullptr, CombineOp);
}
4693+
45544694
/***********************************************************************
45554695
* widenByteOp : widen a vector byte operation to short if that might
45564696
* improve code

0 commit comments

Comments
 (0)