|
1 | 1 | /*========================== begin_copyright_notice ============================
|
2 | 2 |
|
3 |
| -Copyright (C) 2017-2023 Intel Corporation |
| 3 | +Copyright (C) 2017-2024 Intel Corporation |
4 | 4 |
|
5 | 5 | SPDX-License-Identifier: MIT
|
6 | 6 |
|
@@ -263,6 +263,13 @@ class GenXLowering : public FunctionPass {
|
263 | 263 | Value *swapLowHighHalves(IRBuilder<> &Builder, Value *Arg) const;
|
264 | 264 | bool lowerByteSwap(CallInst *CI);
|
265 | 265 |
|
| 266 | + template <typename BuilderOp> |
| 267 | + bool lowerReduction(CallInst *CI, Value *Src, Value *Start, |
| 268 | + BuilderOp Builder); |
| 269 | + |
| 270 | + bool lowerReduction(CallInst *CI, Instruction::BinaryOps Opcode); |
| 271 | + bool lowerReduction(CallInst *CI, Intrinsic::ID); |
| 272 | + |
266 | 273 | bool generatePredicatedWrrForNewLoad(CallInst *CI);
|
267 | 274 | };
|
268 | 275 |
|
@@ -2128,6 +2135,33 @@ bool GenXLowering::processInst(Instruction *Inst) {
|
2128 | 2135 | return lowerStackSave(CI);
|
2129 | 2136 | case Intrinsic::stackrestore:
|
2130 | 2137 | return lowerStackRestore(CI);
|
| 2138 | +#if LLVM_VERSION_MAJOR >= 12 |
| 2139 | + case Intrinsic::vector_reduce_add: |
| 2140 | + return lowerReduction(CI, Instruction::Add); |
| 2141 | + case Intrinsic::vector_reduce_mul: |
| 2142 | + return lowerReduction(CI, Instruction::Mul); |
| 2143 | + case Intrinsic::vector_reduce_fadd: |
| 2144 | + return lowerReduction(CI, Instruction::FAdd); |
| 2145 | + case Intrinsic::vector_reduce_fmul: |
| 2146 | + return lowerReduction(CI, Instruction::FMul); |
| 2147 | + case Intrinsic::vector_reduce_fmax: |
| 2148 | + return lowerReduction(CI, Intrinsic::maxnum); |
| 2149 | + case Intrinsic::vector_reduce_fmin: |
| 2150 | + return lowerReduction(CI, Intrinsic::minnum); |
| 2151 | +#else // LLVM_VERSION_MAJOR >= 12 |
| 2152 | + case Intrinsic::experimental_vector_reduce_add: |
| 2153 | + return lowerReduction(CI, Instruction::Add); |
| 2154 | + case Intrinsic::experimental_vector_reduce_mul: |
| 2155 | + return lowerReduction(CI, Instruction::Mul); |
| 2156 | + case Intrinsic::experimental_vector_reduce_v2_fadd: |
| 2157 | + return lowerReduction(CI, Instruction::FAdd); |
| 2158 | + case Intrinsic::experimental_vector_reduce_v2_fmul: |
| 2159 | + return lowerReduction(CI, Instruction::FMul); |
| 2160 | + case Intrinsic::experimental_vector_reduce_fmax: |
| 2161 | + return lowerReduction(CI, Intrinsic::maxnum); |
| 2162 | + case Intrinsic::experimental_vector_reduce_fmin: |
| 2163 | + return lowerReduction(CI, Intrinsic::minnum); |
| 2164 | +#endif // LLVM_VERSION_MAJOR >= 12 |
2131 | 2165 | case GenXIntrinsic::genx_get_hwid:
|
2132 | 2166 | return lowerHardwareThreadID(CI);
|
2133 | 2167 | case vc::InternalIntrinsic::logical_thread_id:
|
@@ -4551,6 +4585,112 @@ bool GenXLowering::lowerLogicalThreadID(CallInst *CI) {
|
4551 | 4585 | return true;
|
4552 | 4586 | }
|
4553 | 4587 |
|
| 4588 | +template <typename BuilderOp> |
| 4589 | +bool GenXLowering::lowerReduction(CallInst *CI, Value *Src, Value *Start, |
| 4590 | + BuilderOp Builder) { |
| 4591 | + const auto &DebugLoc = CI->getDebugLoc(); |
| 4592 | + |
| 4593 | + auto *Ty = CI->getType(); |
| 4594 | + // VC doesn't support lowering of ordered floating-point reduction |
| 4595 | + if (Ty->isFloatingPointTy() && !CI->hasAllowReassoc()) |
| 4596 | + return false; |
| 4597 | + |
| 4598 | + auto *SrcVTy = cast<IGCLLVM::FixedVectorType>(Src->getType()); |
| 4599 | + auto SrcWidth = SrcVTy->getNumElements(); |
| 4600 | + |
| 4601 | + const uint64_t MaxSimd = 2 * ST->getGRFByteSize() * genx::ByteBits / |
| 4602 | + DL->getTypeStoreSizeInBits(Ty); |
| 4603 | + const auto LinearGrain = std::min<uint64_t>(32, MaxSimd); |
| 4604 | + auto TailWidth = SrcWidth % LinearGrain; |
| 4605 | + const auto LinearWidth = SrcWidth - TailWidth; |
| 4606 | + auto TailIndex = LinearWidth; |
| 4607 | + |
| 4608 | + auto *Acc = Src; |
| 4609 | + |
| 4610 | + if (LinearWidth > LinearGrain) { |
| 4611 | + IGC_ASSERT(LinearWidth % LinearGrain == 0); |
| 4612 | + auto *AccTy = IGCLLVM::FixedVectorType::get(Ty, LinearGrain); |
| 4613 | + |
| 4614 | + vc::CMRegion R(AccTy, DL); |
| 4615 | + R.Offset = 0; |
| 4616 | + |
| 4617 | + Acc = R.createRdRegion(Src, "", CI, DebugLoc); |
| 4618 | + |
| 4619 | + const auto GrainBytes = LinearGrain * R.ElementBytes; |
| 4620 | + R.Offset = GrainBytes; |
| 4621 | + for (; R.getOffsetInElements() < LinearWidth; R.Offset += GrainBytes) { |
| 4622 | + auto *NewRgn = R.createRdRegion(Src, "", CI, DebugLoc); |
| 4623 | + Acc = Builder(Acc, NewRgn); |
| 4624 | + } |
| 4625 | + SrcWidth = LinearGrain; |
| 4626 | + } else if (!isPowerOf2_32(SrcWidth)) { |
| 4627 | + TailIndex = PowerOf2Floor(SrcWidth); |
| 4628 | + TailWidth = SrcWidth % TailIndex; |
| 4629 | + SrcWidth = TailIndex; |
| 4630 | + } |
| 4631 | + |
| 4632 | + for (SrcWidth /= 2; SrcWidth > 0; SrcWidth /= 2) { |
| 4633 | + auto *OpTy = IGCLLVM::FixedVectorType::get(Ty, SrcWidth); |
| 4634 | + vc::CMRegion R(OpTy, DL); |
| 4635 | + |
| 4636 | + R.Offset = 0; |
| 4637 | + auto *Op0 = R.createRdRegion(Acc, "", CI, DebugLoc); |
| 4638 | + |
| 4639 | + R.Offset = R.ElementBytes * SrcWidth; |
| 4640 | + auto *Op1 = R.createRdRegion(Acc, "", CI, DebugLoc); |
| 4641 | + |
| 4642 | + Acc = Builder(Op0, Op1); |
| 4643 | + |
| 4644 | + if ((TailWidth & SrcWidth) != 0) { |
| 4645 | + vc::CMRegion RTail(OpTy, DL); |
| 4646 | + R.Offset = TailIndex * R.ElementBytes; |
| 4647 | + auto *Tail = R.createRdRegion(Src, "", CI, DebugLoc); |
| 4648 | + |
| 4649 | + Acc = Builder(Acc, Tail); |
| 4650 | + TailIndex += SrcWidth; |
| 4651 | + TailWidth -= SrcWidth; |
| 4652 | + } |
| 4653 | + } |
| 4654 | + |
| 4655 | + IGC_ASSERT(TailWidth == 0); |
| 4656 | + |
| 4657 | + IRBuilder<> IRB(CI); |
| 4658 | + auto *Res = IRB.CreateBitCast(Acc, Ty); |
| 4659 | + if (Start) |
| 4660 | + Res = Builder(Res, Start); |
| 4661 | + |
| 4662 | + CI->replaceAllUsesWith(Res); |
| 4663 | + ToErase.push_back(CI); |
| 4664 | + return true; |
| 4665 | +} |
| 4666 | + |
| 4667 | +bool GenXLowering::lowerReduction(CallInst *CI, Instruction::BinaryOps Opcode) { |
| 4668 | + Value *Start = nullptr; |
| 4669 | + auto *Src = CI->getArgOperand(0); |
| 4670 | + |
| 4671 | + if (Opcode == Instruction::FAdd || Opcode == Instruction::FMul) { |
| 4672 | + Start = CI->getArgOperand(0); |
| 4673 | + Src = CI->getArgOperand(1); |
| 4674 | + } |
| 4675 | + |
| 4676 | + IRBuilder<> Builder(CI); |
| 4677 | + |
| 4678 | + return lowerReduction(CI, Src, Start, [&](Value *LHS, Value *RHS) { |
| 4679 | + return Builder.CreateBinOp(Opcode, LHS, RHS); |
| 4680 | + }); |
| 4681 | +} |
| 4682 | + |
| 4683 | +bool GenXLowering::lowerReduction(CallInst *CI, Intrinsic::ID IID) { |
| 4684 | + Value *Start = nullptr; |
| 4685 | + auto *Src = CI->getArgOperand(0); |
| 4686 | + |
| 4687 | + IRBuilder<> Builder(CI); |
| 4688 | + |
| 4689 | + return lowerReduction(CI, Src, Start, [&](Value *LHS, Value *RHS) { |
| 4690 | + return Builder.CreateBinaryIntrinsic(IID, LHS, RHS); |
| 4691 | + }); |
| 4692 | +} |
| 4693 | + |
4554 | 4694 | /***********************************************************************
|
4555 | 4695 | * widenByteOp : widen a vector byte operation to short if that might
|
4556 | 4696 | * improve code
|
|
0 commit comments