Revert "[AMDGPU] Add IR LiveReg type-based optimization" #97138
Conversation
Created using spr 1.3.4
@llvm/pr-subscribers-llvm-globalisel @llvm/pr-subscribers-backend-amdgpu

Author: Vitaly Buka (vitalybuka)

Changes

Part of #66838.
https://lab.llvm.org/buildbot/#/builders/52/builds/404

This reverts commit ded9564.

Patch is 279.63 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/97138.diff

11 Files Affected:
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
index 7623b73d6dd5f..69fdeaebe0a01 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
@@ -81,73 +81,6 @@ class AMDGPULateCodeGenPrepare
bool visitLoadInst(LoadInst &LI);
};
-using ValueToValueMap = DenseMap<const Value *, Value *>;
-
-class LiveRegOptimizer {
-private:
- Module *Mod = nullptr;
- const DataLayout *DL = nullptr;
- const GCNSubtarget *ST;
- /// The scalar type to convert to
- Type *ConvertToScalar;
- /// The set of visited Instructions
- SmallPtrSet<Instruction *, 4> Visited;
- /// The set of Instructions to be deleted
- SmallPtrSet<Instruction *, 4> DeadInstrs;
- /// Map of Value -> Converted Value
- ValueToValueMap ValMap;
- /// Map of containing conversions from Optimal Type -> Original Type per BB.
- DenseMap<BasicBlock *, ValueToValueMap> BBUseValMap;
-
-public:
- /// Calculate the and \p return the type to convert to given a problematic \p
- /// OriginalType. In some instances, we may widen the type (e.g. v2i8 -> i32).
- Type *calculateConvertType(Type *OriginalType);
- /// Convert the virtual register defined by \p V to the compatible vector of
- /// legal type
- Value *convertToOptType(Instruction *V, BasicBlock::iterator &InstPt);
- /// Convert the virtual register defined by \p V back to the original type \p
- /// ConvertType, stripping away the MSBs in cases where there was an imperfect
- /// fit (e.g. v2i32 -> v7i8)
- Value *convertFromOptType(Type *ConvertType, Instruction *V,
- BasicBlock::iterator &InstPt,
- BasicBlock *InsertBlock);
- /// Check for problematic PHI nodes or cross-bb values based on the value
- /// defined by \p I, and coerce to legal types if necessary. For problematic
- /// PHI node, we coerce all incoming values in a single invocation.
- bool optimizeLiveType(Instruction *I);
-
- /// Remove all instructions that have become dead (i.e. all the re-typed PHIs)
- void removeDeadInstrs();
-
- // Whether or not the type should be replaced to avoid inefficient
- // legalization code
- bool shouldReplace(Type *ITy) {
- FixedVectorType *VTy = dyn_cast<FixedVectorType>(ITy);
- if (!VTy)
- return false;
-
- auto TLI = ST->getTargetLowering();
-
- Type *EltTy = VTy->getElementType();
- // If the element size is not less than the convert to scalar size, then we
- // can't do any bit packing
- if (!EltTy->isIntegerTy() ||
- EltTy->getScalarSizeInBits() > ConvertToScalar->getScalarSizeInBits())
- return false;
-
- // Only coerce illegal types
- TargetLoweringBase::LegalizeKind LK =
- TLI->getTypeConversion(EltTy->getContext(), EVT::getEVT(EltTy, false));
- return LK.first != TargetLoweringBase::TypeLegal;
- }
-
- LiveRegOptimizer(Module *Mod, const GCNSubtarget *ST) : Mod(Mod), ST(ST) {
- DL = &Mod->getDataLayout();
- ConvertToScalar = Type::getInt32Ty(Mod->getContext());
- }
-};
-
} // end anonymous namespace
bool AMDGPULateCodeGenPrepare::doInitialization(Module &M) {
@@ -169,238 +102,14 @@ bool AMDGPULateCodeGenPrepare::runOnFunction(Function &F) {
AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
UA = &getAnalysis<UniformityInfoWrapperPass>().getUniformityInfo();
- // "Optimize" the virtual regs that cross basic block boundaries. When
- // building the SelectionDAG, vectors of illegal types that cross basic blocks
- // will be scalarized and widened, with each scalar living in its
- // own register. To work around this, this optimization converts the
- // vectors to equivalent vectors of legal type (which are converted back
- // before uses in subsequent blocks), to pack the bits into fewer physical
- // registers (used in CopyToReg/CopyFromReg pairs).
- LiveRegOptimizer LRO(Mod, &ST);
-
bool Changed = false;
-
for (auto &BB : F)
- for (Instruction &I : make_early_inc_range(BB)) {
+ for (Instruction &I : llvm::make_early_inc_range(BB))
Changed |= visit(I);
- Changed |= LRO.optimizeLiveType(&I);
- }
- LRO.removeDeadInstrs();
return Changed;
}
-Type *LiveRegOptimizer::calculateConvertType(Type *OriginalType) {
- assert(OriginalType->getScalarSizeInBits() <=
- ConvertToScalar->getScalarSizeInBits());
-
- FixedVectorType *VTy = cast<FixedVectorType>(OriginalType);
-
- TypeSize OriginalSize = DL->getTypeSizeInBits(VTy);
- TypeSize ConvertScalarSize = DL->getTypeSizeInBits(ConvertToScalar);
- unsigned ConvertEltCount =
- (OriginalSize + ConvertScalarSize - 1) / ConvertScalarSize;
-
- if (OriginalSize <= ConvertScalarSize)
- return IntegerType::get(Mod->getContext(), ConvertScalarSize);
-
- return VectorType::get(Type::getIntNTy(Mod->getContext(), ConvertScalarSize),
- ConvertEltCount, false);
-}
-
-Value *LiveRegOptimizer::convertToOptType(Instruction *V,
- BasicBlock::iterator &InsertPt) {
- FixedVectorType *VTy = cast<FixedVectorType>(V->getType());
- Type *NewTy = calculateConvertType(V->getType());
-
- TypeSize OriginalSize = DL->getTypeSizeInBits(VTy);
- TypeSize NewSize = DL->getTypeSizeInBits(NewTy);
-
- IRBuilder<> Builder(V->getParent(), InsertPt);
- // If there is a bitsize match, we can fit the old vector into a new vector of
- // desired type.
- if (OriginalSize == NewSize)
- return Builder.CreateBitCast(V, NewTy, V->getName() + ".bc");
-
- // If there is a bitsize mismatch, we must use a wider vector.
- assert(NewSize > OriginalSize);
- uint64_t ExpandedVecElementCount = NewSize / VTy->getScalarSizeInBits();
-
- SmallVector<int, 8> ShuffleMask;
- uint64_t OriginalElementCount = VTy->getElementCount().getFixedValue();
- for (unsigned I = 0; I < OriginalElementCount; I++)
- ShuffleMask.push_back(I);
-
- for (uint64_t I = OriginalElementCount; I < ExpandedVecElementCount; I++)
- ShuffleMask.push_back(OriginalElementCount);
-
- Value *ExpandedVec = Builder.CreateShuffleVector(V, ShuffleMask);
- return Builder.CreateBitCast(ExpandedVec, NewTy, V->getName() + ".bc");
-}
-
-Value *LiveRegOptimizer::convertFromOptType(Type *ConvertType, Instruction *V,
- BasicBlock::iterator &InsertPt,
- BasicBlock *InsertBB) {
- FixedVectorType *NewVTy = cast<FixedVectorType>(ConvertType);
-
- TypeSize OriginalSize = DL->getTypeSizeInBits(V->getType());
- TypeSize NewSize = DL->getTypeSizeInBits(NewVTy);
-
- IRBuilder<> Builder(InsertBB, InsertPt);
- // If there is a bitsize match, we simply convert back to the original type.
- if (OriginalSize == NewSize)
- return Builder.CreateBitCast(V, NewVTy, V->getName() + ".bc");
-
- // If there is a bitsize mismatch, then we must have used a wider value to
- // hold the bits.
- assert(OriginalSize > NewSize);
- // For wide scalars, we can just truncate the value.
- if (!V->getType()->isVectorTy()) {
- Instruction *Trunc = cast<Instruction>(
- Builder.CreateTrunc(V, IntegerType::get(Mod->getContext(), NewSize)));
- return cast<Instruction>(Builder.CreateBitCast(Trunc, NewVTy));
- }
-
- // For wider vectors, we must strip the MSBs to convert back to the original
- // type.
- VectorType *ExpandedVT = VectorType::get(
- Type::getIntNTy(Mod->getContext(), NewVTy->getScalarSizeInBits()),
- (OriginalSize / NewVTy->getScalarSizeInBits()), false);
- Instruction *Converted =
- cast<Instruction>(Builder.CreateBitCast(V, ExpandedVT));
-
- unsigned NarrowElementCount = NewVTy->getElementCount().getFixedValue();
- SmallVector<int, 8> ShuffleMask(NarrowElementCount);
- std::iota(ShuffleMask.begin(), ShuffleMask.end(), 0);
-
- return Builder.CreateShuffleVector(Converted, ShuffleMask);
-}
-
-bool LiveRegOptimizer::optimizeLiveType(Instruction *I) {
- SmallVector<Instruction *, 4> Worklist;
- SmallPtrSet<PHINode *, 4> PhiNodes;
- SmallPtrSet<Instruction *, 4> Defs;
- SmallPtrSet<Instruction *, 4> Uses;
-
- Worklist.push_back(cast<Instruction>(I));
- while (!Worklist.empty()) {
- Instruction *II = Worklist.pop_back_val();
-
- if (!Visited.insert(II).second)
- continue;
-
- if (!shouldReplace(II->getType()))
- continue;
-
- if (PHINode *Phi = dyn_cast<PHINode>(II)) {
- PhiNodes.insert(Phi);
- // Collect all the incoming values of problematic PHI nodes.
- for (Value *V : Phi->incoming_values()) {
- // Repeat the collection process for newly found PHI nodes.
- if (PHINode *OpPhi = dyn_cast<PHINode>(V)) {
- if (!PhiNodes.count(OpPhi) && !Visited.count(OpPhi))
- Worklist.push_back(OpPhi);
- continue;
- }
-
- Instruction *IncInst = dyn_cast<Instruction>(V);
- // Other incoming value types (e.g. vector literals) are unhandled
- if (!IncInst && !isa<ConstantAggregateZero>(V))
- return false;
-
- // Collect all other incoming values for coercion.
- if (IncInst)
- Defs.insert(IncInst);
- }
- }
-
- // Collect all relevant uses.
- for (User *V : II->users()) {
- // Repeat the collection process for problematic PHI nodes.
- if (PHINode *OpPhi = dyn_cast<PHINode>(V)) {
- if (!PhiNodes.count(OpPhi) && !Visited.count(OpPhi))
- Worklist.push_back(OpPhi);
- continue;
- }
-
- Instruction *UseInst = cast<Instruction>(V);
- // Collect all uses of PHINodes and any use the crosses BB boundaries.
- if (UseInst->getParent() != II->getParent() || isa<PHINode>(II)) {
- Uses.insert(UseInst);
- if (!Defs.count(II) && !isa<PHINode>(II)) {
- Defs.insert(II);
- }
- }
- }
- }
-
- // Coerce and track the defs.
- for (Instruction *D : Defs) {
- if (!ValMap.contains(D)) {
- BasicBlock::iterator InsertPt = std::next(D->getIterator());
- Value *ConvertVal = convertToOptType(D, InsertPt);
- assert(ConvertVal);
- ValMap[D] = ConvertVal;
- }
- }
-
- // Construct new-typed PHI nodes.
- for (PHINode *Phi : PhiNodes) {
- ValMap[Phi] = PHINode::Create(calculateConvertType(Phi->getType()),
- Phi->getNumIncomingValues(),
- Phi->getName() + ".tc", Phi->getIterator());
- }
-
- // Connect all the PHI nodes with their new incoming values.
- for (PHINode *Phi : PhiNodes) {
- PHINode *NewPhi = cast<PHINode>(ValMap[Phi]);
- bool MissingIncVal = false;
- for (int I = 0, E = Phi->getNumIncomingValues(); I < E; I++) {
- Value *IncVal = Phi->getIncomingValue(I);
- if (isa<ConstantAggregateZero>(IncVal)) {
- Type *NewType = calculateConvertType(Phi->getType());
- NewPhi->addIncoming(ConstantInt::get(NewType, 0, false),
- Phi->getIncomingBlock(I));
- } else if (ValMap.contains(IncVal))
- NewPhi->addIncoming(ValMap[IncVal], Phi->getIncomingBlock(I));
- else
- MissingIncVal = true;
- }
- DeadInstrs.insert(MissingIncVal ? cast<Instruction>(ValMap[Phi]) : Phi);
- }
- // Coerce back to the original type and replace the uses.
- for (Instruction *U : Uses) {
- // Replace all converted operands for a use.
- for (auto [OpIdx, Op] : enumerate(U->operands())) {
- if (ValMap.contains(Op)) {
- Value *NewVal = nullptr;
- if (BBUseValMap.contains(U->getParent()) &&
- BBUseValMap[U->getParent()].contains(ValMap[Op]))
- NewVal = BBUseValMap[U->getParent()][ValMap[Op]];
- else {
- BasicBlock::iterator InsertPt = U->getParent()->getFirstNonPHIIt();
- NewVal =
- convertFromOptType(Op->getType(), cast<Instruction>(ValMap[Op]),
- InsertPt, U->getParent());
- BBUseValMap[U->getParent()][ValMap[Op]] = NewVal;
- }
- assert(NewVal);
- U->setOperand(OpIdx, NewVal);
- }
- }
- }
-
- return true;
-}
-
-void LiveRegOptimizer::removeDeadInstrs() {
- // Remove instrs that have been marked dead after type-coercion.
- for (auto *I : DeadInstrs) {
- I->replaceAllUsesWith(PoisonValue::get(I->getType()));
- I->eraseFromParent();
- }
-}
-
bool AMDGPULateCodeGenPrepare::canWidenScalarExtLoad(LoadInst &LI) const {
unsigned AS = LI.getPointerAddressSpace();
// Skip non-constant address space.
@@ -410,7 +119,7 @@ bool AMDGPULateCodeGenPrepare::canWidenScalarExtLoad(LoadInst &LI) const {
// Skip non-simple loads.
if (!LI.isSimple())
return false;
- Type *Ty = LI.getType();
+ auto *Ty = LI.getType();
// Skip aggregate types.
if (Ty->isAggregateType())
return false;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index f50a18ccc2188..9162e110aa10b 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -1197,10 +1197,10 @@ bool GCNPassConfig::addPreISel() {
AMDGPUPassConfig::addPreISel();
if (TM->getOptLevel() > CodeGenOptLevel::None)
- addPass(createSinkingPass());
+ addPass(createAMDGPULateCodeGenPreparePass());
if (TM->getOptLevel() > CodeGenOptLevel::None)
- addPass(createAMDGPULateCodeGenPreparePass());
+ addPass(createSinkingPass());
// Merge divergent exit nodes. StructurizeCFG won't recognize the multi-exit
// regions formed by them.
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/vni8-across-blocks.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/vni8-across-blocks.ll
deleted file mode 100644
index 83cb92210ec84..0000000000000
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/vni8-across-blocks.ll
+++ /dev/null
@@ -1,636 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx906 < %s | FileCheck --check-prefix=GFX906 %s
-
-define amdgpu_kernel void @v3i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) {
-; GFX906-LABEL: v3i8_liveout:
-; GFX906: ; %bb.0: ; %entry
-; GFX906-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX906-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
-; GFX906-NEXT: v_lshlrev_b32_e32 v2, 2, v0
-; GFX906-NEXT: v_mov_b32_e32 v3, 8
-; GFX906-NEXT: v_mov_b32_e32 v5, 16
-; GFX906-NEXT: s_waitcnt lgkmcnt(0)
-; GFX906-NEXT: global_load_dword v4, v2, s[4:5]
-; GFX906-NEXT: v_mov_b32_e32 v1, 0xff
-; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0
-; GFX906-NEXT: s_waitcnt vmcnt(0)
-; GFX906-NEXT: v_and_b32_e32 v6, 0xff, v4
-; GFX906-NEXT: v_lshlrev_b32_sdwa v7, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX906-NEXT: v_lshlrev_b32_sdwa v4, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX906-NEXT: v_or3_b32 v4, v6, v7, v4
-; GFX906-NEXT: s_and_saveexec_b64 s[0:1], vcc
-; GFX906-NEXT: s_cbranch_execz .LBB0_2
-; GFX906-NEXT: ; %bb.1: ; %bb.1
-; GFX906-NEXT: global_load_dword v0, v2, s[6:7]
-; GFX906-NEXT: s_waitcnt vmcnt(0)
-; GFX906-NEXT: v_and_b32_e32 v2, 0xff, v0
-; GFX906-NEXT: v_lshlrev_b32_sdwa v3, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX906-NEXT: v_lshlrev_b32_sdwa v0, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX906-NEXT: v_or3_b32 v4, v2, v3, v0
-; GFX906-NEXT: .LBB0_2: ; %bb.2
-; GFX906-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v4
-; GFX906-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v0
-; GFX906-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT: v_and_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX906-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX906-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX906-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; GFX906-NEXT: v_mov_b32_e32 v1, 0
-; GFX906-NEXT: global_store_short v1, v0, s[2:3]
-; GFX906-NEXT: global_store_byte_d16_hi v1, v0, s[2:3] offset:2
-; GFX906-NEXT: s_endpgm
-entry:
- %idx = call i32 @llvm.amdgcn.workitem.id.x()
- %gep1 = getelementptr <3 x i8>, ptr addrspace(1) %src1, i32 %idx
- %vec1 = load <3 x i8>, ptr addrspace(1) %gep1
- %gep2 = getelementptr <3 x i8>, ptr addrspace(1) %src2, i32 %idx
- %vec2 = load <3 x i8>, ptr addrspace(1) %gep2
- %cmp = icmp ult i32 %idx, 15
- br i1 %cmp, label %bb.1, label %bb.2
-bb.1:
- br label %bb.2
-
-bb.2:
- %tmp5 = phi <3 x i8> [ %vec1, %entry ], [ %vec2, %bb.1 ]
- store <3 x i8> %tmp5, ptr addrspace(1) %dst, align 4
- ret void
-}
-
-define amdgpu_kernel void @v4i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) {
-; GFX906-LABEL: v4i8_liveout:
-; GFX906: ; %bb.0: ; %entry
-; GFX906-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX906-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
-; GFX906-NEXT: v_lshlrev_b32_e32 v2, 2, v0
-; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0
-; GFX906-NEXT: s_waitcnt lgkmcnt(0)
-; GFX906-NEXT: global_load_dword v1, v2, s[4:5]
-; GFX906-NEXT: s_and_saveexec_b64 s[0:1], vcc
-; GFX906-NEXT: s_cbranch_execz .LBB1_2
-; GFX906-NEXT: ; %bb.1: ; %bb.1
-; GFX906-NEXT: global_load_dword v1, v2, s[6:7]
-; GFX906-NEXT: .LBB1_2: ; %bb.2
-; GFX906-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX906-NEXT: v_mov_b32_e32 v0, 0
-; GFX906-NEXT: s_waitcnt vmcnt(0)
-; GFX906-NEXT: global_store_dword v0, v1, s[2:3]
-; GFX906-NEXT: s_endpgm
-entry:
- %idx = call i32 @llvm.amdgcn.workitem.id.x()
- %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %src1, i32 %idx
- %vec1 = load <4 x i8>, ptr addrspace(1) %gep1
- %gep2 = getelementptr <4 x i8>, ptr addrspace(1) %src2, i32 %idx
- %vec2 = load <4 x i8>, ptr addrspace(1) %gep2
- %cmp = icmp ult i32 %idx, 15
- br i1 %cmp, label %bb.1, label %bb.2
-bb.1:
- br label %bb.2
-
-bb.2:
- %tmp5 = phi <4 x i8> [ %vec1, %entry ], [ %vec2, %bb.1 ]
- store <4 x i8> %tmp5, ptr addrspace(1) %dst, align 4
- ret void
-}
-
-define amdgpu_kernel void @v5i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) {
-; GFX906-LABEL: v5i8_liveout:
-; GFX906: ; %bb.0: ; %entry
-; GFX906-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX906-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
-; GFX906-NEXT: v_lshlrev_b32_e32 v3, 3, v0
-; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0
-; GFX906-NEXT: s_waitcnt lgkmcnt(0)
-; GFX906-NEXT: global_load_dwordx2 v[1:2], v3, s[4:5]
-; GFX906-NEXT: s_waitcnt vmcnt(0)
-; GFX906-NEXT: v_and_b32_e32 v2, 0xff, v2
-; GFX906-NEXT: s_and_saveexec_b64 s[0:1], vcc
-; GFX906-NEXT: s_cbranch_execz .LBB2_2
-; GFX906-NEXT: ; %bb.1: ; %bb.1
-; GFX906-NEXT: global_load_dwordx2 v[1:2], v3, s[6:7]
-; GFX906-NEXT: s_waitcnt vmcnt(0)
-; GFX906-NEXT: v_and_b32_e32 v2, 0xff, v2
-; GFX906-NEXT: .LBB2_2: ; %bb.2
-; GFX906-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX906-NEXT: v_mov_b32_e32 v4, 0
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v1
-; GFX906-NEXT: v_lshrrev_b32_e32 v3, 24, v1
-; GFX906-NEXT: global_store_byte v4, v1, s[2:3]
-; GFX906-NEXT: global_store_byte v4, v0, s[2:3] offset:1
-; GFX906-NEXT: global_store_byte_d16_hi v4, v1, s[2:3] offset:2
-; GFX906-NEXT: global_store_byte v4, v3, s[2:3] offset:3
-; GFX906-NEXT: global_store_byte v4, v2, s[2:3] offset:4
-; GFX906-NEXT: s_endpgm
-entry:
- %idx = call i32 @llvm.amdgcn.workitem.id.x()
- %gep1 = getelementptr <5 x i8>, ptr addrspace(1) %src1, i32 %idx
- %vec1 = load <5 x i8>, ptr addrspace(1) %gep1
- %gep2 = getelementptr <5 x i8>, ptr addrspace(1) %src2, i32 %idx
- %vec2 = load <5 x i8>, ptr addrspace(1) %gep2
- %cmp = icmp ult i32 %idx, 15
- br i1 %cmp, label %bb.1, label %bb.2
-bb.1:
- br label %bb.2
-
-bb.2:
- %tmp5 = phi <5 x i8> [ %vec1, %entry ], [ %vec2, %bb.1 ]
- store <5 x i8> %tmp5, ptr addrspace(1) %dst, align 4
- ret void
-}
-
-define amdgpu_kernel void @v8i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) {
-; GFX906-LABEL: v8i8_liveout:
-; GFX906: ; %bb.0: ; %entry
-; GFX906-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
...
[truncated]
Part of #66838.
https://lab.llvm.org/buildbot/#/builders/52/builds/404
https://lab.llvm.org/buildbot/#/builders/55/builds/358
https://lab.llvm.org/buildbot/#/builders/164/builds/518
This reverts commit ded9564.
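For readers skimming the revert, here is a minimal LLVM IR sketch of the pattern the reverted LiveRegOptimizer handled. It is not taken from the patch; the function name and values are invented, modeled on the deleted vni8-across-blocks.ll tests. An illegal <4 x i8> value is live across a basic-block boundary; per the removed comments, the pass packed it into an i32 right after the def and unpacked it at the first non-PHI point of the using block, so SelectionDAG would not scalarize and widen each byte into its own register.

; Sketch only: the function name and %v.bc below are hypothetical.
define i32 @v4i8_across_blocks(ptr addrspace(1) %p, i1 %cond) {
entry:
  %v = load <4 x i8>, ptr addrspace(1) %p, align 4
  br i1 %cond, label %use, label %exit

use:                                       ; %v is live into this block
  ; The reverted pass would have inserted, right after the def in %entry,
  ;   %v.bc = bitcast <4 x i8> %v to i32
  ; carried %v.bc across the edge, and bitcast it back to <4 x i8> here
  ; before the first use.
  %e = extractelement <4 x i8> %v, i32 0
  %z = zext i8 %e to i32
  br label %exit

exit:
  %r = phi i32 [ %z, %use ], [ 0, %entry ]
  ret i32 %r
}

The deleted vni8-across-blocks.ll tests in the diff above exercised exactly this kind of cross-block live-out, for widths from <3 x i8> upward.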