Revert "[AMDGPU] Add IR LiveReg type-based optimization" #97138

Merged
295 changes: 2 additions & 293 deletions llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
@@ -81,73 +81,6 @@ class AMDGPULateCodeGenPrepare
bool visitLoadInst(LoadInst &LI);
};

using ValueToValueMap = DenseMap<const Value *, Value *>;

class LiveRegOptimizer {
private:
Module *Mod = nullptr;
const DataLayout *DL = nullptr;
const GCNSubtarget *ST;
/// The scalar type to convert to
Type *ConvertToScalar;
/// The set of visited Instructions
SmallPtrSet<Instruction *, 4> Visited;
/// The set of Instructions to be deleted
SmallPtrSet<Instruction *, 4> DeadInstrs;
/// Map of Value -> Converted Value
ValueToValueMap ValMap;
/// Map containing conversions from Optimal Type -> Original Type per BB.
DenseMap<BasicBlock *, ValueToValueMap> BBUseValMap;

public:
/// Calculate and return the type to convert to given a problematic \p
/// OriginalType. In some instances, we may widen the type (e.g. v2i8 -> i32).
Type *calculateConvertType(Type *OriginalType);
/// Convert the virtual register defined by \p V to the compatible vector of
/// legal type
Value *convertToOptType(Instruction *V, BasicBlock::iterator &InstPt);
/// Convert the virtual register defined by \p V back to the original type \p
/// ConvertType, stripping away the MSBs in cases where there was an imperfect
/// fit (e.g. v2i32 -> v7i8)
Value *convertFromOptType(Type *ConvertType, Instruction *V,
BasicBlock::iterator &InstPt,
BasicBlock *InsertBlock);
/// Check for problematic PHI nodes or cross-bb values based on the value
/// defined by \p I, and coerce to legal types if necessary. For a problematic
/// PHI node, we coerce all incoming values in a single invocation.
bool optimizeLiveType(Instruction *I);

/// Remove all instructions that have become dead (i.e. all the re-typed PHIs)
void removeDeadInstrs();

// Whether or not the type should be replaced to avoid inefficient
// legalization code
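// For example (illustrative, not from this patch): a v4i8 value is a
// candidate since i8 is not a legal element type here, whereas a v2i32
// value is already legal and is left untouched.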
bool shouldReplace(Type *ITy) {
FixedVectorType *VTy = dyn_cast<FixedVectorType>(ITy);
if (!VTy)
return false;

auto TLI = ST->getTargetLowering();

Type *EltTy = VTy->getElementType();
// If the element size is larger than the convert-to scalar size, then we
// can't do any bit packing.
if (!EltTy->isIntegerTy() ||
EltTy->getScalarSizeInBits() > ConvertToScalar->getScalarSizeInBits())
return false;

// Only coerce illegal types
TargetLoweringBase::LegalizeKind LK =
TLI->getTypeConversion(EltTy->getContext(), EVT::getEVT(EltTy, false));
return LK.first != TargetLoweringBase::TypeLegal;
}

LiveRegOptimizer(Module *Mod, const GCNSubtarget *ST) : Mod(Mod), ST(ST) {
DL = &Mod->getDataLayout();
ConvertToScalar = Type::getInt32Ty(Mod->getContext());
}
};

} // end anonymous namespace

bool AMDGPULateCodeGenPrepare::doInitialization(Module &M) {
@@ -169,238 +102,14 @@ bool AMDGPULateCodeGenPrepare::runOnFunction(Function &F) {
AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
UA = &getAnalysis<UniformityInfoWrapperPass>().getUniformityInfo();

// "Optimize" the virtual regs that cross basic block boundaries. When
// building the SelectionDAG, vectors of illegal types that cross basic blocks
// will be scalarized and widened, with each scalar living in its
// own register. To work around this, this optimization converts the
// vectors to equivalent vectors of legal type (which are converted back
// before uses in subsequent blocks), to pack the bits into fewer physical
// registers (used in CopyToReg/CopyFromReg pairs).
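// For example (illustrative): a v4i8 value defined in one block and used in
// another would otherwise be scalarized into four 32-bit registers; packing
// it into a single i32 lets it cross the block boundary in one register,
// with a conversion back to v4i8 inserted before its uses.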
LiveRegOptimizer LRO(Mod, &ST);

bool Changed = false;

for (auto &BB : F)
for (Instruction &I : make_early_inc_range(BB)) {
for (Instruction &I : llvm::make_early_inc_range(BB))
Changed |= visit(I);
Changed |= LRO.optimizeLiveType(&I);
}

LRO.removeDeadInstrs();
return Changed;
}

Type *LiveRegOptimizer::calculateConvertType(Type *OriginalType) {
assert(OriginalType->getScalarSizeInBits() <=
ConvertToScalar->getScalarSizeInBits());

FixedVectorType *VTy = cast<FixedVectorType>(OriginalType);

TypeSize OriginalSize = DL->getTypeSizeInBits(VTy);
TypeSize ConvertScalarSize = DL->getTypeSizeInBits(ConvertToScalar);
unsigned ConvertEltCount =
(OriginalSize + ConvertScalarSize - 1) / ConvertScalarSize;
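// For example (illustrative): v3i8 is 24 bits and fits in a single i32,
// while v7i8 is 56 bits and needs ceil(56/32) = 2 lanes, i.e. v2i32.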

if (OriginalSize <= ConvertScalarSize)
return IntegerType::get(Mod->getContext(), ConvertScalarSize);

return VectorType::get(Type::getIntNTy(Mod->getContext(), ConvertScalarSize),
ConvertEltCount, false);
}

Value *LiveRegOptimizer::convertToOptType(Instruction *V,
BasicBlock::iterator &InsertPt) {
FixedVectorType *VTy = cast<FixedVectorType>(V->getType());
Type *NewTy = calculateConvertType(V->getType());

TypeSize OriginalSize = DL->getTypeSizeInBits(VTy);
TypeSize NewSize = DL->getTypeSizeInBits(NewTy);

IRBuilder<> Builder(V->getParent(), InsertPt);
// If there is a bitsize match, we can fit the old vector into a new vector of
// the desired type.
if (OriginalSize == NewSize)
return Builder.CreateBitCast(V, NewTy, V->getName() + ".bc");

// If there is a bitsize mismatch, we must use a wider vector.
assert(NewSize > OriginalSize);
uint64_t ExpandedVecElementCount = NewSize / VTy->getScalarSizeInBits();

SmallVector<int, 8> ShuffleMask;
uint64_t OriginalElementCount = VTy->getElementCount().getFixedValue();
for (unsigned I = 0; I < OriginalElementCount; I++)
ShuffleMask.push_back(I);

for (uint64_t I = OriginalElementCount; I < ExpandedVecElementCount; I++)
ShuffleMask.push_back(OriginalElementCount);
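// For example (illustrative): widening v7i8 to a 64-bit carrier uses the
// mask {0,1,2,3,4,5,6,7}; the trailing index reads from the implicit poison
// second operand, so the pad lane holds a don't-care value.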

Value *ExpandedVec = Builder.CreateShuffleVector(V, ShuffleMask);
return Builder.CreateBitCast(ExpandedVec, NewTy, V->getName() + ".bc");
}

Value *LiveRegOptimizer::convertFromOptType(Type *ConvertType, Instruction *V,
BasicBlock::iterator &InsertPt,
BasicBlock *InsertBB) {
FixedVectorType *NewVTy = cast<FixedVectorType>(ConvertType);

TypeSize OriginalSize = DL->getTypeSizeInBits(V->getType());
TypeSize NewSize = DL->getTypeSizeInBits(NewVTy);

IRBuilder<> Builder(InsertBB, InsertPt);
// If there is a bitsize match, we simply convert back to the original type.
if (OriginalSize == NewSize)
return Builder.CreateBitCast(V, NewVTy, V->getName() + ".bc");

// If there is a bitsize mismatch, then we must have used a wider value to
// hold the bits.
assert(OriginalSize > NewSize);
// For wide scalars, we can just truncate the value.
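// For example (illustrative): a v3i8 value carried in an i32 is truncated
// to i24 and then bitcast back to v3i8.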
if (!V->getType()->isVectorTy()) {
Instruction *Trunc = cast<Instruction>(
Builder.CreateTrunc(V, IntegerType::get(Mod->getContext(), NewSize)));
return cast<Instruction>(Builder.CreateBitCast(Trunc, NewVTy));
}

// For wider vectors, we must strip the MSBs to convert back to the original
// type.
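// For example (illustrative): a v7i8 value carried as v2i32 is bitcast to
// v8i8 and then shuffled with mask {0,...,6} to drop the pad lane.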
VectorType *ExpandedVT = VectorType::get(
Type::getIntNTy(Mod->getContext(), NewVTy->getScalarSizeInBits()),
(OriginalSize / NewVTy->getScalarSizeInBits()), false);
Instruction *Converted =
cast<Instruction>(Builder.CreateBitCast(V, ExpandedVT));

unsigned NarrowElementCount = NewVTy->getElementCount().getFixedValue();
SmallVector<int, 8> ShuffleMask(NarrowElementCount);
std::iota(ShuffleMask.begin(), ShuffleMask.end(), 0);

return Builder.CreateShuffleVector(Converted, ShuffleMask);
}

bool LiveRegOptimizer::optimizeLiveType(Instruction *I) {
SmallVector<Instruction *, 4> Worklist;
SmallPtrSet<PHINode *, 4> PhiNodes;
SmallPtrSet<Instruction *, 4> Defs;
SmallPtrSet<Instruction *, 4> Uses;

Worklist.push_back(cast<Instruction>(I));
while (!Worklist.empty()) {
Instruction *II = Worklist.pop_back_val();

if (!Visited.insert(II).second)
continue;

if (!shouldReplace(II->getType()))
continue;

if (PHINode *Phi = dyn_cast<PHINode>(II)) {
PhiNodes.insert(Phi);
// Collect all the incoming values of problematic PHI nodes.
for (Value *V : Phi->incoming_values()) {
// Repeat the collection process for newly found PHI nodes.
if (PHINode *OpPhi = dyn_cast<PHINode>(V)) {
if (!PhiNodes.count(OpPhi) && !Visited.count(OpPhi))
Worklist.push_back(OpPhi);
continue;
}

Instruction *IncInst = dyn_cast<Instruction>(V);
// Other incoming value types (e.g. vector literals) are unhandled
if (!IncInst && !isa<ConstantAggregateZero>(V))
return false;

// Collect all other incoming values for coercion.
if (IncInst)
Defs.insert(IncInst);
}
}

// Collect all relevant uses.
for (User *V : II->users()) {
// Repeat the collection process for problematic PHI nodes.
if (PHINode *OpPhi = dyn_cast<PHINode>(V)) {
if (!PhiNodes.count(OpPhi) && !Visited.count(OpPhi))
Worklist.push_back(OpPhi);
continue;
}

Instruction *UseInst = cast<Instruction>(V);
// Collect all uses of PHINodes and any use that crosses BB boundaries.
if (UseInst->getParent() != II->getParent() || isa<PHINode>(II)) {
Uses.insert(UseInst);
if (!Defs.count(II) && !isa<PHINode>(II)) {
Defs.insert(II);
}
}
}
}

// Coerce and track the defs.
for (Instruction *D : Defs) {
if (!ValMap.contains(D)) {
BasicBlock::iterator InsertPt = std::next(D->getIterator());
Value *ConvertVal = convertToOptType(D, InsertPt);
assert(ConvertVal);
ValMap[D] = ConvertVal;
}
}

// Construct new-typed PHI nodes.
for (PHINode *Phi : PhiNodes) {
ValMap[Phi] = PHINode::Create(calculateConvertType(Phi->getType()),
Phi->getNumIncomingValues(),
Phi->getName() + ".tc", Phi->getIterator());
}

// Connect all the PHI nodes with their new incoming values.
for (PHINode *Phi : PhiNodes) {
PHINode *NewPhi = cast<PHINode>(ValMap[Phi]);
bool MissingIncVal = false;
for (int I = 0, E = Phi->getNumIncomingValues(); I < E; I++) {
Value *IncVal = Phi->getIncomingValue(I);
if (isa<ConstantAggregateZero>(IncVal)) {
Type *NewType = calculateConvertType(Phi->getType());
NewPhi->addIncoming(ConstantInt::get(NewType, 0, false),
Phi->getIncomingBlock(I));
} else if (ValMap.contains(IncVal))
NewPhi->addIncoming(ValMap[IncVal], Phi->getIncomingBlock(I));
else
MissingIncVal = true;
}
DeadInstrs.insert(MissingIncVal ? cast<Instruction>(ValMap[Phi]) : Phi);
}
// Coerce back to the original type and replace the uses.
for (Instruction *U : Uses) {
// Replace all converted operands for a use.
for (auto [OpIdx, Op] : enumerate(U->operands())) {
if (ValMap.contains(Op)) {
Value *NewVal = nullptr;
if (BBUseValMap.contains(U->getParent()) &&
BBUseValMap[U->getParent()].contains(ValMap[Op]))
NewVal = BBUseValMap[U->getParent()][ValMap[Op]];
else {
BasicBlock::iterator InsertPt = U->getParent()->getFirstNonPHIIt();
NewVal =
convertFromOptType(Op->getType(), cast<Instruction>(ValMap[Op]),
InsertPt, U->getParent());
BBUseValMap[U->getParent()][ValMap[Op]] = NewVal;
}
assert(NewVal);
U->setOperand(OpIdx, NewVal);
}
}
}

return true;
}

void LiveRegOptimizer::removeDeadInstrs() {
// Remove instrs that have been marked dead after type-coercion.
for (auto *I : DeadInstrs) {
I->replaceAllUsesWith(PoisonValue::get(I->getType()));
I->eraseFromParent();
}
}

bool AMDGPULateCodeGenPrepare::canWidenScalarExtLoad(LoadInst &LI) const {
unsigned AS = LI.getPointerAddressSpace();
// Skip non-constant address space.
@@ -410,7 +119,7 @@ bool AMDGPULateCodeGenPrepare::canWidenScalarExtLoad(LoadInst &LI) const {
// Skip non-simple loads.
if (!LI.isSimple())
return false;
Type *Ty = LI.getType();
auto *Ty = LI.getType();
// Skip aggregate types.
if (Ty->isAggregateType())
return false;
4 changes: 2 additions & 2 deletions llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -1197,10 +1197,10 @@ bool GCNPassConfig::addPreISel() {
AMDGPUPassConfig::addPreISel();

if (TM->getOptLevel() > CodeGenOptLevel::None)
addPass(createSinkingPass());
addPass(createAMDGPULateCodeGenPreparePass());

if (TM->getOptLevel() > CodeGenOptLevel::None)
addPass(createAMDGPULateCodeGenPreparePass());
addPass(createSinkingPass());

// Merge divergent exit nodes. StructurizeCFG won't recognize the multi-exit
// regions formed by them.