Skip to content

[PowerPC][AIX] Refactor existing logic to handle non-zero offsets for aix-small-local-dynamic-tls #89182

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 3 commits into from
May 8, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
45 changes: 23 additions & 22 deletions llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -205,8 +205,8 @@ class PPCAsmPrinter : public AsmPrinter {
void LowerPATCHPOINT(StackMaps &SM, const MachineInstr &MI);
void EmitTlsCall(const MachineInstr *MI, MCSymbolRefExpr::VariantKind VK);
void EmitAIXTlsCallHelper(const MachineInstr *MI);
const MCExpr *getAdjustedLocalExecExpr(const MachineOperand &MO,
int64_t Offset);
const MCExpr *getAdjustedFasterLocalExpr(const MachineOperand &MO,
int64_t Offset);
bool runOnMachineFunction(MachineFunction &MF) override {
Subtarget = &MF.getSubtarget<PPCSubtarget>();
bool Changed = AsmPrinter::runOnMachineFunction(MF);
Expand Down Expand Up @@ -1598,7 +1598,8 @@ void PPCAsmPrinter::emitInstruction(const MachineInstr *MI) {
// machine operand (which is a TargetGlobalTLSAddress) is expected to be
// the same operand for both loads and stores.
for (const MachineOperand &TempMO : MI->operands()) {
if (((TempMO.getTargetFlags() == PPCII::MO_TPREL_FLAG)) &&
if (((TempMO.getTargetFlags() == PPCII::MO_TPREL_FLAG ||
TempMO.getTargetFlags() == PPCII::MO_TLSLD_FLAG)) &&
TempMO.getOperandNo() == 1)
OpNum = 1;
}
Expand Down Expand Up @@ -1634,8 +1635,8 @@ void PPCAsmPrinter::emitInstruction(const MachineInstr *MI) {
case PPC::ADDI8: {
// A faster non-TOC-based local-[exec|dynamic] sequence is represented by
// `addi` or a load/store instruction (that directly loads or stores off of
// the thread pointer) with an immediate operand having the MO_TPREL_FLAG.
// Such instructions do not otherwise arise.
// the thread pointer) with an immediate operand having the
// [MO_TPREL_FLAG|MO_TLSLD_FLAG]. Such instructions do not otherwise arise.
if (!HasAIXSmallLocalTLS)
break;
bool IsMIADDI8 = MI->getOpcode() == PPC::ADDI8;
Expand All @@ -1647,7 +1648,7 @@ void PPCAsmPrinter::emitInstruction(const MachineInstr *MI) {
Flag == PPCII::MO_TPREL_PCREL_FLAG || Flag == PPCII::MO_TLSLD_FLAG) {
LowerPPCMachineInstrToMCInst(MI, TmpInst, *this);

const MCExpr *Expr = getAdjustedLocalExecExpr(MO, MO.getOffset());
const MCExpr *Expr = getAdjustedFasterLocalExpr(MO, MO.getOffset());
if (Expr)
TmpInst.getOperand(OpNum) = MCOperand::createExpr(Expr);

Expand Down Expand Up @@ -1677,28 +1678,25 @@ void PPCAsmPrinter::emitInstruction(const MachineInstr *MI) {
EmitToStreamer(*OutStreamer, TmpInst);
}

// For non-TOC-based local-exec variables that have a non-zero offset,
// For non-TOC-based local-[exec|dynamic] variables that have a non-zero offset,
// we need to create a new MCExpr that adds the non-zero offset to the address
// of the local-exec variable that will be used in either an addi, load or
// store. However, the final displacement for these instructions must be
// of the local-[exec|dynamic] variable that will be used in either an addi,
// load or store. However, the final displacement for these instructions must be
// between [-32768, 32768), so if the TLS address + its non-zero offset is
// at least 32KB, a new MCExpr is produced to accommodate this situation.
const MCExpr *PPCAsmPrinter::getAdjustedLocalExecExpr(const MachineOperand &MO,
int64_t Offset) {
const MCExpr *
PPCAsmPrinter::getAdjustedFasterLocalExpr(const MachineOperand &MO,
int64_t Offset) {
// Non-zero offsets (for loads, stores or `addi`) require additional handling.
// When the offset is zero, there is no need to create an adjusted MCExpr.
if (!Offset)
return nullptr;

assert(MO.isGlobal() && "Only expecting a global MachineOperand here!");
const GlobalValue *GValue = MO.getGlobal();
// TODO: Handle the aix-small-local-dynamic-tls non-zero offset case.
TLSModel::Model Model = TM.getTLSModel(GValue);
if (Model == TLSModel::LocalDynamic) {
return nullptr;
}
assert(Model == TLSModel::LocalExec &&
"Only local-exec accesses are handled!");
assert((Model == TLSModel::LocalExec || Model == TLSModel::LocalDynamic) &&
"Only local-[exec|dynamic] accesses are handled!");

bool IsGlobalADeclaration = GValue->isDeclarationForLinker();
// Find the GlobalVariable that corresponds to the particular TLS variable
Expand All @@ -1719,7 +1717,10 @@ const MCExpr *PPCAsmPrinter::getAdjustedLocalExecExpr(const MachineOperand &MO,
// When TLS variables are extern, this is safe to do because we can
// assume that the addresses of extern TLS variables are zero.
const MCExpr *Expr = MCSymbolRefExpr::create(
getSymbol(GValue), MCSymbolRefExpr::VK_PPC_AIX_TLSLE, OutContext);
getSymbol(GValue),
Model == TLSModel::LocalExec ? MCSymbolRefExpr::VK_PPC_AIX_TLSLE
: MCSymbolRefExpr::VK_PPC_AIX_TLSLD,
OutContext);
Expr = MCBinaryExpr::createAdd(
Expr, MCConstantExpr::create(Offset, OutContext), OutContext);
if (FinalAddress >= 32768) {
Expand All @@ -1732,10 +1733,10 @@ const MCExpr *PPCAsmPrinter::getAdjustedLocalExecExpr(const MachineOperand &MO,
ptrdiff_t Delta = ((FinalAddress + 32768) & ~0xFFFF);
// Check that the total instruction displacement fits within [-32768,32768).
[[maybe_unused]] ptrdiff_t InstDisp = TLSVarAddress + Offset - Delta;
assert(((InstDisp < 32768) &&
(InstDisp >= -32768)) &&
"Expecting the instruction displacement for local-exec TLS "
"variables to be between [-32768, 32768)!");
assert(
((InstDisp < 32768) && (InstDisp >= -32768)) &&
"Expecting the instruction displacement for local-[exec|dynamic] TLS "
"variables to be between [-32768, 32768)!");
Expr = MCBinaryExpr::createAdd(
Expr, MCConstantExpr::create(-Delta, OutContext), OutContext);
}
Expand Down
89 changes: 44 additions & 45 deletions llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7587,29 +7587,23 @@ static bool hasAIXSmallTLSAttr(SDValue Val) {
return false;
}

// Is an ADDI eligible for folding for non-TOC-based local-exec accesses?
static bool isEligibleToFoldADDIForLocalExecAccesses(SelectionDAG *DAG,
SDValue ADDIToFold) {
// Is an ADDI eligible for folding for non-TOC-based local-[exec|dynamic]
// accesses?
static bool isEligibleToFoldADDIForFasterLocalAccesses(SelectionDAG *DAG,
SDValue ADDIToFold) {
// Check if ADDIToFold (the ADDI that we want to fold into
// local-[exec|dynamic] accesses) is truly an ADDI.
if (!ADDIToFold.isMachineOpcode() ||
(ADDIToFold.getMachineOpcode() != PPC::ADDI8))
return false;

// Folding is only allowed for the AIX small-local-exec TLS target attribute
// or when the 'aix-small-tls' global variable attribute is present.
// Folding is only allowed for the AIX small-local-[exec|dynamic] TLS target
// attribute or when the 'aix-small-tls' global variable attribute is present.
const PPCSubtarget &Subtarget =
DAG->getMachineFunction().getSubtarget<PPCSubtarget>();
SDValue TLSVarNode = ADDIToFold.getOperand(1);
if (!(Subtarget.hasAIXSmallLocalExecTLS() || hasAIXSmallTLSAttr(TLSVarNode)))
return false;

// The first operand of the ADDIToFold should be the thread pointer.
// This transformation is only performed if the first operand of the
// addi is the thread pointer.
SDValue TPRegNode = ADDIToFold.getOperand(0);
RegisterSDNode *TPReg = dyn_cast<RegisterSDNode>(TPRegNode.getNode());
if (!TPReg || (TPReg->getReg() != Subtarget.getThreadPointerRegister()))
if (!(Subtarget.hasAIXSmallLocalDynamicTLS() ||
Subtarget.hasAIXSmallLocalExecTLS() || hasAIXSmallTLSAttr(TLSVarNode)))
return false;

// The second operand of the ADDIToFold should be the global TLS address
Expand All @@ -7619,52 +7613,54 @@ static bool isEligibleToFoldADDIForLocalExecAccesses(SelectionDAG *DAG,
if (!GA)
return false;

// The local-exec TLS variable should only have the MO_TPREL_FLAG target flag,
// so this optimization is not performed otherwise if the flag is not set.
if (DAG->getTarget().getTLSModel(GA->getGlobal()) == TLSModel::LocalExec) {
// The first operand of the ADDIToFold should be the thread pointer.
// This transformation is only performed if the first operand of the
// addi is the thread pointer.
SDValue TPRegNode = ADDIToFold.getOperand(0);
RegisterSDNode *TPReg = dyn_cast<RegisterSDNode>(TPRegNode.getNode());
if (!TPReg || (TPReg->getReg() != Subtarget.getThreadPointerRegister()))
return false;
}

// The local-[exec|dynamic] TLS variable should only have the
// [MO_TPREL_FLAG|MO_TLSLD_FLAG] target flags, so this optimization is not
// performed otherwise if the flag is not set.
unsigned TargetFlags = GA->getTargetFlags();
if (TargetFlags != PPCII::MO_TPREL_FLAG)
if (!(TargetFlags == PPCII::MO_TPREL_FLAG ||
TargetFlags == PPCII::MO_TLSLD_FLAG))
return false;

// If all conditions are satisfied, the ADDI is valid for folding.
return true;
}

// For non-TOC-based local-exec access where an addi is feeding into another
// addi, fold this sequence into a single addi if possible.
// Before this optimization, the sequence appears as:
// addi rN, r13, sym@le
// For non-TOC-based local-[exec|dynamic] access where an addi is feeding into
// another addi, fold this sequence into a single addi if possible. Before this
// optimization, the sequence appears as:
// addi rN, r13, sym@[le|ld]
// addi rM, rN, imm
// After this optimization, we can fold the two addi into a single one:
// addi rM, r13, sym@le + imm
static void foldADDIForLocalExecAccesses(SDNode *N, SelectionDAG *DAG) {
// addi rM, r13, sym@[le|ld] + imm
static void foldADDIForFasterLocalAccesses(SDNode *N, SelectionDAG *DAG) {
if (N->getMachineOpcode() != PPC::ADDI8)
return;

// InitialADDI is the addi feeding into N (also an addi), and the addi that
// we want optimized out.
SDValue InitialADDI = N->getOperand(0);

if (!isEligibleToFoldADDIForLocalExecAccesses(DAG, InitialADDI))
if (!isEligibleToFoldADDIForFasterLocalAccesses(DAG, InitialADDI))
return;

// At this point, InitialADDI can be folded into a non-TOC-based local-exec
Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

A similar check already exists in isEligibleToFoldADDIForFasterLocalAccesses(), so I removed this one.

// access. The first operand of InitialADDI should be the thread pointer,
// which has been checked in isEligibleToFoldADDIForLocalExecAccesses().
SDValue TPRegNode = InitialADDI.getOperand(0);
[[maybe_unused]] RegisterSDNode *TPReg = dyn_cast<RegisterSDNode>(TPRegNode.getNode());
[[maybe_unused]] const PPCSubtarget &Subtarget =
DAG->getMachineFunction().getSubtarget<PPCSubtarget>();
assert((TPReg && (TPReg->getReg() == Subtarget.getThreadPointerRegister())) &&
"Expecting the first operand to be a thread pointer for folding addi "
"in local-exec accesses!");

// The second operand of the InitialADDI should be the global TLS address
// (the local-exec TLS variable), with the MO_TPREL_FLAG target flag.
// This has been checked in isEligibleToFoldADDIForLocalExecAccesses().
// (the local-[exec|dynamic] TLS variable), with the
// [MO_TPREL_FLAG|MO_TLSLD_FLAG] target flag. This has been checked in
// isEligibleToFoldADDIForFasterLocalAccesses().
SDValue TLSVarNode = InitialADDI.getOperand(1);
GlobalAddressSDNode *GA = dyn_cast<GlobalAddressSDNode>(TLSVarNode);
assert(GA && "Expecting a valid GlobalAddressSDNode when folding addi into "
"local-exec accesses!");
"local-[exec|dynamic] accesses!");
unsigned TargetFlags = GA->getTargetFlags();

// The second operand of the addi that we want to preserve will be an
Expand All @@ -7676,7 +7672,7 @@ static void foldADDIForLocalExecAccesses(SDNode *N, SelectionDAG *DAG) {
TLSVarNode = DAG->getTargetGlobalAddress(GA->getGlobal(), SDLoc(GA), MVT::i64,
Offset, TargetFlags);

(void)DAG->UpdateNodeOperands(N, TPRegNode, TLSVarNode);
(void)DAG->UpdateNodeOperands(N, InitialADDI.getOperand(0), TLSVarNode);
if (InitialADDI.getNode()->use_empty())
DAG->RemoveDeadNode(InitialADDI.getNode());
}
Expand All @@ -7693,8 +7689,9 @@ void PPCDAGToDAGISel::PeepholePPC64() {
if (isVSXSwap(SDValue(N, 0)))
reduceVSXSwap(N, CurDAG);

// This optimization is performed for non-TOC-based local-exec accesses.
foldADDIForLocalExecAccesses(N, CurDAG);
// This optimization is performed for non-TOC-based local-[exec|dynamic]
// accesses.
foldADDIForFasterLocalAccesses(N, CurDAG);

unsigned FirstOp;
unsigned StorageOpcode = N->getMachineOpcode();
Expand Down Expand Up @@ -7852,13 +7849,15 @@ void PPCDAGToDAGISel::PeepholePPC64() {
ImmOpnd = CurDAG->getTargetConstant(Offset, SDLoc(ImmOpnd),
ImmOpnd.getValueType());
} else if (Offset != 0) {
// This optimization is performed for non-TOC-based local-exec accesses.
if (isEligibleToFoldADDIForLocalExecAccesses(CurDAG, Base)) {
// This optimization is performed for non-TOC-based local-[exec|dynamic]
// accesses.
if (isEligibleToFoldADDIForFasterLocalAccesses(CurDAG, Base)) {
// Add the non-zero offset information into the load or store
// instruction to be used for non-TOC-based local-exec accesses.
// instruction to be used for non-TOC-based local-[exec|dynamic]
// accesses.
GlobalAddressSDNode *GA = dyn_cast<GlobalAddressSDNode>(ImmOpnd);
assert(GA && "Expecting a valid GlobalAddressSDNode when folding "
"addi into local-exec accesses!");
"addi into local-[exec|dynamic] accesses!");
ImmOpnd = CurDAG->getTargetGlobalAddress(GA->getGlobal(), SDLoc(GA),
MVT::i64, Offset,
GA->getTargetFlags());
Expand Down
Loading