Skip to content

[PowerPC] 32-bit large code-model support for toc-data #85129

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 4 commits into from
Apr 17, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion llvm/lib/Target/PowerPC/P10InstrResources.td
Original file line number Diff line number Diff line change
Expand Up @@ -881,7 +881,7 @@ def : InstRW<[P10W_FX_3C, P10W_DISP_ANY],
// 3 Cycles ALU operations, 1 input operands
def : InstRW<[P10W_FX_3C, P10W_DISP_ANY, P10FX_Read],
(instrs
ADDI, ADDI8, ADDIdtprelL32, ADDItlsldLADDR32, ADDItocL8, LI, LI8,
ADDI, ADDI8, ADDIdtprelL32, ADDItlsldLADDR32, ADDItocL, ADDItocL8, LI, LI8,
ADDIC, ADDIC8,
ADDIS, ADDIS8, ADDISdtprelHA32, ADDIStocHA, ADDIStocHA8, LIS, LIS8,
ADDME, ADDME8,
Expand Down
61 changes: 40 additions & 21 deletions llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1147,15 +1147,27 @@ void PPCAsmPrinter::emitInstruction(const MachineInstr *MI) {

MCSymbolRefExpr::VariantKind VK = GetVKForMO(MO);

// Always use TOC on AIX. Map the global address operand to be a reference
// to the TOC entry we will synthesize later. 'TOCEntry' is a label used to
// reference the storage allocated in the TOC which contains the address of
// 'MOSymbol'.
MCSymbol *TOCEntry =
lookUpOrCreateTOCEntry(MOSymbol, getTOCEntryTypeForMO(MO), VK);
const MCExpr *Exp = MCSymbolRefExpr::create(TOCEntry,
MCSymbolRefExpr::VK_PPC_U,
OutContext);
// If the symbol isn't toc-data then use the TOC on AIX.
// Map the global address operand to be a reference to the TOC entry we
// will synthesize later. 'TOCEntry' is a label used to reference the
// storage allocated in the TOC which contains the address of 'MOSymbol'.
// If the toc-data attribute is used, the TOC entry contains the data
// rather than the address of the MOSymbol.
if (![](const MachineOperand &MO) {
if (!MO.isGlobal())
return false;

const GlobalVariable *GV = dyn_cast<GlobalVariable>(MO.getGlobal());
if (!GV)
return false;

return GV->hasAttribute("toc-data");
}(MO)) {
MOSymbol = lookUpOrCreateTOCEntry(MOSymbol, getTOCEntryTypeForMO(MO), VK);
}

const MCExpr *Exp = MCSymbolRefExpr::create(
MOSymbol, MCSymbolRefExpr::VK_PPC_U, OutContext);
TmpInst.getOperand(2) = MCOperand::createExpr(Exp);
EmitToStreamer(*OutStreamer, TmpInst);
return;
Expand Down Expand Up @@ -1272,25 +1284,32 @@ void PPCAsmPrinter::emitInstruction(const MachineInstr *MI) {
EmitToStreamer(*OutStreamer, TmpInst);
return;
}
case PPC::ADDItocL:
case PPC::ADDItocL8: {
// Transform %xd = ADDItocL8 %xs, @sym
// Transform %xd = ADDItocL %xs, @sym
LowerPPCMachineInstrToMCInst(MI, TmpInst, *this);

// Change the opcode to ADDI8. If the global address is external, then
// generate a TOC entry and reference that. Otherwise, reference the
// symbol directly.
TmpInst.setOpcode(PPC::ADDI8);
unsigned Op = MI->getOpcode();

// Change the opcode to load address for tocdata
TmpInst.setOpcode(Op == PPC::ADDItocL8 ? PPC::ADDI8 : PPC::LA);

const MachineOperand &MO = MI->getOperand(2);
assert((MO.isGlobal() || MO.isCPI()) && "Invalid operand for ADDItocL8.");
assert((Op == PPC::ADDItocL8)
? (MO.isGlobal() || MO.isCPI())
: MO.isGlobal() && "Invalid operand for ADDItocL8.");
assert(!(MO.isGlobal() && Subtarget->isGVIndirectSymbol(MO.getGlobal())) &&
"Interposable definitions must use indirect accesses.");

LLVM_DEBUG(assert(
!(MO.isGlobal() && Subtarget->isGVIndirectSymbol(MO.getGlobal())) &&
"Interposable definitions must use indirect access."));
// Map the operand to its corresponding MCSymbol.
const MCSymbol *const MOSymbol = getMCSymbolForTOCPseudoMO(MO, *this);

const MCExpr *Exp = MCSymbolRefExpr::create(
MOSymbol,
Op == PPC::ADDItocL8 ? MCSymbolRefExpr::VK_PPC_TOC_LO
: MCSymbolRefExpr::VK_PPC_L,
OutContext);

const MCExpr *Exp =
MCSymbolRefExpr::create(getMCSymbolForTOCPseudoMO(MO, *this),
MCSymbolRefExpr::VK_PPC_TOC_LO, OutContext);
TmpInst.getOperand(2) = MCOperand::createExpr(Exp);
EmitToStreamer(*OutStreamer, TmpInst);
return;
Expand Down
35 changes: 27 additions & 8 deletions llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -510,7 +510,7 @@ SDNode *PPCDAGToDAGISel::getGlobalBaseReg() {
}

// Check if a SDValue has the toc-data attribute.
static bool hasTocDataAttr(SDValue Val, unsigned PointerSize) {
static bool hasTocDataAttr(SDValue Val) {
GlobalAddressSDNode *GA = dyn_cast<GlobalAddressSDNode>(Val);
if (!GA)
return false;
Expand Down Expand Up @@ -6115,8 +6115,7 @@ void PPCDAGToDAGISel::Select(SDNode *N) {

assert(isAIXABI && "ELF ABI already handled");

if (hasTocDataAttr(N->getOperand(0),
CurDAG->getDataLayout().getPointerSize())) {
if (hasTocDataAttr(N->getOperand(0))) {
replaceWith(PPC::ADDItoc, N, MVT::i32);
return;
}
Expand All @@ -6128,8 +6127,7 @@ void PPCDAGToDAGISel::Select(SDNode *N) {
if (isPPC64 && CModel == CodeModel::Small) {
assert(isAIXABI && "ELF ABI handled in common SelectCode");

if (hasTocDataAttr(N->getOperand(0),
CurDAG->getDataLayout().getPointerSize())) {
if (hasTocDataAttr(N->getOperand(0))) {
replaceWith(PPC::ADDItoc8, N, MVT::i64);
return;
}
Expand All @@ -6144,23 +6142,44 @@ void PPCDAGToDAGISel::Select(SDNode *N) {
" ELF/AIX or 32-bit AIX in the following.");

// Transforms the ISD::TOC_ENTRY node for 32-bit AIX large code model mode
// or 64-bit medium (ELF-only) or large (ELF and AIX) code model code. We
// generate two instructions as described below. The first source operand
// is a symbol reference. If it must be toc-referenced according to
// or 64-bit medium (ELF-only) or large (ELF and AIX) code model code, for
// non-toc-data symbols.
// We generate two instructions as described below. The first source
// operand is a symbol reference. If it must be toc-referenced according to
// Subtarget, we generate:
// [32-bit AIX]
// LWZtocL(@sym, ADDIStocHA(%r2, @sym))
// [64-bit ELF/AIX]
// LDtocL(@sym, ADDIStocHA8(%x2, @sym))
// Otherwise we generate:
// ADDItocL8(ADDIStocHA8(%x2, @sym), @sym)

// For large code model toc-data symbols we generate:
// [32-bit AIX]
// ADDItocL(ADDIStocHA(%r2, @sym), @sym)
// [64-bit AIX]
// Currently not supported.

SDValue GA = N->getOperand(0);
SDValue TOCbase = N->getOperand(1);

EVT VT = isPPC64 ? MVT::i64 : MVT::i32;
SDNode *Tmp = CurDAG->getMachineNode(
isPPC64 ? PPC::ADDIStocHA8 : PPC::ADDIStocHA, dl, VT, TOCbase, GA);

// On AIX if the symbol has the toc-data attribute it will be defined
// in the TOC entry, so we use an ADDItocL similar to the medium code
// model ELF abi.
if (isAIXABI && hasTocDataAttr(GA)) {
if (isPPC64)
report_fatal_error(
"64-bit large code model toc-data not yet supported");

ReplaceNode(N, CurDAG->getMachineNode(PPC::ADDItocL, dl, VT,
SDValue(Tmp, 0), GA));
return;
}

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

At line 6195, for 32-bit, should we use ADDItocL instead of ADDItocL8? Although it seems that no 32-bit case reaches that logic. (It uses MVT::i64 at this point, so on 32-bit it should fail?)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Line 6195 handles the Linux medium code model path. For 64-bit toc-data with the large code model, I will have a follow-up patch to use ADDItocL8 (with the appropriate enhancements made in PPCAsmPrinter.cpp). ADDItocL8 will then be used on line 6175 above.
I plan to post the 64-bit support next, once this 32-bit patch is approved and merged.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

OK. Thanks for the explanation.

if (PPCLowering->isAccessedAsGotIndirect(GA)) {
// If it is accessed as got-indirect, we need an extra LWZ/LD to load
// the address.
Expand Down
1 change: 1 addition & 0 deletions llvm/lib/Target/PowerPC/PPCInstrInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1077,6 +1077,7 @@ bool PPCInstrInfo::isReallyTriviallyReMaterializable(
case PPC::LIS8:
case PPC::ADDIStocHA:
case PPC::ADDIStocHA8:
case PPC::ADDItocL:
case PPC::ADDItocL8:
case PPC::LOAD_STACK_GUARD:
case PPC::PPCLdFixedAddr:
Expand Down
4 changes: 3 additions & 1 deletion llvm/lib/Target/PowerPC/PPCInstrInfo.td
Original file line number Diff line number Diff line change
Expand Up @@ -3346,11 +3346,13 @@ def ADDIStocHA : PPCEmitTimePseudo<(outs gprc:$rD), (ins gprc_nor0:$reg, tocentr
"#ADDIStocHA",
[(set i32:$rD,
(PPCtoc_entry i32:$reg, tglobaladdr:$disp))]>;
// Local Data Transform
// TOC Data Transform AIX
def ADDItoc : PPCEmitTimePseudo<(outs gprc:$rD), (ins tocentry32:$disp, gprc:$reg),
"#ADDItoc",
[(set i32:$rD,
(PPCtoc_entry tglobaladdr:$disp, i32:$reg))]>;
def ADDItocL : PPCEmitTimePseudo<(outs gprc:$rD), (ins gprc_nor0:$reg, tocentry32:$disp),
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Well, you defined a new pseudo that will be lowered to the real instruction addi. Should we also add this new pseudo to the following?

  • the scheduling model, e.g. llvm/lib/Target/PowerPC/P10InstrResources.td. (The regular expressions in the P9/P8 scheduling models already cover this. The P7 model does not, but it does not cover the 64-bit version either, so maybe we can just leave it for now.)
  • llvm/lib/Target/PowerPC/PPCMacroFusion.def
  • Peephole optimization in getForwardingDefMI

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I've added it to P10InstrResources.td and PPCMacroFusion.def.
getForwardingDefMI seems more involved and requires some investigation, so I'm thinking of leaving that for a follow-up patch.

"#ADDItocL", []>;

// Get Global (GOT) Base Register offset, from the word immediately preceding
// the function label.
Expand Down
10 changes: 5 additions & 5 deletions llvm/lib/Target/PowerPC/PPCMacroFusion.def
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@
// {addi} followed by one of these {lxvd2x, lxvw4x, lxvdsx, lvebx, lvehx,
// lvewx, lvx, lxsdx}
FUSION_FEATURE(AddiLoad, hasAddiLoadFusion, 2, \
FUSION_OP_SET(ADDI, ADDI8, ADDItocL8), \
FUSION_OP_SET(ADDI, ADDI8, ADDItocL, ADDItocL8), \
FUSION_OP_SET(LXVD2X, LXVW4X, LXVDSX, LVEBX, LVEHX, LVEWX, \
LVX, LXSDX))

Expand Down Expand Up @@ -134,13 +134,13 @@ FUSION_FEATURE(XorisXori, hasWideImmFusion, 1, FUSION_OP_SET(XORIS, XORIS8),

// addis rx,ra,si - addi rt,rx,SI, SI >= 0
FUSION_FEATURE(AddisAddi, hasWideImmFusion, 1,
FUSION_OP_SET(ADDIS, ADDIS8, ADDIStocHA8),
FUSION_OP_SET(ADDI, ADDI8, ADDItocL8))
FUSION_OP_SET(ADDIS, ADDIS8, ADDIStocHA8, ADDIStocHA),
FUSION_OP_SET(ADDI, ADDI8, ADDItocL8, ADDItocL))

// addi rx,ra,si - addis rt,rx,SI, ra > 0, SI >= 2
FUSION_FEATURE(AddiAddis, hasWideImmFusion, 1,
FUSION_OP_SET(ADDI, ADDI8, ADDItocL8),
FUSION_OP_SET(ADDIS, ADDIS8, ADDIStocHA8))
FUSION_OP_SET(ADDI, ADDI8, ADDItocL8, ADDItocL),
FUSION_OP_SET(ADDIS, ADDIS8, ADDIStocHA8, ADDIStocHA))

// mtctr - { bcctr,bcctrl }
FUSION_FEATURE(ZeroMoveCTR, hasZeroMoveFusion, -1,
Expand Down
67 changes: 67 additions & 0 deletions llvm/test/CodeGen/PowerPC/toc-data.ll
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,14 @@
; RUN: llc -mtriple powerpc-ibm-aix-xcoff -verify-machineinstrs -O0 < %s | FileCheck %s --check-prefix TEST32
; RUN: llc -mtriple powerpc64-ibm-aix-xcoff -verify-machineinstrs -O0 < %s | FileCheck %s --check-prefix TEST64

; RUN: llc -mtriple powerpc-ibm-aix-xcoff -code-model=large -verify-machineinstrs < %s \
; RUN: -stop-before=ppc-vsx-copy | FileCheck %s --check-prefix CHECK32LARGE
; RUN: llc -mtriple powerpc-ibm-aix-xcoff -code-model=large -verify-machineinstrs < %s | FileCheck %s --check-prefix TEST32LARGE

; Global variables i and f have the toc-data attribute.
; In the following functions, those writing to or reading from
; variables i and f should use the toc-data access pattern.
; All remaining variables should use the regular toc access sequence.
@i = dso_local global i32 0, align 4 #0
@d = dso_local local_unnamed_addr global double 3.141590e+00, align 8
@f = dso_local local_unnamed_addr global float 0x4005BE76C0000000, align 4 #0
Expand Down Expand Up @@ -44,6 +52,16 @@ define dso_local void @write_int(i32 signext %in) {
; TEST64: la 4, i[TD](2)
; TEST64-NEXT: stw 3, 0(4)

; CHECK32LARGE: name: write_int
; CHECK32LARGE: %[[SCRATCH1:[0-9]+]]:gprc_and_gprc_nor0 = ADDIStocHA $r2, @i
; CHECK32LARGE-NEXT: %[[SCRATCH2:[0-9]+]]:gprc_and_gprc_nor0 = ADDItocL killed %[[SCRATCH1]], @i
; CHECK32LARGE-NEXT: STW %{{[0-9]+}}, 0, killed %[[SCRATCH2]] :: (store (s32) into @i)

; FIXME: peephole optimization opportunity for lower part relocation @l to the consuming stw
; TEST32LARGE: .write_int:
; TEST32LARGE: addis 4, i[TD]@u(2)
; TEST32LARGE-NEXT: la 4, i[TD]@l(4)
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not related to this patch, but shouldn't there be a peephole opportunity here to fold the lower-part relocation @l into the consuming stw?

Can we add a FIXME here? We are trying to improve the code generation for toc-data with the small code model in #76488, but it seems we also have the same opportunity for the large code model.

; TEST32LARGE-NEXT: stw 3, 0(4)

define dso_local i64 @read_ll() {
entry:
Expand All @@ -70,6 +88,15 @@ define dso_local i64 @read_ll() {
; TEST64: ld 3, L..C0(2)
; TEST64-NEXT: ld 3, 0(3)

; CHECK32LARGE: name: read_ll
; CHECK32LARGE: %[[SCRATCH1:[0-9]+]]:gprc_and_gprc_nor0 = ADDIStocHA $r2, @ll
; CHECK32LARGE: LWZtocL @ll, killed %[[SCRATCH1]] :: (load (s32) from got)

; TEST32LARGE: .read_ll:
; TEST32LARGE: addis 3, L..C0@u(2)
; TEST32LARGE-NEXT: lwz 4, L..C0@l(3)
; TEST32LARGE-NEXT: lwz 3, 0(4)
; TEST32LARGE-NEXT: lwz 4, 4(4)

define dso_local float @read_float() {
entry:
Expand All @@ -96,6 +123,16 @@ define dso_local float @read_float() {
; TEST64: la 3, f[TD](2)
; TEST64-NEXT: lfs 1, 0(3)

; CHECK32LARGE: name: read_float
; CHECK32LARGE: %[[SCRATCH1:[0-9]+]]:gprc_and_gprc_nor0 = ADDIStocHA $r2, @f
; CHECK32LARGE-NEXT: %[[SCRATCH2:[0-9]+]]:gprc_and_gprc_nor0 = ADDItocL killed %[[SCRATCH1]], @f
; CHECK32LARGE-NEXT: LFS 0, killed %[[SCRATCH2]] :: (dereferenceable load (s32) from @f)

; FIXME: peephole optimization opportunity for lower part relocation @l to the consuming lfs
; TEST32LARGE: .read_float:
; TEST32LARGE: addis 3, f[TD]@u(2)
; TEST32LARGE-NEXT: la 3, f[TD]@l(3)
; TEST32LARGE-NEXT: lfs 1, 0(3)
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Same here: shouldn't the la be mergeable into the following lfs?


define dso_local void @write_double(double %in) {
entry:
Expand All @@ -121,6 +158,14 @@ define dso_local void @write_double(double %in) {
; TEST64: ld 3, L..C1(2)
; TEST64-NEXT: stfd 1, 0(3)

; CHECK32LARGE: name: write_double
; CHECK32LARGE: %[[SCRATCH1:[0-9]+]]:gprc_and_gprc_nor0 = ADDIStocHA $r2, @d
; CHECK32LARGE: LWZtocL @d, killed %[[SCRATCH1]] :: (load (s32) from got)

; TEST32LARGE: .write_double:
; TEST32LARGE: addis 3, L..C1@u(2)
; TEST32LARGE-NEXT: lwz 3, L..C1@l(3)
; TEST32LARGE-NEXT: stfd 1, 0(3)

define dso_local nonnull ptr @addr() {
entry:
Expand All @@ -144,6 +189,15 @@ define dso_local nonnull ptr @addr() {
; TEST64: .addr
; TEST64: la 3, i[TD](2)

; CHECK32LARGE: name: addr
; CHECK32LARGE: %[[SCRATCH1:[0-9]+]]:gprc_and_gprc_nor0 = ADDIStocHA $r2, @i
; CHECK32LARGE-NEXT: %[[SCRATCH2:[0-9]+]]:gprc = ADDItocL killed %[[SCRATCH1]], @i
; CHECK32LARGE-NEXT: $r3 = COPY %[[SCRATCH2]]

; TEST32LARGE: .addr:
; TEST32LARGE: addis 3, i[TD]@u(2)
; TEST32LARGE-NEXT: la 3, i[TD]@l(3)

; TEST32: .toc
; TEST32: .tc ll[TC],ll[RW]
; TEST32-NOT: .csect ll[TD]
Expand All @@ -170,4 +224,17 @@ define dso_local nonnull ptr @addr() {
; TEST64-NEXT: .globl f[TD]
; TEST64-NOT: .tc f[TD],f[RW]

; TEST32LARGE: .toc
; TEST32LARGE: .tc ll[TE],ll[RW]
; TEST32LARGE-NOT: .csect ll[TD]
; TEST32LARGE: .tc d[TE],d[RW]
; TEST32LARGE-NOT: .csect d[TD],2
; TEST32LARGE: .csect i[TD],2
; TEST32LARGE-NEXT: .globl i[TD]
; TEST32LARGE-NEXT: .align 2
; TEST32LARGE-NOT: .tc i[TE],i[RW]
; TEST32LARGE: .csect f[TD],2
; TEST32LARGE-NEXT: .globl f[TD]
; TEST32LARGE-NOT: .tc f[TE],f[RW]

attributes #0 = { "toc-data" }