Skip to content

[AMDGPU] Add pal metadata 3.0 support to callable pal funcs #67104

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 10 commits into from
Feb 6, 2024
54 changes: 32 additions & 22 deletions llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -997,6 +997,26 @@ void AMDGPUAsmPrinter::EmitProgramInfoSI(const MachineFunction &MF,
OutStreamer->emitInt32(MFI->getNumSpilledVGPRs());
}

// Writes the hardware-stage fields shared by the PAL metadata 3.0+ emitters
// into the .hardware_stages entry selected by CC. The IEEE, WGP and
// memory-ordered mode flags are always written; the trap-handler flag, the
// exception-enable value and the LDS size are written only for compute
// calling conventions.
static void EmitPALMetadataCommon(AMDGPUPALMetadata *MD,
                                  const SIProgramInfo &CurrentProgramInfo,
                                  CallingConv::ID CC,
                                  const MCSubtargetInfo &ST) {
  const SIProgramInfo &Info = CurrentProgramInfo;

  MD->setHwStage(CC, ".ieee_mode", static_cast<bool>(Info.IEEEMode));
  MD->setHwStage(CC, ".wgp_mode", static_cast<bool>(Info.WgpMode));
  MD->setHwStage(CC, ".mem_ordered", static_cast<bool>(Info.MemOrdered));

  // Everything below applies to compute calling conventions only.
  if (!AMDGPU::isCompute(CC))
    return;

  MD->setHwStage(CC, ".trap_present",
                 static_cast<bool>(Info.TrapHandlerEnable));
  MD->setHwStage(CC, ".excp_en", Info.EXCPEnable);

  // Scale the stored LDS size by the dword granularity and by the width of a
  // dword before recording it.
  const unsigned LdsSizeScaled = static_cast<unsigned>(
      Info.LdsSize * getLdsDwGranularity(ST) * sizeof(uint32_t));
  MD->setHwStage(CC, ".lds_size", LdsSizeScaled);
}

// This is the equivalent of EmitProgramInfoSI above, but for when the OS type
// is AMDPAL. It stores each compute/SPI register setting and other PAL
// metadata items into the PALMD::Metadata, combining with any provided by the
Expand Down Expand Up @@ -1028,24 +1048,8 @@ void AMDGPUAsmPrinter::EmitPALMetadata(const MachineFunction &MF,
}
} else {
MD->setHwStage(CC, ".debug_mode", (bool)CurrentProgramInfo.DebugMode);
MD->setHwStage(CC, ".ieee_mode", (bool)CurrentProgramInfo.IEEEMode);
MD->setHwStage(CC, ".wgp_mode", (bool)CurrentProgramInfo.WgpMode);
MD->setHwStage(CC, ".mem_ordered", (bool)CurrentProgramInfo.MemOrdered);

if (AMDGPU::isCompute(CC)) {
MD->setHwStage(CC, ".scratch_en", (bool)CurrentProgramInfo.ScratchEnable);
MD->setHwStage(CC, ".trap_present",
(bool)CurrentProgramInfo.TrapHandlerEnable);

// EXCPEnMSB?
const unsigned LdsDwGranularity = 128;
MD->setHwStage(CC, ".lds_size",
(unsigned)(CurrentProgramInfo.LdsSize * LdsDwGranularity *
sizeof(uint32_t)));
MD->setHwStage(CC, ".excp_en", CurrentProgramInfo.EXCPEnable);
} else {
MD->setHwStage(CC, ".scratch_en", (bool)CurrentProgramInfo.ScratchEnable);
}
MD->setHwStage(CC, ".scratch_en", (bool)CurrentProgramInfo.ScratchEnable);
EmitPALMetadataCommon(MD, CurrentProgramInfo, CC, *getGlobalSTI());
}

// ScratchSize is in bytes, 16 aligned.
Expand Down Expand Up @@ -1098,10 +1102,16 @@ void AMDGPUAsmPrinter::emitPALFunctionMetadata(const MachineFunction &MF) {
StringRef FnName = MF.getFunction().getName();
MD->setFunctionScratchSize(FnName, MFI.getStackSize());

// Set compute registers
MD->setRsrc1(CallingConv::AMDGPU_CS,
CurrentProgramInfo.getPGMRSrc1(CallingConv::AMDGPU_CS));
MD->setRsrc2(CallingConv::AMDGPU_CS, CurrentProgramInfo.getComputePGMRSrc2());
if (MD->getPALMajorVersion() < 3) {
// Set compute registers
MD->setRsrc1(CallingConv::AMDGPU_CS,
CurrentProgramInfo.getPGMRSrc1(CallingConv::AMDGPU_CS));
MD->setRsrc2(CallingConv::AMDGPU_CS,
CurrentProgramInfo.getComputePGMRSrc2());
} else {
EmitPALMetadataCommon(MD, CurrentProgramInfo, CallingConv::AMDGPU_CS,
*getGlobalSTI());
}

// Set optional info
MD->setFunctionLdsSize(FnName, CurrentProgramInfo.LDSSize);
Expand Down
5 changes: 5 additions & 0 deletions llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2705,6 +2705,11 @@ bool isDPALU_DPP(const MCInstrDesc &OpDesc) {
return hasAny64BitVGPROperands(OpDesc);
}

unsigned getLdsDwGranularity(const MCSubtargetInfo &ST) {
  // A single value is shared by all subtargets at present, so ST is not
  // consulted.
  constexpr unsigned LdsDwGranularity = 128;
  return LdsDwGranularity;
}

} // namespace AMDGPU

raw_ostream &operator<<(raw_ostream &OS,
Expand Down
5 changes: 5 additions & 0 deletions llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
Original file line number Diff line number Diff line change
Expand Up @@ -1344,6 +1344,11 @@ bool isIntrinsicSourceOfDivergence(unsigned IntrID);
/// \returns true if the intrinsic is uniform
bool isIntrinsicAlwaysUniform(unsigned IntrID);

/// \returns the LDS block size in terms of dwords.
/// This is used to calculate the LDS size encoded for PAL metadata 3.0+, which
/// must be defined in terms of bytes.
/// \param ST the subtarget the granularity is requested for.
unsigned getLdsDwGranularity(const MCSubtargetInfo &ST);

} // end namespace AMDGPU

raw_ostream &operator<<(raw_ostream &OS,
Expand Down
290 changes: 290 additions & 0 deletions llvm/test/CodeGen/AMDGPU/pal-metadata-3.0-callable.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,290 @@
; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck %s

; CHECK: .amdgpu_pal_metadata
; CHECK-NEXT: ---
; CHECK-NEXT: amdpal.pipelines:
; CHECK-NEXT: - .api: Vulkan
; CHECK-NEXT: .compute_registers:
; CHECK-NEXT: .tg_size_en: true
; CHECK-NEXT: .tgid_x_en: false
; CHECK-NEXT: .tgid_y_en: false
; CHECK-NEXT: .tgid_z_en: false
; CHECK-NEXT: .tidig_comp_cnt: 0x1
; CHECK-NEXT: .hardware_stages:
; CHECK-NEXT: .cs:
; CHECK-NEXT: .checksum_value: 0x9444d7d0
; CHECK-NEXT: .debug_mode: 0
; CHECK-NEXT: .excp_en: 0
; CHECK-NEXT: .float_mode: 0xc0
; CHECK-NEXT: .ieee_mode: true
; CHECK-NEXT: .image_op: false
; CHECK-NEXT: .lds_size: 0x200
; CHECK-NEXT: .mem_ordered: true
; CHECK-NEXT: .sgpr_limit: 0x6a
; CHECK-NEXT: .threadgroup_dimensions:
; CHECK-NEXT: - 0x1
; CHECK-NEXT: - 0x400
; CHECK-NEXT: - 0x1
; CHECK-NEXT: .trap_present: false
; CHECK-NEXT: .user_data_reg_map:
; CHECK-NEXT: - 0x10000000
; CHECK-NEXT: - 0xffffffff
; CHECK-NEXT: - 0
; CHECK-NEXT: - 0xffffffff
; CHECK-NEXT: - 0xffffffff
; CHECK-NEXT: - 0xffffffff
; CHECK-NEXT: - 0xffffffff
; CHECK-NEXT: - 0xffffffff
; CHECK-NEXT: - 0xffffffff
; CHECK-NEXT: - 0xffffffff
; CHECK-NEXT: - 0xffffffff
; CHECK-NEXT: - 0xffffffff
; CHECK-NEXT: - 0xffffffff
; CHECK-NEXT: - 0xffffffff
; CHECK-NEXT: - 0xffffffff
; CHECK-NEXT: - 0xffffffff
; CHECK-NEXT: - 0xffffffff
; CHECK-NEXT: - 0xffffffff
; CHECK-NEXT: - 0xffffffff
; CHECK-NEXT: - 0xffffffff
; CHECK-NEXT: - 0xffffffff
; CHECK-NEXT: - 0xffffffff
; CHECK-NEXT: - 0xffffffff
; CHECK-NEXT: - 0xffffffff
; CHECK-NEXT: - 0xffffffff
; CHECK-NEXT: - 0xffffffff
; CHECK-NEXT: - 0xffffffff
; CHECK-NEXT: - 0xffffffff
; CHECK-NEXT: - 0xffffffff
; CHECK-NEXT: - 0xffffffff
; CHECK-NEXT: - 0xffffffff
; CHECK-NEXT: - 0xffffffff
; CHECK-NEXT: .user_sgprs: 0x3
; CHECK-NEXT: .vgpr_limit: 0x100
; CHECK-NEXT: .wavefront_size: 0x40
; CHECK-NEXT: .wgp_mode: true
; CHECK: .registers: {}
; CHECK-NEXT: .shader_functions:
; CHECK-NEXT: dynamic_stack:
; CHECK-NEXT: .lds_size: 0
; CHECK-NEXT: .sgpr_count: 0x22
; CHECK-NEXT: .stack_frame_size_in_bytes: 0x10
; CHECK-NEXT: .vgpr_count: 0x2
; CHECK-NEXT: dynamic_stack_loop:
; CHECK-NEXT: .lds_size: 0
; CHECK-NEXT: .sgpr_count: 0x22
; CHECK-NEXT: .stack_frame_size_in_bytes: 0x10
; CHECK-NEXT: .vgpr_count: 0x3
; CHECK-NEXT: multiple_stack:
; CHECK-NEXT: .lds_size: 0
; CHECK-NEXT: .sgpr_count: 0x21
; CHECK-NEXT: .stack_frame_size_in_bytes: 0x24
; CHECK-NEXT: .vgpr_count: 0x3
; CHECK-NEXT: no_stack:
; CHECK-NEXT: .lds_size: 0
; CHECK-NEXT: .sgpr_count: 0x20
; CHECK-NEXT: .stack_frame_size_in_bytes: 0
; CHECK-NEXT: .vgpr_count: 0x1
; CHECK-NEXT: no_stack_call:
; CHECK-NEXT: .lds_size: 0
; CHECK-NEXT: .sgpr_count: 0x22
; CHECK-NEXT: .stack_frame_size_in_bytes: 0x10
; CHECK-NEXT: .vgpr_count: 0x3
; CHECK-NEXT: no_stack_extern_call:
; CHECK-NEXT: .lds_size: 0
; CHECK-NEXT: .sgpr_count: 0x29
; CHECK-NEXT: .stack_frame_size_in_bytes: 0x10
; CHECK-NEXT: .vgpr_count: 0x58
; CHECK-NEXT: no_stack_extern_call_many_args:
; CHECK-NEXT: .lds_size: 0
; CHECK-NEXT: .sgpr_count: 0x29
; CHECK-NEXT: .stack_frame_size_in_bytes: 0x90
; CHECK-NEXT: .vgpr_count: 0x58
; CHECK-NEXT: no_stack_indirect_call:
; CHECK-NEXT: .lds_size: 0
; CHECK-NEXT: .sgpr_count: 0x29
; CHECK-NEXT: .stack_frame_size_in_bytes: 0x10
; CHECK-NEXT: .vgpr_count: 0x58
; CHECK-NEXT: simple_lds:
; CHECK-NEXT: .lds_size: 0x100
; CHECK-NEXT: .sgpr_count: 0x20
; CHECK-NEXT: .stack_frame_size_in_bytes: 0
; CHECK-NEXT: .vgpr_count: 0x1
; CHECK-NEXT: simple_lds_recurse:
; CHECK-NEXT: .lds_size: 0x100
; CHECK-NEXT: .sgpr_count: 0x24
; CHECK-NEXT: .stack_frame_size_in_bytes: 0x10
; CHECK-NEXT: .vgpr_count: 0x29
; CHECK-NEXT: simple_stack:
; CHECK-NEXT: .lds_size: 0
; CHECK-NEXT: .sgpr_count: 0x21
; CHECK-NEXT: .stack_frame_size_in_bytes: 0x14
; CHECK-NEXT: .vgpr_count: 0x2
; CHECK-NEXT: simple_stack_call:
; CHECK-NEXT: .lds_size: 0
; CHECK-NEXT: .sgpr_count: 0x22
; CHECK-NEXT: .stack_frame_size_in_bytes: 0x20
; CHECK-NEXT: .vgpr_count: 0x4
; CHECK-NEXT: simple_stack_extern_call:
; CHECK-NEXT: .lds_size: 0
; CHECK-NEXT: .sgpr_count: 0x29
; CHECK-NEXT: .stack_frame_size_in_bytes: 0x20
; CHECK-NEXT: .vgpr_count: 0x58
; CHECK-NEXT: simple_stack_indirect_call:
; CHECK-NEXT: .lds_size: 0
; CHECK-NEXT: .sgpr_count: 0x29
; CHECK-NEXT: .stack_frame_size_in_bytes: 0x20
; CHECK-NEXT: .vgpr_count: 0x58
; CHECK-NEXT: simple_stack_recurse:
; CHECK-NEXT: .lds_size: 0
; CHECK-NEXT: .sgpr_count: 0x24
; CHECK-NEXT: .stack_frame_size_in_bytes: 0x20
; CHECK-NEXT: .vgpr_count: 0x2a
; CHECK:amdpal.version:
; CHECK-NEXT: - 0x3
; CHECK-NEXT: - 0
; CHECK-NEXT:...
; CHECK-NEXT: .end_amdgpu_pal_metadata

declare amdgpu_gfx float @extern_func(float) #0
declare amdgpu_gfx float @extern_func_many_args(<64 x float>) #0

@funcptr = external hidden unnamed_addr addrspace(4) constant ptr, align 4

; Leaf function without any alloca or call: returns %arg0 + 1.0.
define amdgpu_gfx float @no_stack(float %arg0) #0 {
%add = fadd float %arg0, 1.0
ret float %add
}

; Leaf function with one addrspace(5) alloca of four floats. A volatile
; store/load of 2.0 goes through the alloca, and the loaded value is added to
; %arg0.
define amdgpu_gfx float @simple_stack(float %arg0) #0 {
%stack = alloca float, i32 4, align 4, addrspace(5)
store volatile float 2.0, ptr addrspace(5) %stack
%val = load volatile float, ptr addrspace(5) %stack
%add = fadd float %arg0, %val
ret float %add
}

; Leaf function with two separate addrspace(5) allocas of four floats each.
; Each one is written and read back with volatile accesses, and both loaded
; values are accumulated onto %arg0.
define amdgpu_gfx float @multiple_stack(float %arg0) #0 {
%stack = alloca float, i32 4, align 4, addrspace(5)
store volatile float 2.0, ptr addrspace(5) %stack
%val = load volatile float, ptr addrspace(5) %stack
%add = fadd float %arg0, %val
%stack2 = alloca float, i32 4, align 4, addrspace(5)
store volatile float 2.0, ptr addrspace(5) %stack2
%val2 = load volatile float, ptr addrspace(5) %stack2
%add2 = fadd float %add, %val2
ret float %add2
}

; The alloca lives in %bb1 rather than in the entry block, and %bb1 is only
; reached when %arg0 > 0.0. The result is 0.0 when %bb1 is skipped, otherwise
; %arg0 plus the value loaded back from the alloca.
define amdgpu_gfx float @dynamic_stack(float %arg0) #0 {
bb0:
%cmp = fcmp ogt float %arg0, 0.0
br i1 %cmp, label %bb1, label %bb2

bb1:
%stack = alloca float, i32 4, align 4, addrspace(5)
store volatile float 2.0, ptr addrspace(5) %stack
%val = load volatile float, ptr addrspace(5) %stack
%add = fadd float %arg0, %val
br label %bb2

bb2:
%res = phi float [ 0.0, %bb0 ], [ %add, %bb1 ]
ret float %res
}

; The alloca sits inside the self-looping block %bb1, which branches back to
; itself while the counter %ctr is greater than zero. The counter starts at 0
; and is decremented on each pass.
define amdgpu_gfx float @dynamic_stack_loop(float %arg0) #0 {
bb0:
br label %bb1

bb1:
%ctr = phi i32 [ 0, %bb0 ], [ %newctr, %bb1 ]
%stack = alloca float, i32 4, align 4, addrspace(5)
store volatile float 2.0, ptr addrspace(5) %stack
%val = load volatile float, ptr addrspace(5) %stack
%add = fadd float %arg0, %val
%cmp = icmp sgt i32 %ctr, 0
%newctr = sub i32 %ctr, 1
br i1 %cmp, label %bb1, label %bb2

bb2:
ret float %add
}

; Has no alloca of its own; directly calls @simple_stack, which is defined in
; this file, and returns its result.
define amdgpu_gfx float @no_stack_call(float %arg0) #0 {
%res = call amdgpu_gfx float @simple_stack(float %arg0)
ret float %res
}

; Combines a local addrspace(5) alloca with a direct call to @simple_stack,
; which is defined in this file. Returns the call result plus the value read
; back from the alloca.
define amdgpu_gfx float @simple_stack_call(float %arg0) #0 {
%stack = alloca float, i32 4, align 4, addrspace(5)
store volatile float 2.0, ptr addrspace(5) %stack
%val = load volatile float, ptr addrspace(5) %stack
%res = call amdgpu_gfx float @simple_stack(float %arg0)
%add = fadd float %res, %val
ret float %add
}

; Has no alloca of its own; calls @extern_func, which is only declared (not
; defined) in this file, and returns its result.
define amdgpu_gfx float @no_stack_extern_call(float %arg0) #0 {
%res = call amdgpu_gfx float @extern_func(float %arg0)
ret float %res
}

; Combines a local addrspace(5) alloca with a call to @extern_func, which is
; only declared in this file. Returns the call result plus the value read back
; from the alloca.
define amdgpu_gfx float @simple_stack_extern_call(float %arg0) #0 {
%stack = alloca float, i32 4, align 4, addrspace(5)
store volatile float 2.0, ptr addrspace(5) %stack
%val = load volatile float, ptr addrspace(5) %stack
%res = call amdgpu_gfx float @extern_func(float %arg0)
%add = fadd float %res, %val
ret float %add
}

; Has no alloca of its own; forwards its <64 x float> argument to
; @extern_func_many_args, which is only declared in this file, and returns the
; result.
define amdgpu_gfx float @no_stack_extern_call_many_args(<64 x float> %arg0) #0 {
%res = call amdgpu_gfx float @extern_func_many_args(<64 x float> %arg0)
ret float %res
}

; Has no alloca of its own; loads a function pointer from the external global
; @funcptr, calls it indirectly with no arguments, and returns %arg0 unchanged.
define amdgpu_gfx float @no_stack_indirect_call(float %arg0) #0 {
%fptr = load ptr, ptr addrspace(4) @funcptr
call amdgpu_gfx void %fptr()
ret float %arg0
}

; Combines a local addrspace(5) alloca with an indirect call through the
; pointer loaded from @funcptr. Returns %arg0 plus the value read back from the
; alloca.
define amdgpu_gfx float @simple_stack_indirect_call(float %arg0) #0 {
%stack = alloca float, i32 4, align 4, addrspace(5)
store volatile float 2.0, ptr addrspace(5) %stack
%val = load volatile float, ptr addrspace(5) %stack
%fptr = load ptr, ptr addrspace(4) @funcptr
call amdgpu_gfx void %fptr()
%add = fadd float %arg0, %val
ret float %add
}

; Combines a local addrspace(5) alloca with an unconditional direct call to
; itself. Returns the call result plus the value read back from the alloca.
define amdgpu_gfx float @simple_stack_recurse(float %arg0) #0 {
%stack = alloca float, i32 4, align 4, addrspace(5)
store volatile float 2.0, ptr addrspace(5) %stack
%val = load volatile float, ptr addrspace(5) %stack
%res = call amdgpu_gfx float @simple_stack_recurse(float %arg0)
%add = fadd float %res, %val
ret float %add
}

@lds = internal addrspace(3) global [64 x float] undef

; Leaf function that loads one float from @lds, the 64-float addrspace(3)
; global defined in this file, and returns it. %arg0 is unused.
define amdgpu_gfx float @simple_lds(float %arg0) #0 {
%val = load float, ptr addrspace(3) @lds
ret float %val
}

; Loads one float from the addrspace(3) global @lds and passes it to an
; unconditional direct call to itself, returning that call's result.
define amdgpu_gfx float @simple_lds_recurse(float %arg0) #0 {
%val = load float, ptr addrspace(3) @lds
%res = call amdgpu_gfx float @simple_lds_recurse(float %val)
ret float %res
}

attributes #0 = { nounwind }

!amdgpu.pal.metadata.msgpack = !{!0}

!0 = !{!"\82\B0amdpal.pipelines\91\8A\A4.api\A6Vulkan\B2.compute_registers\85\AB.tg_size_en\C3\AA.tgid_x_en\C2\AA.tgid_y_en\C2\AA.tgid_z_en\C2\AF.tidig_comp_cnt\01\B0.hardware_stages\81\A3.cs\8C\AF.checksum_value\CE\94D\D7\D0\AB.debug_mode\00\AB.float_mode\CC\C0\A9.image_op\C2\AC.mem_ordered\C3\AB.sgpr_limitj\B7.threadgroup_dimensions\93\01\CD\04\00\01\AD.trap_present\00\B2.user_data_reg_map\DC\00 \CE\10\00\00\00\CE\FF\FF\FF\FF\00\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\AB.user_sgprs\03\AB.vgpr_limit\CD\01\00\AF.wavefront_size@\B7.internal_pipeline_hash\92\CF\E7\10k\A6:\A6%\F7\CF\B2\1F\1A\D4{\DA\E1T\AA.registers\80\A8.shaders\81\A8.compute\82\B0.api_shader_hash\92\CF\E9Zn7}\1E\B9\E7\00\B1.hardware_mapping\91\A3.cs\B0.spill_threshold\CE\FF\FF\FF\FF\A5.type\A2Cs\B0.user_data_limit\01\AF.xgl_cache_info\82\B3.128_bit_cache_hash\92\CF\B4X\B8\11[\A4\88P\CF\A0;\B0\AF\FF\B4\BE\C0\AD.llpc_version\A461.1\AEamdpal.version\92\03\00"}
!1 = !{i32 7}