Skip to content

[AMDGPU] Add dynamic LDS size implicit kernel argument to CO-v5 #65273

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Jan 4, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions llvm/docs/AMDGPUUsage.rst
Original file line number Diff line number Diff line change
Expand Up @@ -4114,6 +4114,9 @@ Code object V5 metadata is the same as
buffer that conforms to the requirements of the malloc/free
device library V1 version implementation.

"hidden_dynamic_lds_size"
The size of the dynamically allocated LDS memory, passed in the kernarg.

"hidden_private_base"
The high 32 bits of the flat addressing private aperture base.
Only used by GFX8 to allow conversion between private segment
Expand Down
1 change: 1 addition & 0 deletions llvm/lib/BinaryFormat/AMDGPUMetadataVerifier.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -134,6 +134,7 @@ bool MetadataVerifier::verifyKernelArgs(msgpack::DocNode &Node) {
.Case("hidden_default_queue", true)
.Case("hidden_completion_action", true)
.Case("hidden_multigrid_sync_arg", true)
.Case("hidden_dynamic_lds_size", true)
.Case("hidden_private_base", true)
.Case("hidden_shared_base", true)
.Case("hidden_queue_ptr", true)
Expand Down
10 changes: 9 additions & 1 deletion llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -646,7 +646,15 @@ void MetadataStreamerMsgPackV5::emitHiddenKernelArgs(
Offset += 8; // Skipped.
}

Offset += 72; // Reserved.
// Emit argument for hidden dynamic lds size
if (MFI.isDynamicLDSUsed()) {
emitKernelArg(DL, Int32Ty, Align(4), "hidden_dynamic_lds_size", Offset,
Args);
} else {
Offset += 4; // skipped
}

Offset += 68; // Reserved.

// hidden_private_base and hidden_shared_base are only when the subtarget has
// ApertureRegs.
Expand Down
39 changes: 30 additions & 9 deletions llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,26 @@

using namespace llvm;

/// Looks up the global named "llvm.amdgcn.<name of F>.dynlds" in the module
/// that owns \p F and returns the result of Module::getNamedGlobal for it.
static const GlobalVariable *
getKernelDynLDSGlobalFromFunction(const Function &F) {
  // Assemble the per-function symbol name piece by piece in a SmallString.
  SmallString<64> DynLDSName;
  DynLDSName += "llvm.amdgcn.";
  DynLDSName += F.getName();
  DynLDSName += ".dynlds";
  return F.getParent()->getNamedGlobal(DynLDSName);
}

/// Returns true when at least one formal argument of \p F has a pointer type
/// whose address space is AMDGPUAS::LOCAL_ADDRESS, and false otherwise.
static bool hasLDSKernelArgument(const Function &F) {
  for (const Argument &Param : F.args()) {
    // dyn_cast yields null for non-pointer arguments; those are skipped.
    const auto *ParamPtrTy = dyn_cast<PointerType>(Param.getType());
    if (!ParamPtrTy)
      continue;
    if (ParamPtrTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS)
      return true;
  }
  return false;
}

AMDGPUMachineFunction::AMDGPUMachineFunction(const Function &F,
const AMDGPUSubtarget &ST)
: IsEntryFunction(AMDGPU::isEntryFunctionCC(F.getCallingConv())),
Expand Down Expand Up @@ -65,6 +85,10 @@ AMDGPUMachineFunction::AMDGPUMachineFunction(const Function &F,
Attribute NSZAttr = F.getFnAttribute("no-signed-zeros-fp-math");
NoSignedZerosFPMath =
NSZAttr.isStringAttribute() && NSZAttr.getValueAsString() == "true";

const GlobalVariable *DynLdsGlobal = getKernelDynLDSGlobalFromFunction(F);
if (DynLdsGlobal || hasLDSKernelArgument(F))
UsesDynamicLDS = true;
}

unsigned AMDGPUMachineFunction::allocateLDSGlobal(const DataLayout &DL,
Expand Down Expand Up @@ -139,15 +163,6 @@ unsigned AMDGPUMachineFunction::allocateLDSGlobal(const DataLayout &DL,
return Offset;
}

/// Looks up the global named "llvm.amdgcn.<name of F>.dynlds" in the module
/// that owns \p F and returns the result of Module::getNamedGlobal for it.
static const GlobalVariable *
getKernelDynLDSGlobalFromFunction(const Function &F) {
  // Build the per-function symbol name: prefix, function name, suffix.
  std::string DynLDSName("llvm.amdgcn.");
  DynLDSName += F.getName();
  DynLDSName += ".dynlds";
  const Module *ParentModule = F.getParent();
  return ParentModule->getNamedGlobal(DynLDSName);
}

std::optional<uint32_t>
AMDGPUMachineFunction::getLDSKernelIdMetadata(const Function &F) {
// TODO: Would be more consistent with the abs symbols to use a range
Expand Down Expand Up @@ -210,3 +225,9 @@ void AMDGPUMachineFunction::setDynLDSAlign(const Function &F,
}
}
}

void AMDGPUMachineFunction::setUsesDynamicLDS(bool DynLDS) {
UsesDynamicLDS = DynLDS;
}

bool AMDGPUMachineFunction::isDynamicLDSUsed() const { return UsesDynamicLDS; }
7 changes: 7 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,9 @@ class AMDGPUMachineFunction : public MachineFunctionInfo {
/// stages.
Align DynLDSAlign;

// Flag to check dynamic LDS usage by kernel.
bool UsesDynamicLDS = false;

// Kernels + shaders. i.e. functions called by the hardware and not called
// by other functions.
bool IsEntryFunction = false;
Expand Down Expand Up @@ -119,6 +122,10 @@ class AMDGPUMachineFunction : public MachineFunctionInfo {
Align getDynLDSAlign() const { return DynLDSAlign; }

void setDynLDSAlign(const Function &F, const GlobalVariable &GV);

void setUsesDynamicLDS(bool DynLDS);

bool isDynamicLDSUsed() const;
};

}
Expand Down
1 change: 1 addition & 0 deletions llvm/lib/Target/AMDGPU/SIISelLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -6890,6 +6890,7 @@ SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI,
// Adjust alignment for that dynamic shared memory array.
Function &F = DAG.getMachineFunction().getFunction();
MFI->setDynLDSAlign(F, *cast<GlobalVariable>(GV));
MFI->setUsesDynamicLDS(true);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This won't work if the dynamic LDS reference only appears in a called function and not in the parent kernel. Is that possible with the current module LDS lowering?

You should also set this if a kernel argument is an LDS pointer.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks for the suggestions, @arsenm. I have updated the patch to identify the cases below:

  • Kernel uses dynamic global -> UsesDynamicLDS flag will be set in SITargetLowering::LowerGlobalAddress.
  • Dynamic LDS globals that are used in a called function and not referenced in the parent kernel are replaced with "llvm.amdgcn.kernel-name.dynlds" after the module-lds-lowering pass. So we can use the existing helper function "getKernelDynLDSGlobalFromFunction" from AMDGPUMachineFunction to identify whether the kernel uses dynamic LDS.
  • LDS passed as pointer argument to kernel -> Will be identified using utility hasLDSKernelArgument.

return SDValue(
DAG.getMachineNode(AMDGPU::GET_GROUPSTATICSIZE, DL, PtrVT), 0);
}
Expand Down
124 changes: 124 additions & 0 deletions llvm/test/CodeGen/AMDGPU/hsa-metadata-dynlds-func-hidden-args-v5.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,124 @@
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -filetype=obj -o - < %s | llvm-readelf --notes - | FileCheck --check-prefix=CHECK %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -filetype=obj -o - < %s | llvm-readelf --notes - | FileCheck --check-prefix=CHECK %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -filetype=obj -o - < %s | llvm-readelf --notes - | FileCheck --check-prefix=CHECK %s

; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 < %s | FileCheck --check-prefix=CHECK %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 < %s | FileCheck --check-prefix=CHECK %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck --check-prefix=CHECK %s


; CHECK: amdhsa.kernels:
; CHECK-NEXT: - .args:
; CHECK-NEXT: - .address_space: global
; CHECK-NEXT: .name: r
; CHECK-NEXT: .offset: 0
; CHECK-NEXT: .size: 8
; CHECK-NEXT: .value_kind: global_buffer
; CHECK-NEXT: - .address_space: global
; CHECK-NEXT: .name: a
; CHECK-NEXT: .offset: 8
; CHECK-NEXT: .size: 8
; CHECK-NEXT: .value_kind: global_buffer
; CHECK-NEXT: - .address_space: global
; CHECK-NEXT: .name: b
; CHECK-NEXT: .offset: 16
; CHECK-NEXT: .size: 8
; CHECK-NEXT: .value_kind: global_buffer
; CHECK-NEXT: - .offset: 24
; CHECK-NEXT: .size: 4
; CHECK-NEXT: .value_kind: hidden_block_count_x
; CHECK-NEXT: - .offset: 28
; CHECK-NEXT: .size: 4
; CHECK-NEXT: .value_kind: hidden_block_count_y
; CHECK-NEXT: - .offset: 32
; CHECK-NEXT: .size: 4
; CHECK-NEXT: .value_kind: hidden_block_count_z
; CHECK-NEXT: - .offset: 36
; CHECK-NEXT: .size: 2
; CHECK-NEXT: .value_kind: hidden_group_size_x
; CHECK-NEXT: - .offset: 38
; CHECK-NEXT: .size: 2
; CHECK-NEXT: .value_kind: hidden_group_size_y
; CHECK-NEXT: - .offset: 40
; CHECK-NEXT: .size: 2
; CHECK-NEXT: .value_kind: hidden_group_size_z
; CHECK-NEXT: - .offset: 42
; CHECK-NEXT: .size: 2
; CHECK-NEXT: .value_kind: hidden_remainder_x
; CHECK-NEXT: - .offset: 44
; CHECK-NEXT: .size: 2
; CHECK-NEXT: .value_kind: hidden_remainder_y
; CHECK-NEXT: - .offset: 46
; CHECK-NEXT: .size: 2
; CHECK-NEXT: .value_kind: hidden_remainder_z
; CHECK-NEXT: - .offset: 64
; CHECK-NEXT: .size: 8
; CHECK-NEXT: .value_kind: hidden_global_offset_x
; CHECK-NEXT: - .offset: 72
; CHECK-NEXT: .size: 8
; CHECK-NEXT: .value_kind: hidden_global_offset_y
; CHECK-NEXT: - .offset: 80
; CHECK-NEXT: .size: 8
; CHECK-NEXT: .value_kind: hidden_global_offset_z
; CHECK-NEXT: - .offset: 88
; CHECK-NEXT: .size: 2
; CHECK-NEXT: .value_kind: hidden_grid_dims
; CHECK-NEXT: - .offset: 96
; CHECK-NEXT: .size: 8
; CHECK-NEXT: .value_kind: hidden_printf_buffer
; CHECK-NEXT: - .offset: 104
; CHECK-NEXT: .size: 8
; CHECK-NEXT: .value_kind: hidden_hostcall_buffer
; CHECK-NEXT: - .offset: 112
; CHECK-NEXT: .size: 8
; CHECK-NEXT: .value_kind: hidden_multigrid_sync_arg
; CHECK-NEXT: - .offset: 120
; CHECK-NEXT: .size: 8
; CHECK-NEXT: .value_kind: hidden_heap_v1
; CHECK-NEXT: - .offset: 128
; CHECK-NEXT: .size: 8
; CHECK-NEXT: .value_kind: hidden_default_queue
; CHECK-NEXT: - .offset: 136
; CHECK-NEXT: .size: 8
; CHECK-NEXT: .value_kind: hidden_completion_action
; CHECK: - .offset: 144
; CHECK-NEXT: .size: 4
; CHECK-NEXT: .value_kind: hidden_dynamic_lds_size
; CHECK: - .offset: 224
; CHECK-NEXT: .size: 8
; CHECK-NEXT: .value_kind: hidden_queue_ptr

; CHECK: .name: test_v5
; CHECK: .symbol: test_v5.kd

; CHECK: amdhsa.version:
; CHECK-NEXT: - 1
; CHECK-NEXT: - 2
@lds = external hidden addrspace(3) global [0 x i32], align 4

; Non-kernel function that stores the constant 1234 to @lds through an
; addrspacecast of its addrspace(3) address to a plain `ptr`. It is called
; from the @test_v5 kernel below.
define void @funcs_dyn_lds() {
store i32 1234, ptr addrspacecast (ptr addrspace(3) @lds to ptr), align 4
ret void
}

; Kernel under test: adds the half values loaded from %a and %b, stores the
; sum to %r, and then calls @funcs_dyn_lds. The kernel body itself does not
; mention @lds; the only reference to it is inside the callee.
define amdgpu_kernel void @test_v5(
ptr addrspace(1) %r,
ptr addrspace(1) %a,
ptr addrspace(1) %b) #0 {
entry:
%a.val = load half, ptr addrspace(1) %a
%b.val = load half, ptr addrspace(1) %b
%r.val = fadd half %a.val, %b.val
store half %r.val, ptr addrspace(1) %r
call void @funcs_dyn_lds()
ret void
}

!llvm.module.flags = !{!0}
!0 = !{i32 1, !"amdgpu_code_object_version", i32 500}
!llvm.printf.fmts = !{!1, !2}
!1 = !{!"1:1:4:%d\5Cn"}
!2 = !{!"2:1:8:%g\5Cn"}

attributes #0 = { optnone noinline }

124 changes: 124 additions & 0 deletions llvm/test/CodeGen/AMDGPU/hsa-metadata-dynlds-funcarg-hidden-args-v5.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,124 @@
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -filetype=obj -o - < %s | llvm-readelf --notes - | FileCheck --check-prefix=CHECK %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -filetype=obj -o - < %s | llvm-readelf --notes - | FileCheck --check-prefix=CHECK %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -filetype=obj -o - < %s | llvm-readelf --notes - | FileCheck --check-prefix=CHECK %s

; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 < %s | FileCheck --check-prefix=CHECK %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 < %s | FileCheck --check-prefix=CHECK %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck --check-prefix=CHECK %s


; CHECK: amdhsa.kernels:
; CHECK-NEXT: - .args:
; CHECK-NEXT: - .address_space: global
; CHECK-NEXT: .name: r
; CHECK-NEXT: .offset: 0
; CHECK-NEXT: .size: 8
; CHECK-NEXT: .value_kind: global_buffer
; CHECK-NEXT: - .address_space: global
; CHECK-NEXT: .name: a
; CHECK-NEXT: .offset: 8
; CHECK-NEXT: .size: 8
; CHECK-NEXT: .value_kind: global_buffer
; CHECK-NEXT: - .address_space: global
; CHECK-NEXT: .name: b
; CHECK-NEXT: .offset: 16
; CHECK-NEXT: .size: 8
; CHECK-NEXT: .value_kind: global_buffer
; CHECK-NEXT: - .offset: 24
; CHECK-NEXT: .size: 4
; CHECK-NEXT: .value_kind: hidden_block_count_x
; CHECK-NEXT: - .offset: 28
; CHECK-NEXT: .size: 4
; CHECK-NEXT: .value_kind: hidden_block_count_y
; CHECK-NEXT: - .offset: 32
; CHECK-NEXT: .size: 4
; CHECK-NEXT: .value_kind: hidden_block_count_z
; CHECK-NEXT: - .offset: 36
; CHECK-NEXT: .size: 2
; CHECK-NEXT: .value_kind: hidden_group_size_x
; CHECK-NEXT: - .offset: 38
; CHECK-NEXT: .size: 2
; CHECK-NEXT: .value_kind: hidden_group_size_y
; CHECK-NEXT: - .offset: 40
; CHECK-NEXT: .size: 2
; CHECK-NEXT: .value_kind: hidden_group_size_z
; CHECK-NEXT: - .offset: 42
; CHECK-NEXT: .size: 2
; CHECK-NEXT: .value_kind: hidden_remainder_x
; CHECK-NEXT: - .offset: 44
; CHECK-NEXT: .size: 2
; CHECK-NEXT: .value_kind: hidden_remainder_y
; CHECK-NEXT: - .offset: 46
; CHECK-NEXT: .size: 2
; CHECK-NEXT: .value_kind: hidden_remainder_z
; CHECK-NEXT: - .offset: 64
; CHECK-NEXT: .size: 8
; CHECK-NEXT: .value_kind: hidden_global_offset_x
; CHECK-NEXT: - .offset: 72
; CHECK-NEXT: .size: 8
; CHECK-NEXT: .value_kind: hidden_global_offset_y
; CHECK-NEXT: - .offset: 80
; CHECK-NEXT: .size: 8
; CHECK-NEXT: .value_kind: hidden_global_offset_z
; CHECK-NEXT: - .offset: 88
; CHECK-NEXT: .size: 2
; CHECK-NEXT: .value_kind: hidden_grid_dims
; CHECK-NEXT: - .offset: 96
; CHECK-NEXT: .size: 8
; CHECK-NEXT: .value_kind: hidden_printf_buffer
; CHECK-NEXT: - .offset: 104
; CHECK-NEXT: .size: 8
; CHECK-NEXT: .value_kind: hidden_hostcall_buffer
; CHECK-NEXT: - .offset: 112
; CHECK-NEXT: .size: 8
; CHECK-NEXT: .value_kind: hidden_multigrid_sync_arg
; CHECK-NEXT: - .offset: 120
; CHECK-NEXT: .size: 8
; CHECK-NEXT: .value_kind: hidden_heap_v1
; CHECK-NEXT: - .offset: 128
; CHECK-NEXT: .size: 8
; CHECK-NEXT: .value_kind: hidden_default_queue
; CHECK-NEXT: - .offset: 136
; CHECK-NEXT: .size: 8
; CHECK-NEXT: .value_kind: hidden_completion_action
; CHECK: - .offset: 144
; CHECK-NEXT: .size: 4
; CHECK-NEXT: .value_kind: hidden_dynamic_lds_size
; CHECK: - .offset: 224
; CHECK-NEXT: .size: 8
; CHECK-NEXT: .value_kind: hidden_queue_ptr

; CHECK: .name: test_v5
; CHECK: .symbol: test_v5.kd

; CHECK: amdhsa.version:
; CHECK-NEXT: - 1
; CHECK-NEXT: - 2
@lds = external hidden addrspace(3) global [0 x i32], align 4

; Non-kernel function that stores the constant 1234 through the addrspace(3)
; pointer it receives as %lds_ptr.
define void @funcs_dyn_lds(ptr addrspace(3) %lds_ptr) {
store i32 1234, ptr addrspace(3) %lds_ptr, align 4
ret void
}

; Kernel under test: adds the half values loaded from %a and %b, stores the
; sum to %r, and then passes the address of @lds as the argument of
; @funcs_dyn_lds. The kernel's own parameters are all addrspace(1) pointers.
define amdgpu_kernel void @test_v5(
ptr addrspace(1) %r,
ptr addrspace(1) %a,
ptr addrspace(1) %b) #0 {
entry:
%a.val = load half, ptr addrspace(1) %a
%b.val = load half, ptr addrspace(1) %b
%r.val = fadd half %a.val, %b.val
store half %r.val, ptr addrspace(1) %r
call void @funcs_dyn_lds(ptr addrspace(3) @lds)
ret void
}

!llvm.module.flags = !{!0}
!0 = !{i32 1, !"amdgpu_code_object_version", i32 500}
!llvm.printf.fmts = !{!1, !2}
!1 = !{!"1:1:4:%d\5Cn"}
!2 = !{!"2:1:8:%g\5Cn"}

attributes #0 = { optnone noinline }

Loading