-
Notifications
You must be signed in to change notification settings - Fork 14.3k
[AMDGPU] Add a new amdgcn.load.to.lds intrinsic #137425
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
bcb72e3
da53e5e
5afd162
62ef0e4
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change | ||
---|---|---|---|---|
|
@@ -564,6 +564,11 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID, | |||
llvm::Function *F = CGM.getIntrinsic(IID, {LoadTy}); | ||||
return Builder.CreateCall(F, {Addr}); | ||||
} | ||||
case AMDGPU::BI__builtin_amdgcn_load_to_lds: { | ||||
// Should this have asan instrumentation? | ||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Repeating comment from the PR that duplicated this one, yes but that's a backend problem? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It might be a backend problem However, llvm-project/clang/lib/CodeGen/CGBuiltin.cpp Line 4142 in a9ce60e
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. (Bumping this comment) |
||||
return emitBuiltinWithOneOverloadedType<5>(*this, E, | ||||
Intrinsic::amdgcn_load_to_lds); | ||||
} | ||||
case AMDGPU::BI__builtin_amdgcn_get_fpenv: { | ||||
Function *F = CGM.getIntrinsic(Intrinsic::get_fpenv, | ||||
{llvm::Type::getInt64Ty(getLLVMContext())}); | ||||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,67 @@ | ||
// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 | ||
// REQUIRES: amdgpu-registered-target | ||
// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -target-cpu gfx950 -emit-llvm -fcuda-is-device -o - %s | FileCheck %s | ||
|
||
// COM: Most tests are in the OpenCL semastics, this is just a verification for HIP | ||
|
||
#define __device__ __attribute__((device)) | ||
#define __shared__ __attribute__((shared)) | ||
|
||
typedef unsigned int u32; | ||
|
||
// CHECK-LABEL: define dso_local void @_Z20test_load_to_lds_u32PjS_( | ||
// CHECK-SAME: ptr noundef [[SRC:%.*]], ptr noundef [[DST:%.*]]) #[[ATTR0:[0-9]+]] { | ||
// CHECK-NEXT: [[ENTRY:.*:]] | ||
// CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) | ||
// CHECK-NEXT: [[DST_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) | ||
// CHECK-NEXT: [[SRC_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRC_ADDR]] to ptr | ||
// CHECK-NEXT: [[DST_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DST_ADDR]] to ptr | ||
// CHECK-NEXT: store ptr [[SRC]], ptr [[SRC_ADDR_ASCAST]], align 8 | ||
// CHECK-NEXT: store ptr [[DST]], ptr [[DST_ADDR_ASCAST]], align 8 | ||
// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[SRC_ADDR_ASCAST]], align 8 | ||
// CHECK-NEXT: [[TMP1:%.*]] = load ptr, ptr [[DST_ADDR_ASCAST]], align 8 | ||
// CHECK-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[TMP1]] to ptr addrspace(3) | ||
// CHECK-NEXT: call void @llvm.amdgcn.load.to.lds.p0(ptr [[TMP0]], ptr addrspace(3) [[TMP2]], i32 4, i32 0, i32 0) | ||
// CHECK-NEXT: ret void | ||
// | ||
__device__ void test_load_to_lds_u32(u32* src, __shared__ u32 *dst) { | ||
__builtin_amdgcn_load_to_lds(src, dst, /*size=*/4, /*offset=*/0, /*aux=*/0); | ||
} | ||
|
||
// CHECK-LABEL: define dso_local void @_Z30test_load_to_lds_u32_flat_destPjS_( | ||
// CHECK-SAME: ptr noundef [[SRC:%.*]], ptr noundef [[DST:%.*]]) #[[ATTR0]] { | ||
// CHECK-NEXT: [[ENTRY:.*:]] | ||
// CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) | ||
// CHECK-NEXT: [[DST_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) | ||
// CHECK-NEXT: [[SRC_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRC_ADDR]] to ptr | ||
// CHECK-NEXT: [[DST_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DST_ADDR]] to ptr | ||
// CHECK-NEXT: store ptr [[SRC]], ptr [[SRC_ADDR_ASCAST]], align 8 | ||
// CHECK-NEXT: store ptr [[DST]], ptr [[DST_ADDR_ASCAST]], align 8 | ||
// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[SRC_ADDR_ASCAST]], align 8 | ||
// CHECK-NEXT: [[TMP1:%.*]] = load ptr, ptr [[DST_ADDR_ASCAST]], align 8 | ||
// CHECK-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[TMP1]] to ptr addrspace(3) | ||
// CHECK-NEXT: call void @llvm.amdgcn.load.to.lds.p0(ptr [[TMP0]], ptr addrspace(3) [[TMP2]], i32 4, i32 0, i32 0) | ||
// CHECK-NEXT: ret void | ||
// | ||
__device__ void test_load_to_lds_u32_flat_dest(u32* src, u32 *dst) { | ||
__builtin_amdgcn_load_to_lds(src, dst, /*size=*/4, /*offset=*/0, /*aux=*/0); | ||
} | ||
|
||
// CHECK-LABEL: define dso_local void @_Z20test_load_to_lds_128PvS_( | ||
// CHECK-SAME: ptr noundef [[SRC:%.*]], ptr noundef [[DST:%.*]]) #[[ATTR0]] { | ||
// CHECK-NEXT: [[ENTRY:.*:]] | ||
// CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) | ||
// CHECK-NEXT: [[DST_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) | ||
// CHECK-NEXT: [[SRC_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRC_ADDR]] to ptr | ||
// CHECK-NEXT: [[DST_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DST_ADDR]] to ptr | ||
// CHECK-NEXT: store ptr [[SRC]], ptr [[SRC_ADDR_ASCAST]], align 8 | ||
// CHECK-NEXT: store ptr [[DST]], ptr [[DST_ADDR_ASCAST]], align 8 | ||
// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[SRC_ADDR_ASCAST]], align 8 | ||
// CHECK-NEXT: [[TMP1:%.*]] = load ptr, ptr [[DST_ADDR_ASCAST]], align 8 | ||
// CHECK-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[TMP1]] to ptr addrspace(3) | ||
// CHECK-NEXT: call void @llvm.amdgcn.load.to.lds.p0(ptr [[TMP0]], ptr addrspace(3) [[TMP2]], i32 16, i32 0, i32 0) | ||
// CHECK-NEXT: ret void | ||
// | ||
__device__ void test_load_to_lds_128(void* src, __shared__ void *dst) { | ||
__builtin_amdgcn_load_to_lds(src, dst, /*size=*/16, /*offset=*/0, /*aux=*/0); | ||
} | ||
krzysz00 marked this conversation as resolved.
Show resolved
Hide resolved
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,60 @@ | ||
// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py | ||
// RUN: %clang_cc1 -cl-std=CL2.0 -O0 -triple amdgcn-unknown-unknown -target-cpu gfx900 -emit-llvm -o - %s | FileCheck %s | ||
// RUN: %clang_cc1 -cl-std=CL2.0 -O0 -triple amdgcn-unknown-unknown -target-cpu gfx942 -emit-llvm -o - %s | FileCheck %s | ||
// RUN: %clang_cc1 -cl-std=CL2.0 -O0 -triple amdgcn-unknown-unknown -target-cpu gfx1010 -emit-llvm -o - %s | FileCheck %s | ||
// REQUIRES: amdgpu-registered-target | ||
|
||
krzysz00 marked this conversation as resolved.
Show resolved
Hide resolved
|
||
typedef unsigned int u32; | ||
typedef unsigned short u16; | ||
typedef unsigned char u8; | ||
|
||
// CHECK-LABEL: @test_load_to_lds_u32( | ||
// CHECK-NEXT: entry: | ||
// CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) | ||
// CHECK-NEXT: [[DST_ADDR:%.*]] = alloca ptr addrspace(3), align 4, addrspace(5) | ||
// CHECK-NEXT: [[SRC_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRC_ADDR]] to ptr | ||
// CHECK-NEXT: [[DST_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DST_ADDR]] to ptr | ||
// CHECK-NEXT: store ptr addrspace(1) [[SRC:%.*]], ptr [[SRC_ADDR_ASCAST]], align 8 | ||
// CHECK-NEXT: store ptr addrspace(3) [[DST:%.*]], ptr [[DST_ADDR_ASCAST]], align 4 | ||
// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[SRC_ADDR_ASCAST]], align 8 | ||
// CHECK-NEXT: [[TMP1:%.*]] = load ptr addrspace(3), ptr [[DST_ADDR_ASCAST]], align 4 | ||
// CHECK-NEXT: call void @llvm.amdgcn.load.to.lds.p1(ptr addrspace(1) [[TMP0]], ptr addrspace(3) [[TMP1]], i32 4, i32 0, i32 0) | ||
// CHECK-NEXT: ret void | ||
// | ||
void test_load_to_lds_u32(global u32* src, local u32 *dst) { | ||
__builtin_amdgcn_load_to_lds(src, dst, /*size=*/4, /*offset=*/0, /*aux=*/0); | ||
} | ||
|
||
// CHECK-LABEL: @test_load_to_lds_u16( | ||
// CHECK-NEXT: entry: | ||
// CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) | ||
// CHECK-NEXT: [[DST_ADDR:%.*]] = alloca ptr addrspace(3), align 4, addrspace(5) | ||
// CHECK-NEXT: [[SRC_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRC_ADDR]] to ptr | ||
// CHECK-NEXT: [[DST_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DST_ADDR]] to ptr | ||
// CHECK-NEXT: store ptr addrspace(1) [[SRC:%.*]], ptr [[SRC_ADDR_ASCAST]], align 8 | ||
// CHECK-NEXT: store ptr addrspace(3) [[DST:%.*]], ptr [[DST_ADDR_ASCAST]], align 4 | ||
// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[SRC_ADDR_ASCAST]], align 8 | ||
// CHECK-NEXT: [[TMP1:%.*]] = load ptr addrspace(3), ptr [[DST_ADDR_ASCAST]], align 4 | ||
// CHECK-NEXT: call void @llvm.amdgcn.load.to.lds.p1(ptr addrspace(1) [[TMP0]], ptr addrspace(3) [[TMP1]], i32 2, i32 0, i32 0) | ||
// CHECK-NEXT: ret void | ||
// | ||
void test_load_to_lds_u16(global u16* src, local u16 *dst) { | ||
__builtin_amdgcn_load_to_lds(src, dst, /*size=*/2, /*offset=*/0, /*aux=*/0); | ||
} | ||
|
||
// CHECK-LABEL: @test_load_to_lds_u8( | ||
// CHECK-NEXT: entry: | ||
// CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) | ||
// CHECK-NEXT: [[DST_ADDR:%.*]] = alloca ptr addrspace(3), align 4, addrspace(5) | ||
// CHECK-NEXT: [[SRC_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRC_ADDR]] to ptr | ||
// CHECK-NEXT: [[DST_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DST_ADDR]] to ptr | ||
// CHECK-NEXT: store ptr addrspace(1) [[SRC:%.*]], ptr [[SRC_ADDR_ASCAST]], align 8 | ||
// CHECK-NEXT: store ptr addrspace(3) [[DST:%.*]], ptr [[DST_ADDR_ASCAST]], align 4 | ||
// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[SRC_ADDR_ASCAST]], align 8 | ||
// CHECK-NEXT: [[TMP1:%.*]] = load ptr addrspace(3), ptr [[DST_ADDR_ASCAST]], align 4 | ||
// CHECK-NEXT: call void @llvm.amdgcn.load.to.lds.p1(ptr addrspace(1) [[TMP0]], ptr addrspace(3) [[TMP1]], i32 1, i32 0, i32 0) | ||
// CHECK-NEXT: ret void | ||
// | ||
void test_load_to_lds_u8(global u8* src, local u8 *dst) { | ||
__builtin_amdgcn_load_to_lds(src, dst, /*size=*/1, /*offset=*/0, /*aux=*/0); | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,25 @@ | ||
// RUN: %clang_cc1 -cl-std=CL2.0 -O0 -triple amdgcn-unknown-unknown -target-cpu gfx900 -S -verify=gfx,expected -o - %s | ||
// RUN: %clang_cc1 -cl-std=CL2.0 -O0 -triple amdgcn-unknown-unknown -target-cpu gfx942 -S -verify=gfx,expected -o - %s | ||
// RUN: %clang_cc1 -cl-std=CL2.0 -O0 -triple amdgcn-unknown-unknown -target-cpu gfx1010 -S -verify=gfx,expected -o - %s | ||
// RUN: %clang_cc1 -cl-std=CL2.0 -O0 -triple amdgcn-unknown-unknown -target-cpu gfx950 -S -verify=gfx950,expected -o - %s | ||
// REQUIRES: amdgpu-registered-target | ||
|
||
typedef unsigned int u32; | ||
|
||
void test_load_to_lds_unsupported_size(global u32* src, local u32 *dst, u32 size, u32 offset, u32 aux) { | ||
__builtin_amdgcn_load_to_lds(src, dst, size, /*offset=*/0, /*aux=*/0); // expected-error{{argument to '__builtin_amdgcn_load_to_lds' must be a constant integer}} | ||
__builtin_amdgcn_load_to_lds(src, dst, /*size=*/4, offset, /*aux=*/0); // expected-error{{argument to '__builtin_amdgcn_load_to_lds' must be a constant integer}} | ||
__builtin_amdgcn_load_to_lds(src, dst, /*size=*/4, /*offset=*/0, aux); // expected-error{{argument to '__builtin_amdgcn_load_to_lds' must be a constant integer}} | ||
__builtin_amdgcn_load_to_lds(src, dst, /*size=*/5, /*offset=*/0, /*aux=*/0); // expected-error{{invalid size value}} gfx-note {{size must be 1, 2, or 4}} gfx950-note {{size must be 1, 2, 4, 12 or 16}} | ||
__builtin_amdgcn_load_to_lds(src, dst, /*size=*/0, /*offset=*/0, /*aux=*/0); // expected-error{{invalid size value}} gfx-note {{size must be 1, 2, or 4}} gfx950-note {{size must be 1, 2, 4, 12 or 16}} | ||
__builtin_amdgcn_load_to_lds(src, dst, /*size=*/3, /*offset=*/0, /*aux=*/0); // expected-error{{invalid size value}} gfx-note {{size must be 1, 2, or 4}} gfx950-note {{size must be 1, 2, 4, 12 or 16}} | ||
__builtin_amdgcn_load_to_lds(src, dst, /*size=*/12, /*offset=*/0, /*aux=*/0); // gfx-error{{invalid size value}} gfx-note {{size must be 1, 2, or 4}} | ||
__builtin_amdgcn_load_to_lds(src, dst, /*size=*/16, /*offset=*/0, /*aux=*/0); // gfx-error{{invalid size value}} gfx-note {{size must be 1, 2, or 4}} | ||
__builtin_amdgcn_load_to_lds(src, dst, /*size=*/-1, /*offset=*/0, /*aux=*/0); // expected-error{{invalid size value}} gfx-note {{size must be 1, 2, or 4}} gfx950-note {{size must be 1, 2, 4, 12 or 16}} | ||
} | ||
|
||
__attribute__((target("gfx950-insts"))) | ||
void test_load_to_lds_via_target_feature(global u32* src, local u32 *dst, u32 size, u32 offset, u32 aux) { | ||
__builtin_amdgcn_load_to_lds(src, dst, /*size=*/12, /*offset=*/0, /*aux=*/0); | ||
__builtin_amdgcn_load_to_lds(src, dst, /*size=*/16, /*offset=*/0, /*aux=*/0); | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -2641,6 +2641,28 @@ def int_amdgcn_perm : | |
// GFX9 Intrinsics | ||
//===----------------------------------------------------------------------===// | ||
|
||
/// This is a general-purpose intrinsic for all operations that take a pointer | ||
/// a base location in LDS, and a data size and use it to perform a gather to LDS. | ||
/// This allows abstracting over both global pointers (address space 1) and | ||
/// the buffer-resource-wrapper pointers (address space 7 and 9). | ||
/// TODO: add support for address space 5 and scratch_load_lds. | ||
class AMDGPULoadToLDS : | ||
Intrinsic < | ||
[], | ||
[llvm_anyptr_ty, // Base pointer to load from. Varies per lane. | ||
LLVMQualPointerType<3>, // LDS base pointer to store to. Must be wave-uniform. | ||
llvm_i32_ty, // Data byte size: 1/2/4 (/12/16 for gfx950) | ||
llvm_i32_ty, // imm offset (applied to both input and LDS address) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. What's the point of the imm offset argument? It's not semantically useful, right? You could just add this offset to the two pointer arguments before calling the intrinsic. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'm sure the existing intrinsics expose it for a reason - probably because there isn't pattern-matching to strip such an offset There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. ... Oh. Consider the case that you have global p + N and LDS q + N. Then the LDS combiner can rewrite this to (q' + O) + N, aka q' + (O + N). Then the two pointers won't have the same offset anymore and so it's unclear if you can slide it onto the instruction immediate There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. ??? Matching addressing modes is part of the compiler's job. If you want absolute control over what goes in the immediate offset field you can write assembler! There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Lack of pattern matching isn't a reason to have the offset. There should be offset pattern matching regardless. I thought the argument for the current buffer intrinsic offset argument was something about the unreasonable bounds checking behaviors There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. So @shiltian for reasons I may not be aware of that that's there (The buffer intrinsic's There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The global one definitely shouldn't have the offset (given it's there, we should be trying to do addressing mode folding into it) There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. big shrug Is this something y'all want to try and fix here? (Also, procedural history on the buffer intrinsics points me to https://reviews.llvm.org/D124884 ( @rampitec ), which leaves it rather unclear why the immarg was added ... I think it's because for LDS, unlike the other buffer instructions, you can't do voffset => (actual voffset + imm)) There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Right, it was added in https://reviews.llvm.org/D125279. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Oh, thanks for finding the context! If we ever fix the immoffset issue, upgrading into making the immoffset a constant 0 and adding it to both pointers should be fine? But that'd require a sufficiently robust pattern match, which I'm not sure we're convinced of |
||
llvm_i32_ty], // auxiliary data (imm, cachepolicy (bit 0 = sc0, | ||
// bit 1 = sc1, | ||
// bit 4 = scc)) | ||
[IntrWillReturn, IntrArgMemOnly, | ||
NoCapture<ArgIndex<0>>, ReadOnly<ArgIndex<0>>, | ||
NoCapture<ArgIndex<1>>, WriteOnly<ArgIndex<1>>, | ||
ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>, IntrNoCallback, IntrNoFree], | ||
"", [SDNPMemOperand]>; | ||
def int_amdgcn_load_to_lds : AMDGPULoadToLDS; | ||
|
||
class AMDGPUGlobalLoadLDS : | ||
ClangBuiltin<"__builtin_amdgcn_global_load_lds">, | ||
Intrinsic < | ||
|
Uh oh!
There was an error while loading. Please reload this page.