Skip to content

Commit da53e5e

Browse files
committed
Fix clang build, document the operation, add error-handling tests
1 parent bcb72e3 commit da53e5e

File tree

6 files changed

+94
-8
lines changed

6 files changed

+94
-8
lines changed

clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -565,6 +565,7 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
565565
return Builder.CreateCall(F, {Addr});
566566
}
567567
case AMDGPU::BI__builtin_amdgcn_load_to_lds: {
568+
// Should this have asan instrumentation?
568569
return emitBuiltinWithOneOverloadedType<5>(*this, E,
569570
Intrinsic::amdgcn_load_to_lds);
570571
}

clang/test/CodeGenOpenCL/builtins-amdgcn-gfx950.cl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1778,7 +1778,7 @@ void test_cvt_sr_f16_f32(global half2 *out, float src, uint seed)
17781778
// CHECK-NEXT: ret void
17791779
//
17801780
void test_load_to_lds_96(global void* src, local void *dst) {
1781-
__builtin_amdgcn_global_load_lds(src, dst, /*size=*/12, /*offset=*/0, /*aux=*/0);
1781+
__builtin_amdgcn_load_to_lds(src, dst, /*size=*/12, /*offset=*/0, /*aux=*/0);
17821782
}
17831783

17841784
// CHECK-LABEL: @test_load_to_lds_128(
Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
// RUN: %clang_cc1 -cl-std=CL2.0 -O0 -triple amdgcn-unknown-unknown -target-cpu gfx900 -S -verify=gfx,expected -o - %s
2+
// RUN: %clang_cc1 -cl-std=CL2.0 -O0 -triple amdgcn-unknown-unknown -target-cpu gfx942 -S -verify=gfx,expected -o - %s
3+
// RUN: %clang_cc1 -cl-std=CL2.0 -O0 -triple amdgcn-unknown-unknown -target-cpu gfx1010 -S -verify=gfx,expected -o - %s
4+
// RUN: %clang_cc1 -cl-std=CL2.0 -O0 -triple amdgcn-unknown-unknown -target-cpu gfx950 -S -verify=gfx950,expected -o - %s
5+
// REQUIRES: amdgpu-registered-target
6+
7+
typedef unsigned int u32;
8+
9+
void test_load_to_lds_unsupported_size(global u32* src, local u32 *dst, u32 size, u32 offset, u32 aux) {
10+
__builtin_amdgcn_load_to_lds(src, dst, size, /*offset=*/0, /*aux=*/0); // expected-error{{argument to '__builtin_amdgcn_load_to_lds' must be a constant integer}}
11+
__builtin_amdgcn_load_to_lds(src, dst, /*size=*/4, offset, /*aux=*/0); // expected-error{{argument to '__builtin_amdgcn_load_to_lds' must be a constant integer}}
12+
__builtin_amdgcn_load_to_lds(src, dst, /*size=*/4, /*offset=*/0, aux); // expected-error{{argument to '__builtin_amdgcn_load_to_lds' must be a constant integer}}
13+
__builtin_amdgcn_load_to_lds(src, dst, /*size=*/5, /*offset=*/0, /*aux=*/0); // expected-error{{invalid size value}} gfx-note {{size must be 1, 2, or 4}} gfx950-note {{size must be 1, 2, 4, 12 or 16}}
14+
__builtin_amdgcn_load_to_lds(src, dst, /*size=*/0, /*offset=*/0, /*aux=*/0); // expected-error{{invalid size value}} gfx-note {{size must be 1, 2, or 4}} gfx950-note {{size must be 1, 2, 4, 12 or 16}}
15+
__builtin_amdgcn_load_to_lds(src, dst, /*size=*/3, /*offset=*/0, /*aux=*/0); // expected-error{{invalid size value}} gfx-note {{size must be 1, 2, or 4}} gfx950-note {{size must be 1, 2, 4, 12 or 16}}
16+
__builtin_amdgcn_load_to_lds(src, dst, /*size=*/12, /*offset=*/0, /*aux=*/0); // gfx-error{{invalid size value}} gfx-note {{size must be 1, 2, or 4}}
17+
__builtin_amdgcn_load_to_lds(src, dst, /*size=*/16, /*offset=*/0, /*aux=*/0); // gfx-error{{invalid size value}} gfx-note {{size must be 1, 2, or 4}}
18+
__builtin_amdgcn_load_to_lds(src, dst, /*size=*/-1, /*offset=*/0, /*aux=*/0); // expected-error{{invalid size value}} gfx-note {{size must be 1, 2, or 4}} gfx950-note {{size must be 1, 2, 4, 12 or 16}}
19+
}
20+
21+
__attribute__((target("gfx950-insts")))
22+
void test_load_to_lds_via_target_feature(global u32* src, local u32 *dst, u32 size, u32 offset, u32 aux) {
23+
__builtin_amdgcn_load_to_lds(src, dst, /*size=*/12, /*offset=*/0, /*aux=*/0);
24+
__builtin_amdgcn_load_to_lds(src, dst, /*size=*/16, /*offset=*/0, /*aux=*/0);
25+
}

llvm/docs/AMDGPUUsage.rst

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1216,7 +1216,15 @@ The AMDGPU backend implements the following LLVM IR intrinsics.
12161216
The format is a 64-bit concatenation of the MODE and TRAPSTS registers.
12171217

12181218
:ref:`llvm.set.fpenv<int_set_fpenv>` Sets the floating point environment to the specified state.
1219-
1219+
llvm.amdgcn.load.to.lds.p<1/7> Loads values from global memory (either in the form of a global pointer or
1220+
a raw fat buffer pointer) to LDS. The size of the data copied can be 1, 2,
1221+
or 4 bytes (and gfx950 also allows 12 or 16 bytes). The LDS pointer
1222+
argument should be wavefront-uniform; the global pointer need not be.
1223+
The LDS pointer is implicitly offset by 4 * lane_id bytes for sizes <= 4 bytes
1224+
and 16 * lane_id bytes for larger sizes. This lowers to `global_load_lds`,
1225+
`buffer_load_* ... lds`, or `global_load__* ... lds` depending on address
1226+
space and architecture. `amdgcn.global.load.lds` has the same semantics as
1227+
`amdgcn.load.to.lds.p1`.
12201228
llvm.amdgcn.readfirstlane Provides direct access to v_readfirstlane_b32. Returns the value in
12211229
the lowest active lane of the input operand. Currently implemented
12221230
for i16, i32, float, half, bfloat, <2 x i16>, <2 x half>, <2 x bfloat>,

llvm/include/llvm/IR/IntrinsicsAMDGPU.td

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2647,7 +2647,6 @@ def int_amdgcn_perm :
26472647
/// the buffer-resource-wrapper pointers (address space 7 and 9).
26482648
/// TODO: add support for address space 5 and scratch_load_lds.
26492649
class AMDGPULoadToLDS :
2650-
ClangBuiltin<"__builtin_amdgcn_load_to_lds">,
26512650
Intrinsic <
26522651
[],
26532652
[llvm_anyptr_ty, // Base pointer to load from. Varies per lane.

llvm/test/CodeGen/AMDGPU/llvm.amdgcn.load.to.lds.gfx950.ll

Lines changed: 58 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
2-
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx950 < %s | FileCheck -check-prefixes=GFX950,GFX950-SDAG %s
3-
; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx950 < %s | FileCheck -check-prefixes=GFX950,GFX950-GISEL %s
2+
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx950 < %s | FileCheck -check-prefix=GFX950-SDAG %s
3+
; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx950 < %s | FileCheck -check-prefix=GFX950-GISEL %s
44

55
; RUN: not --crash llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx942 -filetype=null < %s 2>&1 | FileCheck -check-prefix=ERR-SDAG %s
66
; RUN: not --crash llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx942 -filetype=null < %s 2>&1 | FileCheck -check-prefix=ERR-GISEL %s
@@ -26,6 +26,19 @@ define amdgpu_ps void @global_load_lds_dwordx3_vaddr_saddr(ptr addrspace(1) noca
2626
; GFX950-NEXT: s_nop 0
2727
; GFX950-NEXT: global_load_lds_dwordx3 v[0:1], off offset:16 sc0
2828
; GFX950-NEXT: s_endpgm
29+
; GFX950-SDAG-LABEL: global_load_lds_dwordx3_vaddr_saddr:
30+
; GFX950-SDAG: ; %bb.0:
31+
; GFX950-SDAG-NEXT: s_mov_b32 m0, s0
32+
; GFX950-SDAG-NEXT: s_nop 0
33+
; GFX950-SDAG-NEXT: global_load_lds_dwordx3 v[0:1], off offset:16 sc0
34+
; GFX950-SDAG-NEXT: s_endpgm
35+
;
36+
; GFX950-GISEL-LABEL: global_load_lds_dwordx3_vaddr_saddr:
37+
; GFX950-GISEL: ; %bb.0:
38+
; GFX950-GISEL-NEXT: s_mov_b32 m0, s0
39+
; GFX950-GISEL-NEXT: s_nop 0
40+
; GFX950-GISEL-NEXT: global_load_lds_dwordx3 v[0:1], off offset:16 sc0
41+
; GFX950-GISEL-NEXT: s_endpgm
2942
call void @llvm.amdgcn.load.to.lds.p1(ptr addrspace(1) %gptr, ptr addrspace(3) %lptr, i32 12, i32 16, i32 1)
3043
ret void
3144
}
@@ -38,6 +51,21 @@ define amdgpu_ps void @buffer_load_lds_dwordx3_vaddr_saddr(ptr addrspace(7) noca
3851
; GFX950-NEXT: s_nop 0
3952
; GFX950-NEXT: buffer_load_dwordx3 v0, s[0:3], 0 offen offset:16 sc0 lds
4053
; GFX950-NEXT: s_endpgm
54+
; GFX950-SDAG-LABEL: buffer_load_lds_dwordx3_vaddr_saddr:
55+
; GFX950-SDAG: ; %bb.0:
56+
; GFX950-SDAG-NEXT: v_add_u32_e32 v0, s4, v0
57+
; GFX950-SDAG-NEXT: s_mov_b32 m0, s5
58+
; GFX950-SDAG-NEXT: s_nop 0
59+
; GFX950-SDAG-NEXT: buffer_load_dwordx3 v0, s[0:3], 0 offen offset:16 sc0 lds
60+
; GFX950-SDAG-NEXT: s_endpgm
61+
;
62+
; GFX950-GISEL-LABEL: buffer_load_lds_dwordx3_vaddr_saddr:
63+
; GFX950-GISEL: ; %bb.0:
64+
; GFX950-GISEL-NEXT: v_add_u32_e32 v0, s4, v0
65+
; GFX950-GISEL-NEXT: s_mov_b32 m0, s5
66+
; GFX950-GISEL-NEXT: s_nop 0
67+
; GFX950-GISEL-NEXT: buffer_load_dwordx3 v0, s[0:3], 0 offen offset:16 sc0 lds
68+
; GFX950-GISEL-NEXT: s_endpgm
4169
%gptr.off = getelementptr i8, ptr addrspace(7) %gptr, i32 %off
4270
call void @llvm.amdgcn.load.to.lds.p7(ptr addrspace(7) %gptr.off, ptr addrspace(3) %lptr, i32 12, i32 16, i32 1)
4371
ret void
@@ -54,6 +82,19 @@ define amdgpu_ps void @global_load_lds_dwordx4_vaddr_saddr(ptr addrspace(1) noca
5482
; GFX950-NEXT: s_nop 0
5583
; GFX950-NEXT: global_load_lds_dwordx4 v[0:1], off offset:16 sc0
5684
; GFX950-NEXT: s_endpgm
85+
; GFX950-SDAG-LABEL: global_load_lds_dwordx4_vaddr_saddr:
86+
; GFX950-SDAG: ; %bb.0:
87+
; GFX950-SDAG-NEXT: s_mov_b32 m0, s0
88+
; GFX950-SDAG-NEXT: s_nop 0
89+
; GFX950-SDAG-NEXT: global_load_lds_dwordx4 v[0:1], off offset:16 sc0
90+
; GFX950-SDAG-NEXT: s_endpgm
91+
;
92+
; GFX950-GISEL-LABEL: global_load_lds_dwordx4_vaddr_saddr:
93+
; GFX950-GISEL: ; %bb.0:
94+
; GFX950-GISEL-NEXT: s_mov_b32 m0, s0
95+
; GFX950-GISEL-NEXT: s_nop 0
96+
; GFX950-GISEL-NEXT: global_load_lds_dwordx4 v[0:1], off offset:16 sc0
97+
; GFX950-GISEL-NEXT: s_endpgm
5798
call void @llvm.amdgcn.global.load.lds(ptr addrspace(1) %gptr, ptr addrspace(3) %lptr, i32 16, i32 16, i32 1)
5899
ret void
59100
}
@@ -66,10 +107,22 @@ define amdgpu_ps void @buffer_load_lds_dwordx4_vaddr_saddr(ptr addrspace(7) noca
66107
; GFX950-NEXT: s_nop 0
67108
; GFX950-NEXT: buffer_load_dwordx4 v0, s[0:3], 0 offen offset:16 sc0 lds
68109
; GFX950-NEXT: s_endpgm
110+
; GFX950-SDAG-LABEL: buffer_load_lds_dwordx4_vaddr_saddr:
111+
; GFX950-SDAG: ; %bb.0:
112+
; GFX950-SDAG-NEXT: v_add_u32_e32 v0, s4, v0
113+
; GFX950-SDAG-NEXT: s_mov_b32 m0, s5
114+
; GFX950-SDAG-NEXT: s_nop 0
115+
; GFX950-SDAG-NEXT: buffer_load_dwordx4 v0, s[0:3], 0 offen offset:16 sc0 lds
116+
; GFX950-SDAG-NEXT: s_endpgm
117+
;
118+
; GFX950-GISEL-LABEL: buffer_load_lds_dwordx4_vaddr_saddr:
119+
; GFX950-GISEL: ; %bb.0:
120+
; GFX950-GISEL-NEXT: v_add_u32_e32 v0, s4, v0
121+
; GFX950-GISEL-NEXT: s_mov_b32 m0, s5
122+
; GFX950-GISEL-NEXT: s_nop 0
123+
; GFX950-GISEL-NEXT: buffer_load_dwordx4 v0, s[0:3], 0 offen offset:16 sc0 lds
124+
; GFX950-GISEL-NEXT: s_endpgm
69125
%gptr.off = getelementptr i8, ptr addrspace(7) %gptr, i32 %off
70126
call void @llvm.amdgcn.load.to.lds.p7(ptr addrspace(7) %gptr.off, ptr addrspace(3) %lptr, i32 16, i32 16, i32 1)
71127
ret void
72128
}
73-
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
74-
; GFX950-GISEL: {{.*}}
75-
; GFX950-SDAG: {{.*}}

0 commit comments

Comments
 (0)