Skip to content

Commit 3054d0d

Browse files
authored
AMDGPU: Rename and add bf16 support for global_load_tr builtins (#86202)
Make the name of each clang builtin as close to the instruction mnemonic as possible, because the data type suffix alone may not be enough to tell which instruction the builtin is going to produce. This patch also adds bf16 support for the global_load_tr_b128 builtins.
1 parent 2091c74 commit 3054d0d

7 files changed

+94
-79
lines changed

clang/include/clang/Basic/BuiltinsAMDGPU.def

Lines changed: 9 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -432,13 +432,15 @@ TARGET_BUILTIN(__builtin_amdgcn_s_wakeup_barrier, "vi", "n", "gfx12-insts")
432432
TARGET_BUILTIN(__builtin_amdgcn_s_barrier_leave, "b", "n", "gfx12-insts")
433433
TARGET_BUILTIN(__builtin_amdgcn_s_get_barrier_state, "Uii", "n", "gfx12-insts")
434434

435-
TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_v2i32, "V2iV2i*1", "nc", "gfx12-insts,wavefrontsize32")
436-
TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_v8i16, "V8sV8s*1", "nc", "gfx12-insts,wavefrontsize32")
437-
TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_v8f16, "V8hV8h*1", "nc", "gfx12-insts,wavefrontsize32")
438-
439-
TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_i32, "ii*1", "nc", "gfx12-insts,wavefrontsize64")
440-
TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_v4i16, "V4sV4s*1", "nc", "gfx12-insts,wavefrontsize64")
441-
TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_v4f16, "V4hV4h*1", "nc", "gfx12-insts,wavefrontsize64")
435+
TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_b64_v2i32, "V2iV2i*1", "nc", "gfx12-insts,wavefrontsize32")
436+
TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_b128_v8i16, "V8sV8s*1", "nc", "gfx12-insts,wavefrontsize32")
437+
TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_b128_v8f16, "V8hV8h*1", "nc", "gfx12-insts,wavefrontsize32")
438+
TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_b128_v8bf16, "V8yV8y*1", "nc", "gfx12-insts,wavefrontsize32")
439+
440+
TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_b64_i32, "ii*1", "nc", "gfx12-insts,wavefrontsize64")
441+
TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_b128_v4i16, "V4sV4s*1", "nc", "gfx12-insts,wavefrontsize64")
442+
TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_b128_v4f16, "V4hV4h*1", "nc", "gfx12-insts,wavefrontsize64")
443+
TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_b128_v4bf16, "V4yV4y*1", "nc", "gfx12-insts,wavefrontsize64")
442444

443445
//===----------------------------------------------------------------------===//
444446
// WMMA builtins.

clang/lib/CodeGen/CGBuiltin.cpp

Lines changed: 22 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -18531,35 +18531,45 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
1853118531
llvm::Function *F = CGM.getIntrinsic(IID, {ArgTy});
1853218532
return Builder.CreateCall(F, {Addr, Val, ZeroI32, ZeroI32, ZeroI1});
1853318533
}
18534-
case AMDGPU::BI__builtin_amdgcn_global_load_tr_i32:
18535-
case AMDGPU::BI__builtin_amdgcn_global_load_tr_v2i32:
18536-
case AMDGPU::BI__builtin_amdgcn_global_load_tr_v4f16:
18537-
case AMDGPU::BI__builtin_amdgcn_global_load_tr_v4i16:
18538-
case AMDGPU::BI__builtin_amdgcn_global_load_tr_v8f16:
18539-
case AMDGPU::BI__builtin_amdgcn_global_load_tr_v8i16: {
18534+
case AMDGPU::BI__builtin_amdgcn_global_load_tr_b64_i32:
18535+
case AMDGPU::BI__builtin_amdgcn_global_load_tr_b64_v2i32:
18536+
case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v4bf16:
18537+
case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v4f16:
18538+
case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v4i16:
18539+
case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8bf16:
18540+
case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8f16:
18541+
case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8i16: {
1854018542

1854118543
llvm::Type *ArgTy;
1854218544
switch (BuiltinID) {
18543-
case AMDGPU::BI__builtin_amdgcn_global_load_tr_i32:
18545+
case AMDGPU::BI__builtin_amdgcn_global_load_tr_b64_i32:
1854418546
ArgTy = llvm::Type::getInt32Ty(getLLVMContext());
1854518547
break;
18546-
case AMDGPU::BI__builtin_amdgcn_global_load_tr_v2i32:
18548+
case AMDGPU::BI__builtin_amdgcn_global_load_tr_b64_v2i32:
1854718549
ArgTy = llvm::FixedVectorType::get(
1854818550
llvm::Type::getInt32Ty(getLLVMContext()), 2);
1854918551
break;
18550-
case AMDGPU::BI__builtin_amdgcn_global_load_tr_v4f16:
18552+
case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v4bf16:
18553+
ArgTy = llvm::FixedVectorType::get(
18554+
llvm::Type::getBFloatTy(getLLVMContext()), 4);
18555+
break;
18556+
case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v4f16:
1855118557
ArgTy = llvm::FixedVectorType::get(
1855218558
llvm::Type::getHalfTy(getLLVMContext()), 4);
1855318559
break;
18554-
case AMDGPU::BI__builtin_amdgcn_global_load_tr_v4i16:
18560+
case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v4i16:
1855518561
ArgTy = llvm::FixedVectorType::get(
1855618562
llvm::Type::getInt16Ty(getLLVMContext()), 4);
1855718563
break;
18558-
case AMDGPU::BI__builtin_amdgcn_global_load_tr_v8f16:
18564+
case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8bf16:
18565+
ArgTy = llvm::FixedVectorType::get(
18566+
llvm::Type::getBFloatTy(getLLVMContext()), 8);
18567+
break;
18568+
case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8f16:
1855918569
ArgTy = llvm::FixedVectorType::get(
1856018570
llvm::Type::getHalfTy(getLLVMContext()), 8);
1856118571
break;
18562-
case AMDGPU::BI__builtin_amdgcn_global_load_tr_v8i16:
18572+
case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8i16:
1856318573
ArgTy = llvm::FixedVectorType::get(
1856418574
llvm::Type::getInt16Ty(getLLVMContext()), 8);
1856518575
break;

clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-gfx11-err.cl

Lines changed: 13 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -6,21 +6,22 @@
66
typedef int v2i __attribute__((ext_vector_type(2)));
77
typedef half v8h __attribute__((ext_vector_type(8)));
88
typedef short v8s __attribute__((ext_vector_type(8)));
9+
typedef __bf16 v8bf16 __attribute__((ext_vector_type(8)));
910

1011
typedef half v4h __attribute__((ext_vector_type(4)));
1112
typedef short v4s __attribute__((ext_vector_type(4)));
13+
typedef __bf16 v4bf16 __attribute__((ext_vector_type(4)));
1214

13-
14-
15-
void amdgcn_global_load_tr(global v2i* v2i_inptr, global v8s* v8s_inptr, global v8h* v8h_inptr,
16-
global int* int_inptr, global v4s* v4s_inptr, global v4h* v4h_inptr)
15+
void amdgcn_global_load_tr(global v2i* v2i_inptr, global v8s* v8s_inptr, global v8h* v8h_inptr, global v8bf16* v8bf16_inptr,
16+
global int* int_inptr, global v4s* v4s_inptr, global v4h* v4h_inptr, global v4bf16* v4bf16_inptr)
1717
{
18-
v2i out_1 = __builtin_amdgcn_global_load_tr_v2i32(v2i_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_v2i32' needs target feature gfx12-insts,wavefrontsize32}}
19-
v8s out_2 = __builtin_amdgcn_global_load_tr_v8i16(v8s_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_v8i16' needs target feature gfx12-insts,wavefrontsize32}}
20-
v8h out_3 = __builtin_amdgcn_global_load_tr_v8f16(v8h_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_v8f16' needs target feature gfx12-insts,wavefrontsize32}}
21-
22-
int out_4 = __builtin_amdgcn_global_load_tr_i32(int_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_i32' needs target feature gfx12-insts,wavefrontsize64}}
23-
v4s out_5 = __builtin_amdgcn_global_load_tr_v4i16(v4s_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_v4i16' needs target feature gfx12-insts,wavefrontsize64}}
24-
v4h out_6 = __builtin_amdgcn_global_load_tr_v4f16(v4h_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_v4f16' needs target feature gfx12-insts,wavefrontsize64}}
18+
v2i out_1 = __builtin_amdgcn_global_load_tr_b64_v2i32(v2i_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_b64_v2i32' needs target feature gfx12-insts,wavefrontsize32}}
19+
v8s out_2 = __builtin_amdgcn_global_load_tr_b128_v8i16(v8s_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_b128_v8i16' needs target feature gfx12-insts,wavefrontsize32}}
20+
v8h out_3 = __builtin_amdgcn_global_load_tr_b128_v8f16(v8h_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_b128_v8f16' needs target feature gfx12-insts,wavefrontsize32}}
21+
v8bf16 o4 = __builtin_amdgcn_global_load_tr_b128_v8bf16(v8bf16_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_b128_v8bf16' needs target feature gfx12-insts,wavefrontsize32}}
22+
23+
int out_5 = __builtin_amdgcn_global_load_tr_b64_i32(int_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_b64_i32' needs target feature gfx12-insts,wavefrontsize64}}
24+
v4s out_6 = __builtin_amdgcn_global_load_tr_b128_v4i16(v4s_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_b128_v4i16' needs target feature gfx12-insts,wavefrontsize64}}
25+
v4h out_7 = __builtin_amdgcn_global_load_tr_b128_v4f16(v4h_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_b128_v4f16' needs target feature gfx12-insts,wavefrontsize64}}
26+
v4bf16 o8 = __builtin_amdgcn_global_load_tr_b128_v4bf16(v4bf16_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_b128_v4bf16' needs target feature gfx12-insts,wavefrontsize64}}
2527
}
26-

clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-gfx12-w32-err.cl

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -5,11 +5,12 @@
55

66
typedef half v4h __attribute__((ext_vector_type(4)));
77
typedef short v4s __attribute__((ext_vector_type(4)));
8+
typedef __bf16 v4bf16 __attribute__((ext_vector_type(4)));
89

9-
void amdgcn_global_load_tr(global int* int_inptr, global v4s* v4s_inptr, global v4h* v4h_inptr)
10+
void amdgcn_global_load_tr(global int* int_inptr, global v4s* v4s_inptr, global v4h* v4h_inptr, global v4bf16* v4bf16_inptr)
1011
{
11-
int out_4 = __builtin_amdgcn_global_load_tr_i32(int_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_i32' needs target feature gfx12-insts,wavefrontsize64}}
12-
v4s out_5 = __builtin_amdgcn_global_load_tr_v4i16(v4s_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_v4i16' needs target feature gfx12-insts,wavefrontsize64}}
13-
v4h out_6 = __builtin_amdgcn_global_load_tr_v4f16(v4h_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_v4f16' needs target feature gfx12-insts,wavefrontsize64}}
12+
int out_1 = __builtin_amdgcn_global_load_tr_b64_i32(int_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_b64_i32' needs target feature gfx12-insts,wavefrontsize64}}
13+
v4s out_2 = __builtin_amdgcn_global_load_tr_b128_v4i16(v4s_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_b128_v4i16' needs target feature gfx12-insts,wavefrontsize64}}
14+
v4h out_3 = __builtin_amdgcn_global_load_tr_b128_v4f16(v4h_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_b128_v4f16' needs target feature gfx12-insts,wavefrontsize64}}
15+
v4bf16 o4 = __builtin_amdgcn_global_load_tr_b128_v4bf16(v4bf16_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_b128_v4bf16' needs target feature gfx12-insts,wavefrontsize64}}
1416
}
15-

clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-gfx12-w64-err.cl

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -6,11 +6,12 @@
66
typedef int v2i __attribute__((ext_vector_type(2)));
77
typedef half v8h __attribute__((ext_vector_type(8)));
88
typedef short v8s __attribute__((ext_vector_type(8)));
9+
typedef __bf16 v8bf16 __attribute__((ext_vector_type(8)));
910

10-
void amdgcn_global_load_tr(global v2i* v2i_inptr, global v8s* v8s_inptr, global v8h* v8h_inptr)
11+
void amdgcn_global_load_tr(global v2i* v2i_inptr, global v8s* v8s_inptr, global v8h* v8h_inptr, global v8bf16* v8bf16_inptr)
1112
{
12-
v2i out_1 = __builtin_amdgcn_global_load_tr_v2i32(v2i_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_v2i32' needs target feature gfx12-insts,wavefrontsize32}}
13-
v8s out_2 = __builtin_amdgcn_global_load_tr_v8i16(v8s_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_v8i16' needs target feature gfx12-insts,wavefrontsize32}}
14-
v8h out_3 = __builtin_amdgcn_global_load_tr_v8f16(v8h_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_v8f16' needs target feature gfx12-insts,wavefrontsize32}}
13+
v2i out_1 = __builtin_amdgcn_global_load_tr_b64_v2i32(v2i_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_b64_v2i32' needs target feature gfx12-insts,wavefrontsize32}}
14+
v8s out_2 = __builtin_amdgcn_global_load_tr_b128_v8i16(v8s_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_b128_v8i16' needs target feature gfx12-insts,wavefrontsize32}}
15+
v8h out_3 = __builtin_amdgcn_global_load_tr_b128_v8f16(v8h_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_b128_v8f16' needs target feature gfx12-insts,wavefrontsize32}}
16+
v8bf16 o4 = __builtin_amdgcn_global_load_tr_b128_v8bf16(v8bf16_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_b128_v8bf16' needs target feature gfx12-insts,wavefrontsize32}}
1517
}
16-

clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-w32.cl

Lines changed: 19 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -5,44 +5,44 @@
55
typedef int v2i __attribute__((ext_vector_type(2)));
66
typedef half v8h __attribute__((ext_vector_type(8)));
77
typedef short v8s __attribute__((ext_vector_type(8)));
8+
typedef __bf16 v8bf16 __attribute__((ext_vector_type(8)));
89

9-
// Wave32
10-
11-
//
12-
// amdgcn_global_load_tr
13-
//
14-
15-
// CHECK-GFX1200-LABEL: @test_amdgcn_global_load_tr_v2i32(
10+
// CHECK-GFX1200-LABEL: @test_amdgcn_global_load_tr_b64_v2i32(
1611
// CHECK-GFX1200-NEXT: entry:
1712
// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <2 x i32> @llvm.amdgcn.global.load.tr.v2i32(ptr addrspace(1) [[INPTR:%.*]])
1813
// CHECK-GFX1200-NEXT: ret <2 x i32> [[TMP0]]
1914
//
20-
v2i test_amdgcn_global_load_tr_v2i32(global v2i* inptr)
15+
v2i test_amdgcn_global_load_tr_b64_v2i32(global v2i* inptr)
2116
{
22-
return __builtin_amdgcn_global_load_tr_v2i32(inptr);
17+
return __builtin_amdgcn_global_load_tr_b64_v2i32(inptr);
2318
}
2419

25-
//
26-
// amdgcn_global_load_tr
27-
//
28-
29-
// CHECK-GFX1200-LABEL: @test_amdgcn_global_load_tr_v8i16(
20+
// CHECK-GFX1200-LABEL: @test_amdgcn_global_load_tr_b128_v8i16(
3021
// CHECK-GFX1200-NEXT: entry:
3122
// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.amdgcn.global.load.tr.v8i16(ptr addrspace(1) [[INPTR:%.*]])
3223
// CHECK-GFX1200-NEXT: ret <8 x i16> [[TMP0]]
3324
//
34-
v8s test_amdgcn_global_load_tr_v8i16(global v8s* inptr)
25+
v8s test_amdgcn_global_load_tr_b128_v8i16(global v8s* inptr)
3526
{
36-
return __builtin_amdgcn_global_load_tr_v8i16(inptr);
27+
return __builtin_amdgcn_global_load_tr_b128_v8i16(inptr);
3728
}
3829

39-
// CHECK-GFX1200-LABEL: @test_amdgcn_global_load_tr_v8f16(
30+
// CHECK-GFX1200-LABEL: @test_amdgcn_global_load_tr_b128_v8f16(
4031
// CHECK-GFX1200-NEXT: entry:
4132
// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x half> @llvm.amdgcn.global.load.tr.v8f16(ptr addrspace(1) [[INPTR:%.*]])
4233
// CHECK-GFX1200-NEXT: ret <8 x half> [[TMP0]]
4334
//
44-
v8h test_amdgcn_global_load_tr_v8f16(global v8h* inptr)
35+
v8h test_amdgcn_global_load_tr_b128_v8f16(global v8h* inptr)
4536
{
46-
return __builtin_amdgcn_global_load_tr_v8f16(inptr);
37+
return __builtin_amdgcn_global_load_tr_b128_v8f16(inptr);
4738
}
4839

40+
// CHECK-GFX1200-LABEL: @test_amdgcn_global_load_tr_b128_v8bf16(
41+
// CHECK-GFX1200-NEXT: entry:
42+
// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x bfloat> @llvm.amdgcn.global.load.tr.v8bf16(ptr addrspace(1) [[INPTR:%.*]])
43+
// CHECK-GFX1200-NEXT: ret <8 x bfloat> [[TMP0]]
44+
//
45+
v8bf16 test_amdgcn_global_load_tr_b128_v8bf16(global v8bf16* inptr)
46+
{
47+
return __builtin_amdgcn_global_load_tr_b128_v8bf16(inptr);
48+
}

clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-w64.cl

Lines changed: 19 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -4,44 +4,44 @@
44

55
typedef half v4h __attribute__((ext_vector_type(4)));
66
typedef short v4s __attribute__((ext_vector_type(4)));
7+
typedef __bf16 v4bf16 __attribute__((ext_vector_type(4)));
78

8-
// Wave64
9-
10-
//
11-
// amdgcn_global_load_tr
12-
//
13-
14-
// CHECK-GFX1200-LABEL: @test_amdgcn_global_load_tr_i32(
9+
// CHECK-GFX1200-LABEL: @test_amdgcn_global_load_tr_b64_i32(
1510
// CHECK-GFX1200-NEXT: entry:
1611
// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.amdgcn.global.load.tr.i32(ptr addrspace(1) [[INPTR:%.*]])
1712
// CHECK-GFX1200-NEXT: ret i32 [[TMP0]]
1813
//
19-
int test_amdgcn_global_load_tr_i32(global int* inptr)
14+
int test_amdgcn_global_load_tr_b64_i32(global int* inptr)
2015
{
21-
return __builtin_amdgcn_global_load_tr_i32(inptr);
16+
return __builtin_amdgcn_global_load_tr_b64_i32(inptr);
2217
}
2318

24-
//
25-
// amdgcn_global_load_tr
26-
//
27-
28-
// CHECK-GFX1200-LABEL: @test_amdgcn_global_load_tr_v4i16(
19+
// CHECK-GFX1200-LABEL: @test_amdgcn_global_load_tr_b128_v4i16(
2920
// CHECK-GFX1200-NEXT: entry:
3021
// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x i16> @llvm.amdgcn.global.load.tr.v4i16(ptr addrspace(1) [[INPTR:%.*]])
3122
// CHECK-GFX1200-NEXT: ret <4 x i16> [[TMP0]]
3223
//
33-
v4s test_amdgcn_global_load_tr_v4i16(global v4s* inptr)
24+
v4s test_amdgcn_global_load_tr_b128_v4i16(global v4s* inptr)
3425
{
35-
return __builtin_amdgcn_global_load_tr_v4i16(inptr);
26+
return __builtin_amdgcn_global_load_tr_b128_v4i16(inptr);
3627
}
3728

38-
// CHECK-GFX1200-LABEL: @test_amdgcn_global_load_tr_v4f16(
29+
// CHECK-GFX1200-LABEL: @test_amdgcn_global_load_tr_b128_v4f16(
3930
// CHECK-GFX1200-NEXT: entry:
4031
// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x half> @llvm.amdgcn.global.load.tr.v4f16(ptr addrspace(1) [[INPTR:%.*]])
4132
// CHECK-GFX1200-NEXT: ret <4 x half> [[TMP0]]
4233
//
43-
v4h test_amdgcn_global_load_tr_v4f16(global v4h* inptr)
34+
v4h test_amdgcn_global_load_tr_b128_v4f16(global v4h* inptr)
4435
{
45-
return __builtin_amdgcn_global_load_tr_v4f16(inptr);
36+
return __builtin_amdgcn_global_load_tr_b128_v4f16(inptr);
4637
}
4738

39+
// CHECK-GFX1200-LABEL: @test_amdgcn_global_load_tr_b128_v4bf16(
40+
// CHECK-GFX1200-NEXT: entry:
41+
// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x bfloat> @llvm.amdgcn.global.load.tr.v4bf16(ptr addrspace(1) [[INPTR:%.*]])
42+
// CHECK-GFX1200-NEXT: ret <4 x bfloat> [[TMP0]]
43+
//
44+
v4bf16 test_amdgcn_global_load_tr_b128_v4bf16(global v4bf16* inptr)
45+
{
46+
return __builtin_amdgcn_global_load_tr_b128_v4bf16(inptr);
47+
}

0 commit comments

Comments
 (0)