Skip to content

Commit 350bda4

Browse files
authored
AMDGPU: Rename intrinsics and remove f16/bf16 versions for load transpose (#86313)
Rename the intrinsics to be closer to the instruction mnemonic names: use global_load_tr_b64 and global_load_tr_b128 instead of global_load_tr. This patch also removes the f16/bf16 versions of the builtins/intrinsics. To simplify the design, we should avoid enumerating all possible types when implementing builtins; we can always use a bitcast instead.
1 parent 8f86c6b commit 350bda4

15 files changed

+104
-410
lines changed

clang/include/clang/Basic/BuiltinsAMDGPU.def

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -434,13 +434,8 @@ TARGET_BUILTIN(__builtin_amdgcn_s_get_barrier_state, "Uii", "n", "gfx12-insts")
434434

435435
TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_b64_v2i32, "V2iV2i*1", "nc", "gfx12-insts,wavefrontsize32")
436436
TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_b128_v8i16, "V8sV8s*1", "nc", "gfx12-insts,wavefrontsize32")
437-
TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_b128_v8f16, "V8hV8h*1", "nc", "gfx12-insts,wavefrontsize32")
438-
TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_b128_v8bf16, "V8yV8y*1", "nc", "gfx12-insts,wavefrontsize32")
439-
440437
TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_b64_i32, "ii*1", "nc", "gfx12-insts,wavefrontsize64")
441438
TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_b128_v4i16, "V4sV4s*1", "nc", "gfx12-insts,wavefrontsize64")
442-
TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_b128_v4f16, "V4hV4h*1", "nc", "gfx12-insts,wavefrontsize64")
443-
TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_b128_v4bf16, "V4yV4y*1", "nc", "gfx12-insts,wavefrontsize64")
444439

445440
//===----------------------------------------------------------------------===//
446441
// WMMA builtins.

clang/lib/CodeGen/CGBuiltin.cpp

Lines changed: 6 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -18540,51 +18540,35 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
1854018540
}
1854118541
case AMDGPU::BI__builtin_amdgcn_global_load_tr_b64_i32:
1854218542
case AMDGPU::BI__builtin_amdgcn_global_load_tr_b64_v2i32:
18543-
case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v4bf16:
18544-
case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v4f16:
1854518543
case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v4i16:
18546-
case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8bf16:
18547-
case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8f16:
1854818544
case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8i16: {
1854918545

18546+
Intrinsic::ID IID;
1855018547
llvm::Type *ArgTy;
1855118548
switch (BuiltinID) {
1855218549
case AMDGPU::BI__builtin_amdgcn_global_load_tr_b64_i32:
1855318550
ArgTy = llvm::Type::getInt32Ty(getLLVMContext());
18551+
IID = Intrinsic::amdgcn_global_load_tr_b64;
1855418552
break;
1855518553
case AMDGPU::BI__builtin_amdgcn_global_load_tr_b64_v2i32:
1855618554
ArgTy = llvm::FixedVectorType::get(
1855718555
llvm::Type::getInt32Ty(getLLVMContext()), 2);
18558-
break;
18559-
case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v4bf16:
18560-
ArgTy = llvm::FixedVectorType::get(
18561-
llvm::Type::getBFloatTy(getLLVMContext()), 4);
18562-
break;
18563-
case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v4f16:
18564-
ArgTy = llvm::FixedVectorType::get(
18565-
llvm::Type::getHalfTy(getLLVMContext()), 4);
18556+
IID = Intrinsic::amdgcn_global_load_tr_b64;
1856618557
break;
1856718558
case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v4i16:
1856818559
ArgTy = llvm::FixedVectorType::get(
1856918560
llvm::Type::getInt16Ty(getLLVMContext()), 4);
18570-
break;
18571-
case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8bf16:
18572-
ArgTy = llvm::FixedVectorType::get(
18573-
llvm::Type::getBFloatTy(getLLVMContext()), 8);
18574-
break;
18575-
case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8f16:
18576-
ArgTy = llvm::FixedVectorType::get(
18577-
llvm::Type::getHalfTy(getLLVMContext()), 8);
18561+
IID = Intrinsic::amdgcn_global_load_tr_b128;
1857818562
break;
1857918563
case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8i16:
1858018564
ArgTy = llvm::FixedVectorType::get(
1858118565
llvm::Type::getInt16Ty(getLLVMContext()), 8);
18566+
IID = Intrinsic::amdgcn_global_load_tr_b128;
1858218567
break;
1858318568
}
1858418569

1858518570
llvm::Value *Addr = EmitScalarExpr(E->getArg(0));
18586-
llvm::Function *F =
18587-
CGM.getIntrinsic(Intrinsic::amdgcn_global_load_tr, {ArgTy});
18571+
llvm::Function *F = CGM.getIntrinsic(IID, {ArgTy});
1858818572
return Builder.CreateCall(F, {Addr});
1858918573
}
1859018574
case AMDGPU::BI__builtin_amdgcn_get_fpenv: {

clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-gfx11-err.cl

Lines changed: 3 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -4,24 +4,14 @@
44
// REQUIRES: amdgpu-registered-target
55

66
typedef int v2i __attribute__((ext_vector_type(2)));
7-
typedef half v8h __attribute__((ext_vector_type(8)));
87
typedef short v8s __attribute__((ext_vector_type(8)));
9-
typedef __bf16 v8bf16 __attribute__((ext_vector_type(8)));
10-
11-
typedef half v4h __attribute__((ext_vector_type(4)));
128
typedef short v4s __attribute__((ext_vector_type(4)));
13-
typedef __bf16 v4bf16 __attribute__((ext_vector_type(4)));
149

15-
void amdgcn_global_load_tr(global v2i* v2i_inptr, global v8s* v8s_inptr, global v8h* v8h_inptr, global v8bf16* v8bf16_inptr,
16-
global int* int_inptr, global v4s* v4s_inptr, global v4h* v4h_inptr, global v4bf16* v4bf16_inptr)
10+
void amdgcn_global_load_tr(global v2i* v2i_inptr, global v8s* v8s_inptr, global int* int_inptr, global v4s* v4s_inptr)
1711
{
1812
v2i out_1 = __builtin_amdgcn_global_load_tr_b64_v2i32(v2i_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_b64_v2i32' needs target feature gfx12-insts,wavefrontsize32}}
1913
v8s out_2 = __builtin_amdgcn_global_load_tr_b128_v8i16(v8s_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_b128_v8i16' needs target feature gfx12-insts,wavefrontsize32}}
20-
v8h out_3 = __builtin_amdgcn_global_load_tr_b128_v8f16(v8h_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_b128_v8f16' needs target feature gfx12-insts,wavefrontsize32}}
21-
v8bf16 o4 = __builtin_amdgcn_global_load_tr_b128_v8bf16(v8bf16_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_b128_v8bf16' needs target feature gfx12-insts,wavefrontsize32}}
2214

23-
int out_5 = __builtin_amdgcn_global_load_tr_b64_i32(int_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_b64_i32' needs target feature gfx12-insts,wavefrontsize64}}
24-
v4s out_6 = __builtin_amdgcn_global_load_tr_b128_v4i16(v4s_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_b128_v4i16' needs target feature gfx12-insts,wavefrontsize64}}
25-
v4h out_7 = __builtin_amdgcn_global_load_tr_b128_v4f16(v4h_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_b128_v4f16' needs target feature gfx12-insts,wavefrontsize64}}
26-
v4bf16 o8 = __builtin_amdgcn_global_load_tr_b128_v4bf16(v4bf16_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_b128_v4bf16' needs target feature gfx12-insts,wavefrontsize64}}
15+
int out_3 = __builtin_amdgcn_global_load_tr_b64_i32(int_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_b64_i32' needs target feature gfx12-insts,wavefrontsize64}}
16+
v4s out_4 = __builtin_amdgcn_global_load_tr_b128_v4i16(v4s_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_b128_v4i16' needs target feature gfx12-insts,wavefrontsize64}}
2717
}

clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-gfx12-w32-err.cl

Lines changed: 1 addition & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -3,14 +3,10 @@
33

44
// REQUIRES: amdgpu-registered-target
55

6-
typedef half v4h __attribute__((ext_vector_type(4)));
76
typedef short v4s __attribute__((ext_vector_type(4)));
8-
typedef __bf16 v4bf16 __attribute__((ext_vector_type(4)));
97

10-
void amdgcn_global_load_tr(global int* int_inptr, global v4s* v4s_inptr, global v4h* v4h_inptr, global v4bf16* v4bf16_inptr)
8+
void amdgcn_global_load_tr(global int* int_inptr, global v4s* v4s_inptr)
119
{
1210
int out_1 = __builtin_amdgcn_global_load_tr_b64_i32(int_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_b64_i32' needs target feature gfx12-insts,wavefrontsize64}}
1311
v4s out_2 = __builtin_amdgcn_global_load_tr_b128_v4i16(v4s_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_b128_v4i16' needs target feature gfx12-insts,wavefrontsize64}}
14-
v4h out_3 = __builtin_amdgcn_global_load_tr_b128_v4f16(v4h_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_b128_v4f16' needs target feature gfx12-insts,wavefrontsize64}}
15-
v4bf16 o4 = __builtin_amdgcn_global_load_tr_b128_v4bf16(v4bf16_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_b128_v4bf16' needs target feature gfx12-insts,wavefrontsize64}}
1612
}

clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-gfx12-w64-err.cl

Lines changed: 1 addition & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -4,14 +4,10 @@
44
// REQUIRES: amdgpu-registered-target
55

66
typedef int v2i __attribute__((ext_vector_type(2)));
7-
typedef half v8h __attribute__((ext_vector_type(8)));
87
typedef short v8s __attribute__((ext_vector_type(8)));
9-
typedef __bf16 v8bf16 __attribute__((ext_vector_type(8)));
108

11-
void amdgcn_global_load_tr(global v2i* v2i_inptr, global v8s* v8s_inptr, global v8h* v8h_inptr, global v8bf16* v8bf16_inptr)
9+
void amdgcn_global_load_tr(global v2i* v2i_inptr, global v8s* v8s_inptr)
1210
{
1311
v2i out_1 = __builtin_amdgcn_global_load_tr_b64_v2i32(v2i_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_b64_v2i32' needs target feature gfx12-insts,wavefrontsize32}}
1412
v8s out_2 = __builtin_amdgcn_global_load_tr_b128_v8i16(v8s_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_b128_v8i16' needs target feature gfx12-insts,wavefrontsize32}}
15-
v8h out_3 = __builtin_amdgcn_global_load_tr_b128_v8f16(v8h_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_b128_v8f16' needs target feature gfx12-insts,wavefrontsize32}}
16-
v8bf16 o4 = __builtin_amdgcn_global_load_tr_b128_v8bf16(v8bf16_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_b128_v8bf16' needs target feature gfx12-insts,wavefrontsize32}}
1713
}

clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-w32.cl

Lines changed: 2 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -3,13 +3,11 @@
33
// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1200 -target-feature +wavefrontsize32 -S -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-GFX1200
44

55
typedef int v2i __attribute__((ext_vector_type(2)));
6-
typedef half v8h __attribute__((ext_vector_type(8)));
76
typedef short v8s __attribute__((ext_vector_type(8)));
8-
typedef __bf16 v8bf16 __attribute__((ext_vector_type(8)));
97

108
// CHECK-GFX1200-LABEL: @test_amdgcn_global_load_tr_b64_v2i32(
119
// CHECK-GFX1200-NEXT: entry:
12-
// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <2 x i32> @llvm.amdgcn.global.load.tr.v2i32(ptr addrspace(1) [[INPTR:%.*]])
10+
// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <2 x i32> @llvm.amdgcn.global.load.tr.b64.v2i32(ptr addrspace(1) [[INPTR:%.*]])
1311
// CHECK-GFX1200-NEXT: ret <2 x i32> [[TMP0]]
1412
//
1513
v2i test_amdgcn_global_load_tr_b64_v2i32(global v2i* inptr)
@@ -19,30 +17,10 @@ v2i test_amdgcn_global_load_tr_b64_v2i32(global v2i* inptr)
1917

2018
// CHECK-GFX1200-LABEL: @test_amdgcn_global_load_tr_b128_v8i16(
2119
// CHECK-GFX1200-NEXT: entry:
22-
// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.amdgcn.global.load.tr.v8i16(ptr addrspace(1) [[INPTR:%.*]])
20+
// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.amdgcn.global.load.tr.b128.v8i16(ptr addrspace(1) [[INPTR:%.*]])
2321
// CHECK-GFX1200-NEXT: ret <8 x i16> [[TMP0]]
2422
//
2523
v8s test_amdgcn_global_load_tr_b128_v8i16(global v8s* inptr)
2624
{
2725
return __builtin_amdgcn_global_load_tr_b128_v8i16(inptr);
2826
}
29-
30-
// CHECK-GFX1200-LABEL: @test_amdgcn_global_load_tr_b128_v8f16(
31-
// CHECK-GFX1200-NEXT: entry:
32-
// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x half> @llvm.amdgcn.global.load.tr.v8f16(ptr addrspace(1) [[INPTR:%.*]])
33-
// CHECK-GFX1200-NEXT: ret <8 x half> [[TMP0]]
34-
//
35-
v8h test_amdgcn_global_load_tr_b128_v8f16(global v8h* inptr)
36-
{
37-
return __builtin_amdgcn_global_load_tr_b128_v8f16(inptr);
38-
}
39-
40-
// CHECK-GFX1200-LABEL: @test_amdgcn_global_load_tr_b128_v8bf16(
41-
// CHECK-GFX1200-NEXT: entry:
42-
// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x bfloat> @llvm.amdgcn.global.load.tr.v8bf16(ptr addrspace(1) [[INPTR:%.*]])
43-
// CHECK-GFX1200-NEXT: ret <8 x bfloat> [[TMP0]]
44-
//
45-
v8bf16 test_amdgcn_global_load_tr_b128_v8bf16(global v8bf16* inptr)
46-
{
47-
return __builtin_amdgcn_global_load_tr_b128_v8bf16(inptr);
48-
}

clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-w64.cl

Lines changed: 2 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -2,13 +2,11 @@
22
// REQUIRES: amdgpu-registered-target
33
// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1200 -target-feature +wavefrontsize64 -S -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-GFX1200
44

5-
typedef half v4h __attribute__((ext_vector_type(4)));
65
typedef short v4s __attribute__((ext_vector_type(4)));
7-
typedef __bf16 v4bf16 __attribute__((ext_vector_type(4)));
86

97
// CHECK-GFX1200-LABEL: @test_amdgcn_global_load_tr_b64_i32(
108
// CHECK-GFX1200-NEXT: entry:
11-
// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.amdgcn.global.load.tr.i32(ptr addrspace(1) [[INPTR:%.*]])
9+
// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.amdgcn.global.load.tr.b64.i32(ptr addrspace(1) [[INPTR:%.*]])
1210
// CHECK-GFX1200-NEXT: ret i32 [[TMP0]]
1311
//
1412
int test_amdgcn_global_load_tr_b64_i32(global int* inptr)
@@ -18,30 +16,10 @@ int test_amdgcn_global_load_tr_b64_i32(global int* inptr)
1816

1917
// CHECK-GFX1200-LABEL: @test_amdgcn_global_load_tr_b128_v4i16(
2018
// CHECK-GFX1200-NEXT: entry:
21-
// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x i16> @llvm.amdgcn.global.load.tr.v4i16(ptr addrspace(1) [[INPTR:%.*]])
19+
// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x i16> @llvm.amdgcn.global.load.tr.b128.v4i16(ptr addrspace(1) [[INPTR:%.*]])
2220
// CHECK-GFX1200-NEXT: ret <4 x i16> [[TMP0]]
2321
//
2422
v4s test_amdgcn_global_load_tr_b128_v4i16(global v4s* inptr)
2523
{
2624
return __builtin_amdgcn_global_load_tr_b128_v4i16(inptr);
2725
}
28-
29-
// CHECK-GFX1200-LABEL: @test_amdgcn_global_load_tr_b128_v4f16(
30-
// CHECK-GFX1200-NEXT: entry:
31-
// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x half> @llvm.amdgcn.global.load.tr.v4f16(ptr addrspace(1) [[INPTR:%.*]])
32-
// CHECK-GFX1200-NEXT: ret <4 x half> [[TMP0]]
33-
//
34-
v4h test_amdgcn_global_load_tr_b128_v4f16(global v4h* inptr)
35-
{
36-
return __builtin_amdgcn_global_load_tr_b128_v4f16(inptr);
37-
}
38-
39-
// CHECK-GFX1200-LABEL: @test_amdgcn_global_load_tr_b128_v4bf16(
40-
// CHECK-GFX1200-NEXT: entry:
41-
// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x bfloat> @llvm.amdgcn.global.load.tr.v4bf16(ptr addrspace(1) [[INPTR:%.*]])
42-
// CHECK-GFX1200-NEXT: ret <4 x bfloat> [[TMP0]]
43-
//
44-
v4bf16 test_amdgcn_global_load_tr_b128_v4bf16(global v4bf16* inptr)
45-
{
46-
return __builtin_amdgcn_global_load_tr_b128_v4bf16(inptr);
47-
}

llvm/include/llvm/IR/IntrinsicsAMDGPU.td

Lines changed: 6 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -2769,17 +2769,14 @@ class AMDGPULoadIntrinsic<LLVMType ptr_ty>:
27692769
>;
27702770

27712771
// Wave32
2772-
// <2 x i32> @llvm.amdgcn.global.load.tr.v2i32(ptr addrspace(1)) -> global_load_tr_b64
2773-
// <8 x i16> @llvm.amdgcn.global.load.tr.v8i16(ptr addrspace(1)) -> global_load_tr_b128
2774-
// <8 x half> @llvm.amdgcn.global.load.tr.v8f16(ptr addrspace(1)) -> global_load_tr_b128
2775-
// <8 x bfloat> @llvm.amdgcn.global.load.tr.v8bf16(ptr addrspace(1)) -> global_load_tr_b128
2772+
// <2 x i32> @llvm.amdgcn.global.load.tr.b64.v2i32(ptr addrspace(1)) -> global_load_tr_b64
2773+
// <8 x i16> @llvm.amdgcn.global.load.tr.b128.v8i16(ptr addrspace(1)) -> global_load_tr_b128
27762774
// Wave64
2777-
// i32 @llvm.amdgcn.global.load.tr.i32(ptr addrspace(1)) -> global_load_tr_b64
2778-
// <4 x i16> @llvm.amdgcn.global.load.tr.v4i16(ptr addrspace(1)) -> global_load_tr_b128
2779-
// <4 x half> @llvm.amdgcn.global.load.tr.v4f16(ptr addrspace(1)) -> global_load_tr_b128
2780-
// <4 x bfloat> @llvm.amdgcn.global.load.tr.v4bf16(ptr addrspace(1)) -> global_load_tr_b128
2775+
// i32 @llvm.amdgcn.global.load.tr.b64.i32(ptr addrspace(1)) -> global_load_tr_b64
2776+
// <4 x i16> @llvm.amdgcn.global.load.tr.b128.v4i16(ptr addrspace(1)) -> global_load_tr_b128
27812777

2782-
def int_amdgcn_global_load_tr : AMDGPULoadIntrinsic<global_ptr_ty>;
2778+
def int_amdgcn_global_load_tr_b64 : AMDGPULoadIntrinsic<global_ptr_ty>;
2779+
def int_amdgcn_global_load_tr_b128 : AMDGPULoadIntrinsic<global_ptr_ty>;
27832780

27842781
// i32 @llvm.amdgcn.wave.id()
27852782
def int_amdgcn_wave_id :

llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4889,7 +4889,8 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
48894889
case Intrinsic::amdgcn_flat_atomic_fadd_v2bf16:
48904890
case Intrinsic::amdgcn_atomic_cond_sub_u32:
48914891
case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
4892-
case Intrinsic::amdgcn_global_load_tr:
4892+
case Intrinsic::amdgcn_global_load_tr_b64:
4893+
case Intrinsic::amdgcn_global_load_tr_b128:
48934894
return getDefaultMappingAllVGPR(MI);
48944895
case Intrinsic::amdgcn_ds_ordered_add:
48954896
case Intrinsic::amdgcn_ds_ordered_swap:

llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -365,7 +365,8 @@ def : SourceOfDivergence<intr>;
365365
foreach intr = AMDGPUWMMAIntrinsicsGFX12 in
366366
def : SourceOfDivergence<intr>;
367367

368-
def : SourceOfDivergence<int_amdgcn_global_load_tr>;
368+
def : SourceOfDivergence<int_amdgcn_global_load_tr_b64>;
369+
def : SourceOfDivergence<int_amdgcn_global_load_tr_b128>;
369370

370371
// The dummy boolean output is divergent from the IR's perspective,
371372
// but the mask results are uniform. These produce a divergent and

llvm/lib/Target/AMDGPU/FLATInstructions.td

Lines changed: 4 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1593,16 +1593,12 @@ let OtherPredicates = [isGFX12Plus] in {
15931593
defm : GlobalFLATAtomicPatsRtn <"GLOBAL_ATOMIC_ORDERED_ADD_B64", "int_amdgcn_global_atomic_ordered_add_b64", i64, i64, /* isIntr */ 1>;
15941594

15951595
let WaveSizePredicate = isWave32 in {
1596-
defm : GlobalFLATLoadPats <GLOBAL_LOAD_TR_B64_w32, int_amdgcn_global_load_tr, v2i32>;
1597-
defm : GlobalFLATLoadPats <GLOBAL_LOAD_TR_B128_w32, int_amdgcn_global_load_tr, v8i16>;
1598-
defm : GlobalFLATLoadPats <GLOBAL_LOAD_TR_B128_w32, int_amdgcn_global_load_tr, v8f16>;
1599-
defm : GlobalFLATLoadPats <GLOBAL_LOAD_TR_B128_w32, int_amdgcn_global_load_tr, v8bf16>;
1596+
defm : GlobalFLATLoadPats <GLOBAL_LOAD_TR_B64_w32, int_amdgcn_global_load_tr_b64, v2i32>;
1597+
defm : GlobalFLATLoadPats <GLOBAL_LOAD_TR_B128_w32, int_amdgcn_global_load_tr_b128, v8i16>;
16001598
}
16011599
let WaveSizePredicate = isWave64 in {
1602-
defm : GlobalFLATLoadPats <GLOBAL_LOAD_TR_B64_w64, int_amdgcn_global_load_tr, i32>;
1603-
defm : GlobalFLATLoadPats <GLOBAL_LOAD_TR_B128_w64, int_amdgcn_global_load_tr, v4i16>;
1604-
defm : GlobalFLATLoadPats <GLOBAL_LOAD_TR_B128_w64, int_amdgcn_global_load_tr, v4f16>;
1605-
defm : GlobalFLATLoadPats <GLOBAL_LOAD_TR_B128_w64, int_amdgcn_global_load_tr, v4bf16>;
1600+
defm : GlobalFLATLoadPats <GLOBAL_LOAD_TR_B64_w64, int_amdgcn_global_load_tr_b64, i32>;
1601+
defm : GlobalFLATLoadPats <GLOBAL_LOAD_TR_B128_w64, int_amdgcn_global_load_tr_b128, v4i16>;
16061602
}
16071603
}
16081604

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1355,7 +1355,8 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
13551355
MachineMemOperand::MOVolatile;
13561356
return true;
13571357
}
1358-
case Intrinsic::amdgcn_global_load_tr: {
1358+
case Intrinsic::amdgcn_global_load_tr_b64:
1359+
case Intrinsic::amdgcn_global_load_tr_b128: {
13591360
Info.opc = ISD::INTRINSIC_W_CHAIN;
13601361
Info.memVT = MVT::getVT(CI.getType());
13611362
Info.ptrVal = CI.getOperand(0);
@@ -1462,7 +1463,8 @@ bool SITargetLowering::getAddrModeArguments(IntrinsicInst *II,
14621463
case Intrinsic::amdgcn_global_atomic_fmin:
14631464
case Intrinsic::amdgcn_global_atomic_fmin_num:
14641465
case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
1465-
case Intrinsic::amdgcn_global_load_tr:
1466+
case Intrinsic::amdgcn_global_load_tr_b64:
1467+
case Intrinsic::amdgcn_global_load_tr_b128:
14661468
Ptr = II->getArgOperand(0);
14671469
break;
14681470
case Intrinsic::amdgcn_global_load_lds:

0 commit comments

Comments
 (0)