AMDGPU: Rename and add bf16 support for global_load_tr builtins (#86202)

changpeng · web-flow · commit 3054d0dae7a8 · 2024-03-22T08:51:53.000-07:00
Make the name of a clang builtin as close to the mnemonic instruction
name as possible. The data type suffix may not be enough to tell what
instruction the builtin is going to produce.
  This patch also add the bf16 support for global_load_tr_b128 builtins.
diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def
@@ -432,13 +432,15 @@ TARGET_BUILTIN(__builtin_amdgcn_s_wakeup_barrier, "vi", "n", "gfx12-insts")
 TARGET_BUILTIN(__builtin_amdgcn_s_barrier_leave, "b", "n", "gfx12-insts")
 TARGET_BUILTIN(__builtin_amdgcn_s_get_barrier_state, "Uii", "n", "gfx12-insts")
 
-TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_v2i32, "V2iV2i*1", "nc", "gfx12-insts,wavefrontsize32")
-TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_v8i16, "V8sV8s*1", "nc", "gfx12-insts,wavefrontsize32")
-TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_v8f16, "V8hV8h*1", "nc", "gfx12-insts,wavefrontsize32")
-
-TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_i32, "ii*1", "nc", "gfx12-insts,wavefrontsize64")
-TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_v4i16, "V4sV4s*1", "nc", "gfx12-insts,wavefrontsize64")
-TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_v4f16, "V4hV4h*1", "nc", "gfx12-insts,wavefrontsize64")
+TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_b64_v2i32, "V2iV2i*1", "nc", "gfx12-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_b128_v8i16, "V8sV8s*1", "nc", "gfx12-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_b128_v8f16, "V8hV8h*1", "nc", "gfx12-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_b128_v8bf16, "V8yV8y*1", "nc", "gfx12-insts,wavefrontsize32")
+
+TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_b64_i32, "ii*1", "nc", "gfx12-insts,wavefrontsize64")
+TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_b128_v4i16, "V4sV4s*1", "nc", "gfx12-insts,wavefrontsize64")
+TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_b128_v4f16, "V4hV4h*1", "nc", "gfx12-insts,wavefrontsize64")
+TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_b128_v4bf16, "V4yV4y*1", "nc", "gfx12-insts,wavefrontsize64")
 
 //===----------------------------------------------------------------------===//
 // WMMA builtins.
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -18531,35 +18531,45 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
     llvm::Function *F = CGM.getIntrinsic(IID, {ArgTy});
     return Builder.CreateCall(F, {Addr, Val, ZeroI32, ZeroI32, ZeroI1});
   }
-  case AMDGPU::BI__builtin_amdgcn_global_load_tr_i32:
-  case AMDGPU::BI__builtin_amdgcn_global_load_tr_v2i32:
-  case AMDGPU::BI__builtin_amdgcn_global_load_tr_v4f16:
-  case AMDGPU::BI__builtin_amdgcn_global_load_tr_v4i16:
-  case AMDGPU::BI__builtin_amdgcn_global_load_tr_v8f16:
-  case AMDGPU::BI__builtin_amdgcn_global_load_tr_v8i16: {
+  case AMDGPU::BI__builtin_amdgcn_global_load_tr_b64_i32:
+  case AMDGPU::BI__builtin_amdgcn_global_load_tr_b64_v2i32:
+  case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v4bf16:
+  case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v4f16:
+  case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v4i16:
+  case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8bf16:
+  case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8f16:
+  case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8i16: {
 
     llvm::Type *ArgTy;
     switch (BuiltinID) {
-    case AMDGPU::BI__builtin_amdgcn_global_load_tr_i32:
+    case AMDGPU::BI__builtin_amdgcn_global_load_tr_b64_i32:
       ArgTy = llvm::Type::getInt32Ty(getLLVMContext());
       break;
-    case AMDGPU::BI__builtin_amdgcn_global_load_tr_v2i32:
+    case AMDGPU::BI__builtin_amdgcn_global_load_tr_b64_v2i32:
       ArgTy = llvm::FixedVectorType::get(
           llvm::Type::getInt32Ty(getLLVMContext()), 2);
       break;
-    case AMDGPU::BI__builtin_amdgcn_global_load_tr_v4f16:
+    case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v4bf16:
+      ArgTy = llvm::FixedVectorType::get(
+          llvm::Type::getBFloatTy(getLLVMContext()), 4);
+      break;
+    case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v4f16:
       ArgTy = llvm::FixedVectorType::get(
           llvm::Type::getHalfTy(getLLVMContext()), 4);
       break;
-    case AMDGPU::BI__builtin_amdgcn_global_load_tr_v4i16:
+    case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v4i16:
       ArgTy = llvm::FixedVectorType::get(
           llvm::Type::getInt16Ty(getLLVMContext()), 4);
       break;
-    case AMDGPU::BI__builtin_amdgcn_global_load_tr_v8f16:
+    case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8bf16:
+      ArgTy = llvm::FixedVectorType::get(
+          llvm::Type::getBFloatTy(getLLVMContext()), 8);
+      break;
+    case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8f16:
       ArgTy = llvm::FixedVectorType::get(
           llvm::Type::getHalfTy(getLLVMContext()), 8);
       break;
-    case AMDGPU::BI__builtin_amdgcn_global_load_tr_v8i16:
+    case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8i16:
       ArgTy = llvm::FixedVectorType::get(
           llvm::Type::getInt16Ty(getLLVMContext()), 8);
       break;
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-gfx11-err.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-gfx11-err.cl
@@ -6,21 +6,22 @@
 typedef int    v2i   __attribute__((ext_vector_type(2)));
 typedef half   v8h   __attribute__((ext_vector_type(8)));
 typedef short  v8s   __attribute__((ext_vector_type(8)));
+typedef __bf16 v8bf16 __attribute__((ext_vector_type(8)));
 
 typedef half   v4h   __attribute__((ext_vector_type(4)));
 typedef short  v4s   __attribute__((ext_vector_type(4)));
+typedef __bf16 v4bf16 __attribute__((ext_vector_type(4)));
 
-
-
-void amdgcn_global_load_tr(global v2i* v2i_inptr, global v8s* v8s_inptr, global v8h* v8h_inptr,
-                           global int* int_inptr, global v4s* v4s_inptr, global v4h* v4h_inptr)
+void amdgcn_global_load_tr(global v2i* v2i_inptr, global v8s* v8s_inptr, global v8h* v8h_inptr, global v8bf16* v8bf16_inptr,
+                           global int* int_inptr, global v4s* v4s_inptr, global v4h* v4h_inptr, global v4bf16* v4bf16_inptr)
 {
-  v2i out_1 = __builtin_amdgcn_global_load_tr_v2i32(v2i_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_v2i32' needs target feature gfx12-insts,wavefrontsize32}}
-  v8s out_2 = __builtin_amdgcn_global_load_tr_v8i16(v8s_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_v8i16' needs target feature gfx12-insts,wavefrontsize32}}
-  v8h out_3 = __builtin_amdgcn_global_load_tr_v8f16(v8h_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_v8f16' needs target feature gfx12-insts,wavefrontsize32}}
-
-  int out_4 = __builtin_amdgcn_global_load_tr_i32(int_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_i32' needs target feature gfx12-insts,wavefrontsize64}}
-  v4s out_5 = __builtin_amdgcn_global_load_tr_v4i16(v4s_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_v4i16' needs target feature gfx12-insts,wavefrontsize64}}
-  v4h out_6 = __builtin_amdgcn_global_load_tr_v4f16(v4h_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_v4f16' needs target feature gfx12-insts,wavefrontsize64}}
+  v2i out_1 = __builtin_amdgcn_global_load_tr_b64_v2i32(v2i_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_b64_v2i32' needs target feature gfx12-insts,wavefrontsize32}}
+  v8s out_2 = __builtin_amdgcn_global_load_tr_b128_v8i16(v8s_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_b128_v8i16' needs target feature gfx12-insts,wavefrontsize32}}
+  v8h out_3 = __builtin_amdgcn_global_load_tr_b128_v8f16(v8h_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_b128_v8f16' needs target feature gfx12-insts,wavefrontsize32}}
+  v8bf16 o4 = __builtin_amdgcn_global_load_tr_b128_v8bf16(v8bf16_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_b128_v8bf16' needs target feature gfx12-insts,wavefrontsize32}}
+
+  int out_5 = __builtin_amdgcn_global_load_tr_b64_i32(int_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_b64_i32' needs target feature gfx12-insts,wavefrontsize64}}
+  v4s out_6 = __builtin_amdgcn_global_load_tr_b128_v4i16(v4s_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_b128_v4i16' needs target feature gfx12-insts,wavefrontsize64}}
+  v4h out_7 = __builtin_amdgcn_global_load_tr_b128_v4f16(v4h_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_b128_v4f16' needs target feature gfx12-insts,wavefrontsize64}}
+  v4bf16 o8 = __builtin_amdgcn_global_load_tr_b128_v4bf16(v4bf16_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_b128_v4bf16' needs target feature gfx12-insts,wavefrontsize64}}
 }
-
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-gfx12-w32-err.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-gfx12-w32-err.cl
@@ -5,11 +5,12 @@
 
 typedef half   v4h   __attribute__((ext_vector_type(4)));
 typedef short  v4s   __attribute__((ext_vector_type(4)));
+typedef __bf16 v4bf16 __attribute__((ext_vector_type(4)));
 
-void amdgcn_global_load_tr(global int* int_inptr, global v4s* v4s_inptr, global v4h* v4h_inptr)
+void amdgcn_global_load_tr(global int* int_inptr, global v4s* v4s_inptr, global v4h* v4h_inptr, global v4bf16* v4bf16_inptr)
 {
-  int out_4 = __builtin_amdgcn_global_load_tr_i32(int_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_i32' needs target feature gfx12-insts,wavefrontsize64}}
-  v4s out_5 = __builtin_amdgcn_global_load_tr_v4i16(v4s_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_v4i16' needs target feature gfx12-insts,wavefrontsize64}}
-  v4h out_6 = __builtin_amdgcn_global_load_tr_v4f16(v4h_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_v4f16' needs target feature gfx12-insts,wavefrontsize64}}
+  int out_1 = __builtin_amdgcn_global_load_tr_b64_i32(int_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_b64_i32' needs target feature gfx12-insts,wavefrontsize64}}
+  v4s out_2 = __builtin_amdgcn_global_load_tr_b128_v4i16(v4s_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_b128_v4i16' needs target feature gfx12-insts,wavefrontsize64}}
+  v4h out_3 = __builtin_amdgcn_global_load_tr_b128_v4f16(v4h_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_b128_v4f16' needs target feature gfx12-insts,wavefrontsize64}}
+  v4bf16 o4 = __builtin_amdgcn_global_load_tr_b128_v4bf16(v4bf16_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_b128_v4bf16' needs target feature gfx12-insts,wavefrontsize64}}
 }
-
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-gfx12-w64-err.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-gfx12-w64-err.cl
@@ -6,11 +6,12 @@
 typedef int    v2i   __attribute__((ext_vector_type(2)));
 typedef half   v8h   __attribute__((ext_vector_type(8)));
 typedef short  v8s   __attribute__((ext_vector_type(8)));
+typedef __bf16 v8bf16 __attribute__((ext_vector_type(8)));
 
-void amdgcn_global_load_tr(global v2i* v2i_inptr, global v8s* v8s_inptr, global v8h* v8h_inptr)
+void amdgcn_global_load_tr(global v2i* v2i_inptr, global v8s* v8s_inptr, global v8h* v8h_inptr, global v8bf16* v8bf16_inptr)
 {
-  v2i out_1 = __builtin_amdgcn_global_load_tr_v2i32(v2i_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_v2i32' needs target feature gfx12-insts,wavefrontsize32}}
-  v8s out_2 = __builtin_amdgcn_global_load_tr_v8i16(v8s_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_v8i16' needs target feature gfx12-insts,wavefrontsize32}}
-  v8h out_3 = __builtin_amdgcn_global_load_tr_v8f16(v8h_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_v8f16' needs target feature gfx12-insts,wavefrontsize32}}
+  v2i out_1 = __builtin_amdgcn_global_load_tr_b64_v2i32(v2i_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_b64_v2i32' needs target feature gfx12-insts,wavefrontsize32}}
+  v8s out_2 = __builtin_amdgcn_global_load_tr_b128_v8i16(v8s_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_b128_v8i16' needs target feature gfx12-insts,wavefrontsize32}}
+  v8h out_3 = __builtin_amdgcn_global_load_tr_b128_v8f16(v8h_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_b128_v8f16' needs target feature gfx12-insts,wavefrontsize32}}
+  v8bf16 o4 = __builtin_amdgcn_global_load_tr_b128_v8bf16(v8bf16_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_b128_v8bf16' needs target feature gfx12-insts,wavefrontsize32}}
 }
-
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-w32.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-w32.cl
@@ -5,44 +5,44 @@
 typedef int    v2i   __attribute__((ext_vector_type(2)));
 typedef half   v8h   __attribute__((ext_vector_type(8)));
 typedef short  v8s   __attribute__((ext_vector_type(8)));
+typedef __bf16 v8bf16 __attribute__((ext_vector_type(8)));
 
-// Wave32
-
-//
-// amdgcn_global_load_tr
-//
-
-// CHECK-GFX1200-LABEL: @test_amdgcn_global_load_tr_v2i32(
+// CHECK-GFX1200-LABEL: @test_amdgcn_global_load_tr_b64_v2i32(
 // CHECK-GFX1200-NEXT:  entry:
 // CHECK-GFX1200-NEXT:    [[TMP0:%.*]] = tail call <2 x i32> @llvm.amdgcn.global.load.tr.v2i32(ptr addrspace(1) [[INPTR:%.*]])
 // CHECK-GFX1200-NEXT:    ret <2 x i32> [[TMP0]]
 //
-v2i test_amdgcn_global_load_tr_v2i32(global v2i* inptr)
+v2i test_amdgcn_global_load_tr_b64_v2i32(global v2i* inptr)
 {
-  return __builtin_amdgcn_global_load_tr_v2i32(inptr);
+  return __builtin_amdgcn_global_load_tr_b64_v2i32(inptr);
 }
 
-//
-// amdgcn_global_load_tr
-//
-
-// CHECK-GFX1200-LABEL: @test_amdgcn_global_load_tr_v8i16(
+// CHECK-GFX1200-LABEL: @test_amdgcn_global_load_tr_b128_v8i16(
 // CHECK-GFX1200-NEXT:  entry:
 // CHECK-GFX1200-NEXT:    [[TMP0:%.*]] = tail call <8 x i16> @llvm.amdgcn.global.load.tr.v8i16(ptr addrspace(1) [[INPTR:%.*]])
 // CHECK-GFX1200-NEXT:    ret <8 x i16> [[TMP0]]
 //
-v8s test_amdgcn_global_load_tr_v8i16(global v8s* inptr)
+v8s test_amdgcn_global_load_tr_b128_v8i16(global v8s* inptr)
 {
-  return __builtin_amdgcn_global_load_tr_v8i16(inptr);
+  return __builtin_amdgcn_global_load_tr_b128_v8i16(inptr);
 }
 
-// CHECK-GFX1200-LABEL: @test_amdgcn_global_load_tr_v8f16(
+// CHECK-GFX1200-LABEL: @test_amdgcn_global_load_tr_b128_v8f16(
 // CHECK-GFX1200-NEXT:  entry:
 // CHECK-GFX1200-NEXT:    [[TMP0:%.*]] = tail call <8 x half> @llvm.amdgcn.global.load.tr.v8f16(ptr addrspace(1) [[INPTR:%.*]])
 // CHECK-GFX1200-NEXT:    ret <8 x half> [[TMP0]]
 //
-v8h test_amdgcn_global_load_tr_v8f16(global v8h* inptr)
+v8h test_amdgcn_global_load_tr_b128_v8f16(global v8h* inptr)
 {
-  return __builtin_amdgcn_global_load_tr_v8f16(inptr);
+  return __builtin_amdgcn_global_load_tr_b128_v8f16(inptr);
 }
 
+// CHECK-GFX1200-LABEL: @test_amdgcn_global_load_tr_b128_v8bf16(
+// CHECK-GFX1200-NEXT:  entry:
+// CHECK-GFX1200-NEXT:    [[TMP0:%.*]] = tail call <8 x bfloat> @llvm.amdgcn.global.load.tr.v8bf16(ptr addrspace(1) [[INPTR:%.*]])
+// CHECK-GFX1200-NEXT:    ret <8 x bfloat> [[TMP0]]
+//
+v8bf16 test_amdgcn_global_load_tr_b128_v8bf16(global v8bf16* inptr)
+{
+  return __builtin_amdgcn_global_load_tr_b128_v8bf16(inptr);
+}
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-w64.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-w64.cl
@@ -4,44 +4,44 @@
 
 typedef half   v4h   __attribute__((ext_vector_type(4)));
 typedef short  v4s   __attribute__((ext_vector_type(4)));
+typedef __bf16 v4bf16 __attribute__((ext_vector_type(4)));
 
-// Wave64
-
-//
-// amdgcn_global_load_tr
-//
-
-// CHECK-GFX1200-LABEL: @test_amdgcn_global_load_tr_i32(
+// CHECK-GFX1200-LABEL: @test_amdgcn_global_load_tr_b64_i32(
 // CHECK-GFX1200-NEXT:  entry:
 // CHECK-GFX1200-NEXT:    [[TMP0:%.*]] = tail call i32 @llvm.amdgcn.global.load.tr.i32(ptr addrspace(1) [[INPTR:%.*]])
 // CHECK-GFX1200-NEXT:    ret i32 [[TMP0]]
 //
-int test_amdgcn_global_load_tr_i32(global int* inptr)
+int test_amdgcn_global_load_tr_b64_i32(global int* inptr)
 {
-  return __builtin_amdgcn_global_load_tr_i32(inptr);
+  return __builtin_amdgcn_global_load_tr_b64_i32(inptr);
 }
 
-//
-// amdgcn_global_load_tr
-//
-
-// CHECK-GFX1200-LABEL: @test_amdgcn_global_load_tr_v4i16(
+// CHECK-GFX1200-LABEL: @test_amdgcn_global_load_tr_b128_v4i16(
 // CHECK-GFX1200-NEXT:  entry:
 // CHECK-GFX1200-NEXT:    [[TMP0:%.*]] = tail call <4 x i16> @llvm.amdgcn.global.load.tr.v4i16(ptr addrspace(1) [[INPTR:%.*]])
 // CHECK-GFX1200-NEXT:    ret <4 x i16> [[TMP0]]
 //
-v4s test_amdgcn_global_load_tr_v4i16(global v4s* inptr)
+v4s test_amdgcn_global_load_tr_b128_v4i16(global v4s* inptr)
 {
-  return __builtin_amdgcn_global_load_tr_v4i16(inptr);
+  return __builtin_amdgcn_global_load_tr_b128_v4i16(inptr);
 }
 
-// CHECK-GFX1200-LABEL: @test_amdgcn_global_load_tr_v4f16(
+// CHECK-GFX1200-LABEL: @test_amdgcn_global_load_tr_b128_v4f16(
 // CHECK-GFX1200-NEXT:  entry:
 // CHECK-GFX1200-NEXT:    [[TMP0:%.*]] = tail call <4 x half> @llvm.amdgcn.global.load.tr.v4f16(ptr addrspace(1) [[INPTR:%.*]])
 // CHECK-GFX1200-NEXT:    ret <4 x half> [[TMP0]]
 //
-v4h test_amdgcn_global_load_tr_v4f16(global v4h* inptr)
+v4h test_amdgcn_global_load_tr_b128_v4f16(global v4h* inptr)
 {
-  return __builtin_amdgcn_global_load_tr_v4f16(inptr);
+  return __builtin_amdgcn_global_load_tr_b128_v4f16(inptr);
 }
 
+// CHECK-GFX1200-LABEL: @test_amdgcn_global_load_tr_b128_v4bf16(
+// CHECK-GFX1200-NEXT:  entry:
+// CHECK-GFX1200-NEXT:    [[TMP0:%.*]] = tail call <4 x bfloat> @llvm.amdgcn.global.load.tr.v4bf16(ptr addrspace(1) [[INPTR:%.*]])
+// CHECK-GFX1200-NEXT:    ret <4 x bfloat> [[TMP0]]
+//
+v4bf16 test_amdgcn_global_load_tr_b128_v4bf16(global v4bf16* inptr)
+{
+  return __builtin_amdgcn_global_load_tr_b128_v4bf16(inptr);
+}