llvm · changpeng · Mar 22, 2024 · Mar 21, 2024 · shiltian · Mar 21, 2024
diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def
@@ -432,13 +432,15 @@ TARGET_BUILTIN(__builtin_amdgcn_s_wakeup_barrier, "vi", "n", "gfx12-insts")
 TARGET_BUILTIN(__builtin_amdgcn_s_barrier_leave, "b", "n", "gfx12-insts")
 TARGET_BUILTIN(__builtin_amdgcn_s_get_barrier_state, "Uii", "n", "gfx12-insts")
 
-TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_v2i32, "V2iV2i*1", "nc", "gfx12-insts,wavefrontsize32")
-TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_v8i16, "V8sV8s*1", "nc", "gfx12-insts,wavefrontsize32")
-TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_v8f16, "V8hV8h*1", "nc", "gfx12-insts,wavefrontsize32")
-
-TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_i32, "ii*1", "nc", "gfx12-insts,wavefrontsize64")
-TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_v4i16, "V4sV4s*1", "nc", "gfx12-insts,wavefrontsize64")
-TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_v4f16, "V4hV4h*1", "nc", "gfx12-insts,wavefrontsize64")
+TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_b64_v2i32, "V2iV2i*1", "nc", "gfx12-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_b128_v8i16, "V8sV8s*1", "nc", "gfx12-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_b128_v8f16, "V8hV8h*1", "nc", "gfx12-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_b128_v8bf16, "V8yV8y*1", "nc", "gfx12-insts,wavefrontsize32")
+
+TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_b64_i32, "ii*1", "nc", "gfx12-insts,wavefrontsize64")
+TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_b128_v4i16, "V4sV4s*1", "nc", "gfx12-insts,wavefrontsize64")
+TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_b128_v4f16, "V4hV4h*1", "nc", "gfx12-insts,wavefrontsize64")
+TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_b128_v4bf16, "V4yV4y*1", "nc", "gfx12-insts,wavefrontsize64")
 
 //===----------------------------------------------------------------------===//
 // WMMA builtins.

diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -18531,35 +18531,45 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
     llvm::Function *F = CGM.getIntrinsic(IID, {ArgTy});
     return Builder.CreateCall(F, {Addr, Val, ZeroI32, ZeroI32, ZeroI1});
   }
-  case AMDGPU::BI__builtin_amdgcn_global_load_tr_i32:
-  case AMDGPU::BI__builtin_amdgcn_global_load_tr_v2i32:
-  case AMDGPU::BI__builtin_amdgcn_global_load_tr_v4f16:
-  case AMDGPU::BI__builtin_amdgcn_global_load_tr_v4i16:
-  case AMDGPU::BI__builtin_amdgcn_global_load_tr_v8f16:
-  case AMDGPU::BI__builtin_amdgcn_global_load_tr_v8i16: {
+  case AMDGPU::BI__builtin_amdgcn_global_load_tr_b64_i32:
+  case AMDGPU::BI__builtin_amdgcn_global_load_tr_b64_v2i32:
+  case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v4bf16:
+  case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v4f16:
+  case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v4i16:
+  case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8bf16:
+  case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8f16:
+  case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8i16: {
 
     llvm::Type *ArgTy;
     switch (BuiltinID) {
-    case AMDGPU::BI__builtin_amdgcn_global_load_tr_i32:
+    case AMDGPU::BI__builtin_amdgcn_global_load_tr_b64_i32:
       ArgTy = llvm::Type::getInt32Ty(getLLVMContext());
       break;
-    case AMDGPU::BI__builtin_amdgcn_global_load_tr_v2i32:
+    case AMDGPU::BI__builtin_amdgcn_global_load_tr_b64_v2i32:
       ArgTy = llvm::FixedVectorType::get(
           llvm::Type::getInt32Ty(getLLVMContext()), 2);
       break;
-    case AMDGPU::BI__builtin_amdgcn_global_load_tr_v4f16:
+    case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v4bf16:
+      ArgTy = llvm::FixedVectorType::get(
+          llvm::Type::getBFloatTy(getLLVMContext()), 4);
+      break;
+    case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v4f16:
       ArgTy = llvm::FixedVectorType::get(
           llvm::Type::getHalfTy(getLLVMContext()), 4);
       break;
-    case AMDGPU::BI__builtin_amdgcn_global_load_tr_v4i16:
+    case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v4i16:
       ArgTy = llvm::FixedVectorType::get(
           llvm::Type::getInt16Ty(getLLVMContext()), 4);
       break;
-    case AMDGPU::BI__builtin_amdgcn_global_load_tr_v8f16:
+    case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8bf16:
+      ArgTy = llvm::FixedVectorType::get(
+          llvm::Type::getBFloatTy(getLLVMContext()), 8);
+      break;
+    case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8f16:
       ArgTy = llvm::FixedVectorType::get(
           llvm::Type::getHalfTy(getLLVMContext()), 8);
       break;
-    case AMDGPU::BI__builtin_amdgcn_global_load_tr_v8i16:
+    case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8i16:
       ArgTy = llvm::FixedVectorType::get(
           llvm::Type::getInt16Ty(getLLVMContext()), 8);
       break;

diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-gfx11-err.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-gfx11-err.cl
@@ -6,21 +6,22 @@
 typedef int    v2i   __attribute__((ext_vector_type(2)));
 typedef half   v8h   __attribute__((ext_vector_type(8)));
 typedef short  v8s   __attribute__((ext_vector_type(8)));
+typedef __bf16 v8bf16 __attribute__((ext_vector_type(8)));
 
 typedef half   v4h   __attribute__((ext_vector_type(4)));
 typedef short  v4s   __attribute__((ext_vector_type(4)));
+typedef __bf16 v4bf16 __attribute__((ext_vector_type(4)));
 
-
-
-void amdgcn_global_load_tr(global v2i* v2i_inptr, global v8s* v8s_inptr, global v8h* v8h_inptr,
-                           global int* int_inptr, global v4s* v4s_inptr, global v4h* v4h_inptr)
+void amdgcn_global_load_tr(global v2i* v2i_inptr, global v8s* v8s_inptr, global v8h* v8h_inptr, global v8bf16* v8bf16_inptr,
+                           global int* int_inptr, global v4s* v4s_inptr, global v4h* v4h_inptr, global v4bf16* v4bf16_inptr)
 {
-  v2i out_1 = __builtin_amdgcn_global_load_tr_v2i32(v2i_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_v2i32' needs target feature gfx12-insts,wavefrontsize32}}
-  v8s out_2 = __builtin_amdgcn_global_load_tr_v8i16(v8s_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_v8i16' needs target feature gfx12-insts,wavefrontsize32}}
-  v8h out_3 = __builtin_amdgcn_global_load_tr_v8f16(v8h_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_v8f16' needs target feature gfx12-insts,wavefrontsize32}}
-
-  int out_4 = __builtin_amdgcn_global_load_tr_i32(int_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_i32' needs target feature gfx12-insts,wavefrontsize64}}
-  v4s out_5 = __builtin_amdgcn_global_load_tr_v4i16(v4s_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_v4i16' needs target feature gfx12-insts,wavefrontsize64}}
-  v4h out_6 = __builtin_amdgcn_global_load_tr_v4f16(v4h_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_v4f16' needs target feature gfx12-insts,wavefrontsize64}}
+  v2i out_1 = __builtin_amdgcn_global_load_tr_b64_v2i32(v2i_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_b64_v2i32' needs target feature gfx12-insts,wavefrontsize32}}
+  v8s out_2 = __builtin_amdgcn_global_load_tr_b128_v8i16(v8s_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_b128_v8i16' needs target feature gfx12-insts,wavefrontsize32}}
+  v8h out_3 = __builtin_amdgcn_global_load_tr_b128_v8f16(v8h_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_b128_v8f16' needs target feature gfx12-insts,wavefrontsize32}}
+  v8bf16 o4 = __builtin_amdgcn_global_load_tr_b128_v8bf16(v8bf16_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_b128_v8bf16' needs target feature gfx12-insts,wavefrontsize32}}
+
+  int out_5 = __builtin_amdgcn_global_load_tr_b64_i32(int_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_b64_i32' needs target feature gfx12-insts,wavefrontsize64}}
+  v4s out_6 = __builtin_amdgcn_global_load_tr_b128_v4i16(v4s_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_b128_v4i16' needs target feature gfx12-insts,wavefrontsize64}}
+  v4h out_7 = __builtin_amdgcn_global_load_tr_b128_v4f16(v4h_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_b128_v4f16' needs target feature gfx12-insts,wavefrontsize64}}
+  v4bf16 o8 = __builtin_amdgcn_global_load_tr_b128_v4bf16(v4bf16_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_b128_v4bf16' needs target feature gfx12-insts,wavefrontsize64}}
 }
-
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-gfx12-w32-err.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-gfx12-w32-err.cl
@@ -5,11 +5,12 @@
 
 typedef half   v4h   __attribute__((ext_vector_type(4)));
 typedef short  v4s   __attribute__((ext_vector_type(4)));
+typedef __bf16 v4bf16 __attribute__((ext_vector_type(4)));
 
-void amdgcn_global_load_tr(global int* int_inptr, global v4s* v4s_inptr, global v4h* v4h_inptr)
+void amdgcn_global_load_tr(global int* int_inptr, global v4s* v4s_inptr, global v4h* v4h_inptr, global v4bf16* v4bf16_inptr)
 {
-  int out_4 = __builtin_amdgcn_global_load_tr_i32(int_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_i32' needs target feature gfx12-insts,wavefrontsize64}}
-  v4s out_5 = __builtin_amdgcn_global_load_tr_v4i16(v4s_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_v4i16' needs target feature gfx12-insts,wavefrontsize64}}
-  v4h out_6 = __builtin_amdgcn_global_load_tr_v4f16(v4h_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_v4f16' needs target feature gfx12-insts,wavefrontsize64}}
+  int out_1 = __builtin_amdgcn_global_load_tr_b64_i32(int_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_b64_i32' needs target feature gfx12-insts,wavefrontsize64}}
+  v4s out_2 = __builtin_amdgcn_global_load_tr_b128_v4i16(v4s_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_b128_v4i16' needs target feature gfx12-insts,wavefrontsize64}}
+  v4h out_3 = __builtin_amdgcn_global_load_tr_b128_v4f16(v4h_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_b128_v4f16' needs target feature gfx12-insts,wavefrontsize64}}
+  v4bf16 o4 = __builtin_amdgcn_global_load_tr_b128_v4bf16(v4bf16_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_b128_v4bf16' needs target feature gfx12-insts,wavefrontsize64}}
 }
-
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-gfx12-w64-err.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-gfx12-w64-err.cl
@@ -6,11 +6,12 @@
 typedef int    v2i   __attribute__((ext_vector_type(2)));
 typedef half   v8h   __attribute__((ext_vector_type(8)));
 typedef short  v8s   __attribute__((ext_vector_type(8)));
+typedef __bf16 v8bf16 __attribute__((ext_vector_type(8)));
 
-void amdgcn_global_load_tr(global v2i* v2i_inptr, global v8s* v8s_inptr, global v8h* v8h_inptr)
+void amdgcn_global_load_tr(global v2i* v2i_inptr, global v8s* v8s_inptr, global v8h* v8h_inptr, global v8bf16* v8bf16_inptr)
 {
-  v2i out_1 = __builtin_amdgcn_global_load_tr_v2i32(v2i_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_v2i32' needs target feature gfx12-insts,wavefrontsize32}}
-  v8s out_2 = __builtin_amdgcn_global_load_tr_v8i16(v8s_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_v8i16' needs target feature gfx12-insts,wavefrontsize32}}
-  v8h out_3 = __builtin_amdgcn_global_load_tr_v8f16(v8h_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_v8f16' needs target feature gfx12-insts,wavefrontsize32}}
+  v2i out_1 = __builtin_amdgcn_global_load_tr_b64_v2i32(v2i_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_b64_v2i32' needs target feature gfx12-insts,wavefrontsize32}}
+  v8s out_2 = __builtin_amdgcn_global_load_tr_b128_v8i16(v8s_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_b128_v8i16' needs target feature gfx12-insts,wavefrontsize32}}
+  v8h out_3 = __builtin_amdgcn_global_load_tr_b128_v8f16(v8h_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_b128_v8f16' needs target feature gfx12-insts,wavefrontsize32}}
+  v8bf16 o4 = __builtin_amdgcn_global_load_tr_b128_v8bf16(v8bf16_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_b128_v8bf16' needs target feature gfx12-insts,wavefrontsize32}}
 }
-
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-w32.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-w32.cl
@@ -5,44 +5,44 @@
 typedef int    v2i   __attribute__((ext_vector_type(2)));
 typedef half   v8h   __attribute__((ext_vector_type(8)));
 typedef short  v8s   __attribute__((ext_vector_type(8)));
+typedef __bf16 v8bf16 __attribute__((ext_vector_type(8)));
 
-// Wave32
-
-//
-// amdgcn_global_load_tr
-//
-
-// CHECK-GFX1200-LABEL: @test_amdgcn_global_load_tr_v2i32(
+// CHECK-GFX1200-LABEL: @test_amdgcn_global_load_tr_b64_v2i32(
 // CHECK-GFX1200-NEXT:  entry:
 // CHECK-GFX1200-NEXT:    [[TMP0:%.*]] = tail call <2 x i32> @llvm.amdgcn.global.load.tr.v2i32(ptr addrspace(1) [[INPTR:%.*]])
 // CHECK-GFX1200-NEXT:    ret <2 x i32> [[TMP0]]
 //
-v2i test_amdgcn_global_load_tr_v2i32(global v2i* inptr)
+v2i test_amdgcn_global_load_tr_b64_v2i32(global v2i* inptr)
 {
-  return __builtin_amdgcn_global_load_tr_v2i32(inptr);
+  return __builtin_amdgcn_global_load_tr_b64_v2i32(inptr);
 }
 
-//
-// amdgcn_global_load_tr
-//
-
-// CHECK-GFX1200-LABEL: @test_amdgcn_global_load_tr_v8i16(
+// CHECK-GFX1200-LABEL: @test_amdgcn_global_load_tr_b128_v8i16(
 // CHECK-GFX1200-NEXT:  entry:
 // CHECK-GFX1200-NEXT:    [[TMP0:%.*]] = tail call <8 x i16> @llvm.amdgcn.global.load.tr.v8i16(ptr addrspace(1) [[INPTR:%.*]])
 // CHECK-GFX1200-NEXT:    ret <8 x i16> [[TMP0]]
 //
-v8s test_amdgcn_global_load_tr_v8i16(global v8s* inptr)
+v8s test_amdgcn_global_load_tr_b128_v8i16(global v8s* inptr)
 {
-  return __builtin_amdgcn_global_load_tr_v8i16(inptr);
+  return __builtin_amdgcn_global_load_tr_b128_v8i16(inptr);
 }
 
-// CHECK-GFX1200-LABEL: @test_amdgcn_global_load_tr_v8f16(
+// CHECK-GFX1200-LABEL: @test_amdgcn_global_load_tr_b128_v8f16(
 // CHECK-GFX1200-NEXT:  entry:
 // CHECK-GFX1200-NEXT:    [[TMP0:%.*]] = tail call <8 x half> @llvm.amdgcn.global.load.tr.v8f16(ptr addrspace(1) [[INPTR:%.*]])
 // CHECK-GFX1200-NEXT:    ret <8 x half> [[TMP0]]
 //
-v8h test_amdgcn_global_load_tr_v8f16(global v8h* inptr)
+v8h test_amdgcn_global_load_tr_b128_v8f16(global v8h* inptr)
 {
-  return __builtin_amdgcn_global_load_tr_v8f16(inptr);
+  return __builtin_amdgcn_global_load_tr_b128_v8f16(inptr);
 }
 
+// CHECK-GFX1200-LABEL: @test_amdgcn_global_load_tr_b128_v8bf16(
+// CHECK-GFX1200-NEXT:  entry:
+// CHECK-GFX1200-NEXT:    [[TMP0:%.*]] = tail call <8 x bfloat> @llvm.amdgcn.global.load.tr.v8bf16(ptr addrspace(1) [[INPTR:%.*]])
+// CHECK-GFX1200-NEXT:    ret <8 x bfloat> [[TMP0]]
+//
+v8bf16 test_amdgcn_global_load_tr_b128_v8bf16(global v8bf16* inptr)
+{
+  return __builtin_amdgcn_global_load_tr_b128_v8bf16(inptr);
+}
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-w64.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-w64.cl
@@ -4,44 +4,44 @@
 
 typedef half   v4h   __attribute__((ext_vector_type(4)));
 typedef short  v4s   __attribute__((ext_vector_type(4)));
+typedef __bf16 v4bf16 __attribute__((ext_vector_type(4)));
 
-// Wave64
-
-//
-// amdgcn_global_load_tr
-//
-
-// CHECK-GFX1200-LABEL: @test_amdgcn_global_load_tr_i32(
+// CHECK-GFX1200-LABEL: @test_amdgcn_global_load_tr_b64_i32(
 // CHECK-GFX1200-NEXT:  entry:
 // CHECK-GFX1200-NEXT:    [[TMP0:%.*]] = tail call i32 @llvm.amdgcn.global.load.tr.i32(ptr addrspace(1) [[INPTR:%.*]])
 // CHECK-GFX1200-NEXT:    ret i32 [[TMP0]]
 //
-int test_amdgcn_global_load_tr_i32(global int* inptr)
+int test_amdgcn_global_load_tr_b64_i32(global int* inptr)
 {
-  return __builtin_amdgcn_global_load_tr_i32(inptr);
+  return __builtin_amdgcn_global_load_tr_b64_i32(inptr);
 }
 
-//
-// amdgcn_global_load_tr
-//
-
-// CHECK-GFX1200-LABEL: @test_amdgcn_global_load_tr_v4i16(
+// CHECK-GFX1200-LABEL: @test_amdgcn_global_load_tr_b128_v4i16(
 // CHECK-GFX1200-NEXT:  entry:
 // CHECK-GFX1200-NEXT:    [[TMP0:%.*]] = tail call <4 x i16> @llvm.amdgcn.global.load.tr.v4i16(ptr addrspace(1) [[INPTR:%.*]])
 // CHECK-GFX1200-NEXT:    ret <4 x i16> [[TMP0]]
 //
-v4s test_amdgcn_global_load_tr_v4i16(global v4s* inptr)
+v4s test_amdgcn_global_load_tr_b128_v4i16(global v4s* inptr)
 {
-  return __builtin_amdgcn_global_load_tr_v4i16(inptr);
+  return __builtin_amdgcn_global_load_tr_b128_v4i16(inptr);
 }
 
-// CHECK-GFX1200-LABEL: @test_amdgcn_global_load_tr_v4f16(
+// CHECK-GFX1200-LABEL: @test_amdgcn_global_load_tr_b128_v4f16(
 // CHECK-GFX1200-NEXT:  entry:
 // CHECK-GFX1200-NEXT:    [[TMP0:%.*]] = tail call <4 x half> @llvm.amdgcn.global.load.tr.v4f16(ptr addrspace(1) [[INPTR:%.*]])
 // CHECK-GFX1200-NEXT:    ret <4 x half> [[TMP0]]
 //
-v4h test_amdgcn_global_load_tr_v4f16(global v4h* inptr)
+v4h test_amdgcn_global_load_tr_b128_v4f16(global v4h* inptr)
 {
-  return __builtin_amdgcn_global_load_tr_v4f16(inptr);
+  return __builtin_amdgcn_global_load_tr_b128_v4f16(inptr);
 }
 
+// CHECK-GFX1200-LABEL: @test_amdgcn_global_load_tr_b128_v4bf16(
+// CHECK-GFX1200-NEXT:  entry:
+// CHECK-GFX1200-NEXT:    [[TMP0:%.*]] = tail call <4 x bfloat> @llvm.amdgcn.global.load.tr.v4bf16(ptr addrspace(1) [[INPTR:%.*]])
+// CHECK-GFX1200-NEXT:    ret <4 x bfloat> [[TMP0]]
+//
+v4bf16 test_amdgcn_global_load_tr_b128_v4bf16(global v4bf16* inptr)
+{
+  return __builtin_amdgcn_global_load_tr_b128_v4bf16(inptr);
+}