llvm
diff --git a/‎builtin.diff
Lines changed: 84 additions & 0 deletions b/‎builtin.diff
Lines changed: 84 additions & 0 deletions
diff --git a/‎clang/test/CodeGenOpenCL/builtins-amdgcn-gfx13-load-mcast.cl
Lines changed: 14 additions & 0 deletions b/‎clang/test/CodeGenOpenCL/builtins-amdgcn-gfx13-load-mcast.cl
Lines changed: 14 additions & 0 deletions
diff --git a/‎clang/test/CodeGenOpenCL/builtins-amdgcn.cl
Lines changed: 2 additions & 1 deletion b/‎clang/test/CodeGenOpenCL/builtins-amdgcn.cl
Lines changed: 2 additions & 1 deletion
diff --git a/‎llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
Lines changed: 82 additions & 48 deletions b/‎llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
Lines changed: 82 additions & 48 deletions
diff --git a/‎llvm/test/CodeGen/AMDGPU/amdgpu-attributor-no-agpr.ll
Lines changed: 8 additions & 8 deletions b/‎llvm/test/CodeGen/AMDGPU/amdgpu-attributor-no-agpr.ll
Lines changed: 8 additions & 8 deletions
diff --git a/‎llvm/test/CodeGen/AMDGPU/annotate-existing-abi-attributes.ll
Lines changed: 10 additions & 10 deletions b/‎llvm/test/CodeGen/AMDGPU/annotate-existing-abi-attributes.ll
Lines changed: 10 additions & 10 deletions
@@ -0,0 +1,84 @@
+diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def
+index 39fef9e4601f..55befac12b46 100644
+--- a/clang/include/clang/Basic/BuiltinsAMDGPU.def
++++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def
+@@ -635,5 +635,7 @@ TARGET_BUILTIN(__builtin_amdgcn_bitop3_b16, "ssssIUi", "nc", "bitop3-insts")
+ TARGET_BUILTIN(__builtin_amdgcn_cvt_sr_bf16_f32, "V2yV2yfUiIb", "nc", "f32-to-f16bf16-cvt-sr-insts")
+ TARGET_BUILTIN(__builtin_amdgcn_cvt_sr_f16_f32, "V2hV2hfUiIb", "nc", "f32-to-f16bf16-cvt-sr-insts")
+
++TARGET_BUILTIN(__builtin_amdgcn_load_mcast_b32, "vi*3i*Iii", "", "gfx950-insts")
++
+ #undef BUILTIN
+ #undef TARGET_BUILTIN
+diff --git a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp
+index ad012d98635f..601319b2c225 100644
+--- a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp
++++ b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp
+@@ -1083,6 +1083,20 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
+
+     return Builder.CreateBitCast(RMW, OrigTy);
+   }
++  case AMDGPU::BI__builtin_amdgcn_load_mcast_b32: {
++    Intrinsic::ID IID;
++    switch (BuiltinID) {
++    case AMDGPU::BI__builtin_amdgcn_load_mcast_b32:
++      IID = Intrinsic::amdgcn_load_mcast_b32;
++      break;
++    }
++    SmallVector<Value *, 3> Args;
++    auto *SrcType = ConvertType(E->getArg(1)->getType());
++    for (int i = 0, e = E->getNumArgs(); i != e; ++i)
++      Args.push_back(EmitScalarExpr(E->getArg(i)));
++    llvm::Function *F = CGM.getIntrinsic(IID, {SrcType});
++    return Builder.CreateCall(F, {Args});
++  }
+   case AMDGPU::BI__builtin_amdgcn_s_sendmsg_rtn:
+   case AMDGPU::BI__builtin_amdgcn_s_sendmsg_rtnl: {
+     llvm::Value *Arg = EmitScalarExpr(E->getArg(0));
+diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx13-load-mcast.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx13-load-mcast.cl
+new file mode 100644
+index 000000000000..f6b058cb3fb5
+--- /dev/null
++++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx13-load-mcast.cl
+@@ -0,0 +1,14 @@
++// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
++// REQUIRES: amdgpu-registered-target
++// RUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-unknown-unknown -target-cpu gfx950 -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-GFX13
++
++void test_amdgcn_load_mcast_b32(local void* outptr, global void* inptr, int mask)
++{
++  __builtin_amdgcn_load_mcast_b32(outptr, inptr, 10, mask);
++}
++
++//void test_amdgcn_load_mcast_local_b32(local void* outptr, local void* inptr, int mask)
++//{
++//  __builtin_amdgcn_load_mcast_b32(outptr, inptr, 10, mask);
++//}
++
+diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+index a57eb4a6dba4..a765ef8c3a42 100644
+--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
++++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+@@ -3499,6 +3499,22 @@ def int_amdgcn_fdiv_fast : DefaultAttrsIntrinsic<
+   [IntrNoMem, IntrSpeculatable]
+ >;
+
++class AMDGPULoadMCast:
++  Intrinsic<
++    [],
++    [local_ptr_ty,      // laneshared destination pointer
++     llvm_anyptr_ty,    // Global, LDS, or DDS pointer to load from
++     llvm_i32_ty,       // gfx12+ cachepolicy:
++                        //   bits [0-2] = th
++                        //   bits [3-4] = scope
++     llvm_i32_ty],      // workgroup broadcast mask (in M0)
++    [IntrArgMemOnly, ReadOnly<ArgIndex<1>>, IntrConvergent, NoCapture<ArgIndex<0>>,
++     NoCapture<ArgIndex<1>>, ImmArg<ArgIndex<2>>],
++    "", [SDNPMemOperand]
++  >;
++
++def int_amdgcn_load_mcast_b32         : AMDGPULoadMCast;
++
+ /// Emit an addrspacecast without null pointer checking.
+ /// Should only be inserted by a pass based on analysis of an addrspacecast's src.
+ def int_amdgcn_addrspacecast_nonnull : DefaultAttrsIntrinsic<
@@ -0,0 +1,14 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
+// REQUIRES: amdgpu-registered-target
+// RUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-unknown-unknown -target-cpu gfx950 -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-GFX13
+
+void test_amdgcn_load_mcast_b32(local int* outptr, global int* inptr, int mask)
+{
+  __builtin_amdgcn_load_mcast_b32(outptr, inptr, 10, mask);
+}
+
+//void test_amdgcn_load_mcast_local_b32(local void* outptr, local void* inptr, int mask)
+//{
+//  __builtin_amdgcn_load_mcast_b32(outptr, inptr, 10, mask);
+//}
+
@@ -903,4 +903,5 @@ void test_set_fpenv(unsigned long env) {
 
 // CHECK-DAG: [[$GRID_RANGE]] = !{i32 1, i32 0}
 // CHECK-DAG: [[$WS_RANGE]] = !{i16 1, i16 1025}
-// CHECK-DAG: attributes #[[$NOUNWIND_READONLY]] = { convergent mustprogress nocallback nofree nounwind willreturn memory(none) }
+// CHECK-SPIRV-DAG: attributes #[[$NOUNWIND_READONLY]] = { convergent mustprogress nocallback nofree nounwind willreturn memory(none) }
+// CHECK-AMDGCN-DAG: attributes #[[$NOUNWIND_READONLY]] = { convergent mustprogress nocallback nofree nounwind willreturn memory(none) "amdgpu-waves-per-eu"="4,10" }
@@ -1117,47 +1117,25 @@ struct AAAMDWavesPerEU : public AAAMDSizeRangeAttribute {
     Function *F = getAssociatedFunction();
     auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());
 
-    auto TakeRange = [&](std::pair<unsigned, unsigned> R) {
-      auto [Min, Max] = R;
-      ConstantRange Range(APInt(32, Min), APInt(32, Max + 1));
-      IntegerRangeState RangeState(Range);
-      clampStateAndIndicateChange(this->getState(), RangeState);
-      indicateOptimisticFixpoint();
-    };
-
-    std::pair<unsigned, unsigned> MaxWavesPerEURange{
-        1U, InfoCache.getMaxWavesPerEU(*F)};
-
     // If the attribute exists, we will honor it if it is not the default.
     if (auto Attr = InfoCache.getWavesPerEUAttr(*F)) {
+      std::pair<unsigned, unsigned> MaxWavesPerEURange{
+          1U, InfoCache.getMaxWavesPerEU(*F)};
       if (*Attr != MaxWavesPerEURange) {
-        TakeRange(*Attr);
+        auto [Min, Max] = *Attr;
+        ConstantRange Range(APInt(32, Min), APInt(32, Max + 1));
+        IntegerRangeState RangeState(Range);
+        this->getState() = RangeState;
+        indicateOptimisticFixpoint();
         return;
       }
     }
 
-    // Unlike AAAMDFlatWorkGroupSize, it's getting trickier here. Since the
-    // calculation of waves per EU involves flat work group size, we can't
-    // simply use an assumed flat work group size as a start point, because the
-    // update of flat work group size is in an inverse direction of waves per
-    // EU. However, we can still do something if it is an entry function. Since
-    // an entry function is a terminal node, and flat work group size either
-    // from attribute or default will be used anyway, we can take that value and
-    // calculate the waves per EU based on it. This result can't be updated by
-    // no means, but that could still allow us to propagate it.
-    if (AMDGPU::isEntryFunctionCC(F->getCallingConv())) {
-      std::pair<unsigned, unsigned> FlatWorkGroupSize;
-      if (auto Attr = InfoCache.getFlatWorkGroupSizeAttr(*F))
-        FlatWorkGroupSize = *Attr;
-      else
-        FlatWorkGroupSize = InfoCache.getDefaultFlatWorkGroupSize(*F);
-      TakeRange(InfoCache.getEffectiveWavesPerEU(*F, MaxWavesPerEURange,
-                                                 FlatWorkGroupSize));
-    }
+    if (AMDGPU::isEntryFunctionCC(F->getCallingConv()))
+      indicatePessimisticFixpoint();
   }
 
   ChangeStatus updateImpl(Attributor &A) override {
-    auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());
     ChangeStatus Change = ChangeStatus::UNCHANGED;
 
     auto CheckCallSite = [&](AbstractCallSite CS) {
@@ -1166,24 +1144,21 @@ struct AAAMDWavesPerEU : public AAAMDSizeRangeAttribute {
       LLVM_DEBUG(dbgs() << '[' << getName() << "] Call " << Caller->getName()
                         << "->" << Func->getName() << '\n');
 
-      const auto *CallerInfo = A.getAAFor<AAAMDWavesPerEU>(
+      const auto *CallerAA = A.getAAFor<AAAMDWavesPerEU>(
           *this, IRPosition::function(*Caller), DepClassTy::REQUIRED);
-      const auto *AssumedGroupSize = A.getAAFor<AAAMDFlatWorkGroupSize>(
-          *this, IRPosition::function(*Func), DepClassTy::REQUIRED);
-      if (!CallerInfo || !AssumedGroupSize || !CallerInfo->isValidState() ||
-          !AssumedGroupSize->isValidState())
+      if (!CallerAA || !CallerAA->isValidState())
         return false;
 
-      unsigned Min, Max;
-      std::tie(Min, Max) = InfoCache.getEffectiveWavesPerEU(
-          *Caller,
-          {CallerInfo->getAssumed().getLower().getZExtValue(),
-           CallerInfo->getAssumed().getUpper().getZExtValue() - 1},
-          {AssumedGroupSize->getAssumed().getLower().getZExtValue(),
-           AssumedGroupSize->getAssumed().getUpper().getZExtValue() - 1});
-      ConstantRange CallerRange(APInt(32, Min), APInt(32, Max + 1));
-      IntegerRangeState CallerRangeState(CallerRange);
-      Change |= clampStateAndIndicateChange(this->getState(), CallerRangeState);
+      auto Assumed = this->getAssumed();
+      unsigned Min = std::max(Assumed.getLower().getZExtValue(),
+                              CallerAA->getAssumed().getLower().getZExtValue());
+      unsigned Max = std::max(Assumed.getUpper().getZExtValue(),
+                              CallerAA->getAssumed().getUpper().getZExtValue());
+      ConstantRange Range(APInt(32, Min), APInt(32, Max));
+      IntegerRangeState RangeState(Range);
+      this->getState() = RangeState;
+      Change |= this->getState() == Assumed ? ChangeStatus::UNCHANGED
+                                            : ChangeStatus::CHANGED;
 
       return true;
     };
@@ -1342,6 +1317,59 @@ static void addPreloadKernArgHint(Function &F, TargetMachine &TM) {
   }
 }
 
+static void checkWavesPerEU(Module &M, TargetMachine &TM) {
+  for (Function &F : M) {
+    const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
+
+    auto FlatWgrpSizeAttr =
+        AMDGPU::getIntegerPairAttribute(F, "amdgpu-flat-work-group-size");
+    auto WavesPerEUAttr = AMDGPU::getIntegerPairAttribute(
+        F, "amdgpu-waves-per-eu", /*OnlyFirstRequired=*/true);
+
+    unsigned MinWavesPerEU = ST.getMinWavesPerEU();
+    unsigned MaxWavesPerEU = ST.getMaxWavesPerEU();
+
+    unsigned MinFlatWgrpSize = 1U;
+    unsigned MaxFlatWgrpSize = 1024U;
+    if (FlatWgrpSizeAttr.has_value()) {
+      MinFlatWgrpSize = FlatWgrpSizeAttr->first;
+      MaxFlatWgrpSize = *(FlatWgrpSizeAttr->second);
+    }
+
+    // Start with the max range.
+    unsigned Min = MinWavesPerEU;
+    unsigned Max = MaxWavesPerEU;
+
+    // If the attribute exists, set them to the value from the attribute.
+    if (WavesPerEUAttr.has_value()) {
+      Min = WavesPerEUAttr->first;
+      if (WavesPerEUAttr->second.has_value())
+        Max = *(WavesPerEUAttr->second);
+    }
+
+    // Compute the range from flat workgroup size.
+    auto [MinFromFlatWgrpSize, MaxFromFlatWgrpSize] =
+        ST.getWavesPerEU(F, std::make_pair(MinFlatWgrpSize, MaxFlatWgrpSize));
+
+    // For the lower bound, we have to "tighten" it.
+    Min = std::max(Min, MinFromFlatWgrpSize);
+    // For the upper bound, we have to "extend" it.
+    Max = std::max(Max, MaxFromFlatWgrpSize);
+
+    // Clamp the range to the max range.
+    Min = std::max(Min, MinWavesPerEU);
+    Max = std::min(Max, MaxWavesPerEU);
+
+    // Update the attribute if it is not the max.
+    if (Min != MinWavesPerEU || Max != MaxWavesPerEU) {
+      SmallString<10> Buffer;
+      raw_svector_ostream OS(Buffer);
+      OS << Min << ',' << Max;
+      F.addFnAttr("amdgpu-waves-per-eu", OS.str());
+    }
+  }
+}
+
 static bool runImpl(Module &M, AnalysisGetter &AG, TargetMachine &TM,
                     AMDGPUAttributorOptions Options,
                     ThinOrFullLTOPhase LTOPhase) {
@@ -1417,8 +1445,14 @@ static bool runImpl(Module &M, AnalysisGetter &AG, TargetMachine &TM,
     }
   }
 
-  ChangeStatus Change = A.run();
-  return Change == ChangeStatus::CHANGED;
+  bool Changed = A.run() == ChangeStatus::CHANGED;
+
+  if (Changed && (LTOPhase == ThinOrFullLTOPhase::None ||
+                  LTOPhase == ThinOrFullLTOPhase::FullLTOPostLink ||
+                  LTOPhase == ThinOrFullLTOPhase::ThinLTOPostLink))
+    checkWavesPerEU(M, TM);
+
+  return Changed;
 }
 
 class AMDGPUAttributorLegacy : public ModulePass {
 
@@ -252,13 +252,13 @@ define amdgpu_kernel void @indirect_calls_none_agpr(i1 %cond) {
 }
 
 
-attributes #0 = { "amdgpu-agpr-alloc"="0" }
+attributes #0 = { "amdgpu-no-agpr" }
 ;.
-; CHECK: attributes #[[ATTR0]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx90a" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR1]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx90a" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR2]] = { "target-cpu"="gfx90a" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR3:[0-9]+]] = { convergent nocallback nofree nosync nounwind willreturn memory(none) "target-cpu"="gfx90a" }
-; CHECK: attributes #[[ATTR4:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) "target-cpu"="gfx90a" }
-; CHECK: attributes #[[ATTR5:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: readwrite) "target-cpu"="gfx90a" }
-; CHECK: attributes #[[ATTR6]] = { "amdgpu-agpr-alloc"="0" }
+; CHECK: attributes #[[ATTR0]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,8" "target-cpu"="gfx90a" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR1]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,8" "target-cpu"="gfx90a" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR2]] = { "amdgpu-waves-per-eu"="4,8" "target-cpu"="gfx90a" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR3:[0-9]+]] = { convergent nocallback nofree nosync nounwind willreturn memory(none) "amdgpu-waves-per-eu"="4,8" "target-cpu"="gfx90a" }
+; CHECK: attributes #[[ATTR4:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) "amdgpu-waves-per-eu"="4,8" "target-cpu"="gfx90a" }
+; CHECK: attributes #[[ATTR5:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: readwrite) "amdgpu-waves-per-eu"="4,8" "target-cpu"="gfx90a" }
+; CHECK: attributes #[[ATTR6]] = { "amdgpu-no-agpr" }
 ;.
@@ -117,14 +117,14 @@ define void @call_no_dispatch_id() {
   ret void
 }
 ;.
-; CHECK: attributes #[[ATTR0]] = { "amdgpu-no-workitem-id-x" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR1]] = { "amdgpu-no-workitem-id-y" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR2]] = { "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR3]] = { "amdgpu-no-workgroup-id-x" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR4]] = { "amdgpu-no-workgroup-id-y" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR5]] = { "amdgpu-no-workgroup-id-z" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR6]] = { "amdgpu-no-dispatch-ptr" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR7]] = { "amdgpu-no-queue-ptr" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR8]] = { "amdgpu-no-implicitarg-ptr" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR9]] = { "amdgpu-no-dispatch-id" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR0]] = { "amdgpu-no-workitem-id-x" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR1]] = { "amdgpu-no-workitem-id-y" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR2]] = { "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR3]] = { "amdgpu-no-workgroup-id-x" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR4]] = { "amdgpu-no-workgroup-id-y" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR5]] = { "amdgpu-no-workgroup-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR6]] = { "amdgpu-no-dispatch-ptr" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR7]] = { "amdgpu-no-queue-ptr" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR8]] = { "amdgpu-no-implicitarg-ptr" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR9]] = { "amdgpu-no-dispatch-id" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
 ;.