Address review comments: add exact-match mode to NVVMRequiresSM and annotate more NVVM ops

Wolfram70 · Wolfram70 · commit 0a33861c8179 · 2025-04-15T14:12:16.000+05:30
diff --git a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
@@ -214,15 +214,15 @@ def NVVM_ClusterDimZOp : NVVM_SpecialRangeableRegisterOp<"read.ptx.sreg.ncluster
 //===----------------------------------------------------------------------===//
 // CTA index and range within Cluster
 def NVVM_BlockInClusterIdXOp : NVVM_SpecialRangeableRegisterOp<"read.ptx.sreg.cluster.ctaid.x", [NVVMRequiresSM<90>]>;
-def NVVM_BlockInClusterIdYOp : NVVM_SpecialRangeableRegisterOp<"read.ptx.sreg.cluster.ctaid.y">;
-def NVVM_BlockInClusterIdZOp : NVVM_SpecialRangeableRegisterOp<"read.ptx.sreg.cluster.ctaid.z">;
-def NVVM_ClusterDimBlocksXOp : NVVM_SpecialRangeableRegisterOp<"read.ptx.sreg.cluster.nctaid.x">;
-def NVVM_ClusterDimBlocksYOp : NVVM_SpecialRangeableRegisterOp<"read.ptx.sreg.cluster.nctaid.y">;
+def NVVM_BlockInClusterIdYOp : NVVM_SpecialRangeableRegisterOp<"read.ptx.sreg.cluster.ctaid.y", [NVVMRequiresSM<90>]>;
+def NVVM_BlockInClusterIdZOp : NVVM_SpecialRangeableRegisterOp<"read.ptx.sreg.cluster.ctaid.z", [NVVMRequiresSM<90>]>;
+def NVVM_ClusterDimBlocksXOp : NVVM_SpecialRangeableRegisterOp<"read.ptx.sreg.cluster.nctaid.x", [NVVMRequiresSM<90>]>;
+def NVVM_ClusterDimBlocksYOp : NVVM_SpecialRangeableRegisterOp<"read.ptx.sreg.cluster.nctaid.y", [NVVMRequiresSM<90>]>;
 def NVVM_ClusterDimBlocksZOp : NVVM_SpecialRangeableRegisterOp<"read.ptx.sreg.cluster.nctaid.z">;
 
 //===----------------------------------------------------------------------===//
 // CTA index and across Cluster dimensions
-def NVVM_ClusterId : NVVM_SpecialRangeableRegisterOp<"read.ptx.sreg.cluster.ctarank">;
+def NVVM_ClusterId : NVVM_SpecialRangeableRegisterOp<"read.ptx.sreg.cluster.ctarank", [NVVMRequiresSM<90>]>;
 def NVVM_ClusterDim : NVVM_SpecialRangeableRegisterOp<"read.ptx.sreg.cluster.nctarank">;
 
 //===----------------------------------------------------------------------===//
@@ -323,7 +323,7 @@ def NVVM_MBarrierInitOp : NVVM_PTXBuilder_Op<"mbarrier.init">,
 }
 
 /// mbarrier.init instruction with shared pointer type
-def NVVM_MBarrierInitSharedOp : NVVM_PTXBuilder_Op<"mbarrier.init.shared">,
+def NVVM_MBarrierInitSharedOp : NVVM_PTXBuilder_Op<"mbarrier.init.shared", [NVVMRequiresSM<80>, DeclareOpInterfaceMethods<BasicPtxBuilderOpInterface>]>,
   Arguments<(ins LLVM_PointerShared:$addr, I32:$count, PtxPredicate:$predicate)> {
   string llvmBuilder = [{
       createIntrinsicCall(builder, llvm::Intrinsic::nvvm_mbarrier_init_shared, {$addr, $count});
@@ -545,7 +545,7 @@ def NVVM_ClusterArriveOp : NVVM_Op<"cluster.arrive"> {
   let assemblyFormat = "attr-dict";
 }
 
-def NVVM_ClusterArriveRelaxedOp : NVVM_Op<"cluster.arrive.relaxed"> {
+def NVVM_ClusterArriveRelaxedOp : NVVM_Op<"cluster.arrive.relaxed", [NVVMRequiresSM<90>]> {
   let arguments = (ins OptionalAttr<UnitAttr>:$aligned);
 
   let summary = "Cluster Barrier Relaxed Arrive Op";
@@ -571,7 +571,7 @@ def NVVM_ClusterArriveRelaxedOp : NVVM_Op<"cluster.arrive.relaxed"> {
   let assemblyFormat = "attr-dict";
 }
 
-def NVVM_ClusterWaitOp : NVVM_Op<"cluster.wait"> {
+def NVVM_ClusterWaitOp : NVVM_Op<"cluster.wait", [NVVMRequiresSM<90>]> {
   let arguments = (ins OptionalAttr<UnitAttr>:$aligned);
 
   let summary = "Cluster Barrier Wait Op";
@@ -776,7 +776,7 @@ def ShflKind : I32EnumAttr<"ShflKind", "NVVM shuffle kind",
 def ShflKindAttr : EnumAttr<NVVM_Dialect, ShflKind, "shfl_kind">;
 
 def NVVM_ShflOp :
-  NVVM_Op<"shfl.sync">,
+  NVVM_Op<"shfl.sync", [NVVMRequiresSM<30>]>,
   Results<(outs LLVM_Type:$res)>,
   Arguments<(ins I32:$thread_mask,
                  LLVM_Type:$val,
@@ -1880,7 +1880,7 @@ def NVVM_CpAsyncBulkCommitGroupOp : NVVM_Op<"cp.async.bulk.commit.group">,
   }];
 }
 
-def NVVM_CpAsyncBulkWaitGroupOp : NVVM_Op<"cp.async.bulk.wait_group">,
+def NVVM_CpAsyncBulkWaitGroupOp : NVVM_Op<"cp.async.bulk.wait_group", [NVVMRequiresSM<90>]>,
   Arguments<(ins 
     ConfinedAttr<I32Attr, [IntMinValue<0>]>:$group, 
     OptionalAttr<UnitAttr>:$read)> {
@@ -1910,7 +1910,7 @@ def NVVM_CpAsyncBulkWaitGroupOp : NVVM_Op<"cp.async.bulk.wait_group">,
 def NVVM_CpAsyncBulkTensorGlobalToSharedClusterOp : 
   NVVM_Op<"cp.async.bulk.tensor.shared.cluster.global", 
   [DeclareOpInterfaceMethods<BasicPtxBuilderOpInterface>, 
-  AttrSizedOperandSegments]>,
+  AttrSizedOperandSegments, NVVMRequiresSM<90>]>,
   Arguments<(ins  LLVM_PointerShared:$dstMem,
                   LLVM_AnyPointer:$tmaDescriptor,
                   Variadic<I32>:$coordinates,
@@ -2347,8 +2347,7 @@ def NVVM_CpAsyncBulkSharedCTAToGlobalOp :
 // NVVM Wgmma Ops
 //===----------------------------------------------------------------------===//
 
-def NVVM_WgmmaFenceAlignedOp : NVVM_Op<"wgmma.fence.aligned",
-                              [NVVMRequiresSM<90, /*ArchAccelerated*/"true">]> {
+def NVVM_WgmmaFenceAlignedOp : NVVM_Op<"wgmma.fence.aligned", [NVVMRequiresSM90a]> {
   let arguments = (ins);
   let description = [{
     Enforce an ordering of register accesses between warpgroup level matrix 
@@ -2362,8 +2361,7 @@ def NVVM_WgmmaFenceAlignedOp : NVVM_Op<"wgmma.fence.aligned",
   }];
 }
 
-def NVVM_WgmmaGroupSyncAlignedOp : NVVM_Op<"wgmma.commit.group.sync.aligned",  
-                              [NVVMRequiresSM<90, /*ArchAccelerated*/"true">]> {
+def NVVM_WgmmaGroupSyncAlignedOp : NVVM_Op<"wgmma.commit.group.sync.aligned", [NVVMRequiresSM90a]> {
   let assemblyFormat = "attr-dict";
   let description = [{
     Commits all prior uncommitted warpgroup level matrix multiplication operations.
@@ -2375,7 +2373,7 @@ def NVVM_WgmmaGroupSyncAlignedOp : NVVM_Op<"wgmma.commit.group.sync.aligned",
   }];
 }
 
-def NVVM_WgmmaWaitGroupSyncOp : NVVM_Op<"wgmma.wait.group.sync.aligned">{
+def NVVM_WgmmaWaitGroupSyncOp : NVVM_Op<"wgmma.wait.group.sync.aligned", [NVVMRequiresSM90a]> {
   let arguments = (ins I64Attr:$group);
   let assemblyFormat = "attr-dict $group";
   let description = [{
@@ -2571,7 +2569,7 @@ def NVVM_GriddepcontrolLaunchDependentsOp
 
 def NVVM_MapaOp: NVVM_Op<"mapa",
     [TypesMatchWith<"`res` and `a` should have the same type",
-                    "a", "res", "$_self">]> {
+                    "a", "res", "$_self">, NVVMRequiresSM<90>]> {
   let results = (outs AnyTypeOf<[LLVM_PointerGeneric, LLVM_PointerShared]>:$res);
   let arguments = (ins AnyTypeOf<[LLVM_PointerGeneric, LLVM_PointerShared]>:$a, I32:$b);
 
@@ -2662,7 +2660,7 @@ def Tcgen05WaitKindAttr :
   let assemblyFormat = "`<` $value `>`";
 }
 
-def NVVM_Tcgen05AllocOp : NVVM_Op<"tcgen05.alloc"> {
+def NVVM_Tcgen05AllocOp : NVVM_Op<"tcgen05.alloc", [NVVMRequiresSM<100, "true", "false">]> {
   let summary = "Tcgen05 alloc operation";
   let description = [{
     The `tcgen05.alloc` Op allocates tensor core memory for
@@ -2692,7 +2690,7 @@ def NVVM_Tcgen05AllocOp : NVVM_Op<"tcgen05.alloc"> {
   }];
 }
 
-def NVVM_Tcgen05DeallocOp : NVVM_Op<"tcgen05.dealloc"> {
+def NVVM_Tcgen05DeallocOp : NVVM_Op<"tcgen05.dealloc", [NVVMRequiresSM<100, "true", "false">]> {
   let summary = "Tcgen05 dealloc operation";
   let description = [{
     The `tcgen05.dealloc` Op de-allocates the tensor core memory
@@ -2720,7 +2718,7 @@ def NVVM_Tcgen05DeallocOp : NVVM_Op<"tcgen05.dealloc"> {
   }];
 }
 
-def NVVM_Tcgen05RelinquishAllocPermitOp : NVVM_Op<"tcgen05.relinquish_alloc_permit"> {
+def NVVM_Tcgen05RelinquishAllocPermitOp : NVVM_Op<"tcgen05.relinquish_alloc_permit", [NVVMRequiresSM<100, "true", "false">]> {
   let summary = "Tcgen05 Op to relinquish the right to allocate";
   let description = [{
     The `tcgen05.relinquish_alloc_permit` Op specifies that the CTA
@@ -2743,7 +2741,7 @@ def NVVM_Tcgen05RelinquishAllocPermitOp : NVVM_Op<"tcgen05.relinquish_alloc_perm
   }];
 }
 
-def NVVM_Tcgen05FenceOp : NVVM_Op<"tcgen05.fence"> {
+def NVVM_Tcgen05FenceOp : NVVM_Op<"tcgen05.fence", [NVVMRequiresSM<100, "true", "false">]> {
   let summary = "Tcgen05 fence operations";
   let description = [{
     The `tcgen05.fence<before>` orders all prior async tcgen05 operations
@@ -2765,7 +2763,7 @@ def NVVM_Tcgen05FenceOp : NVVM_Op<"tcgen05.fence"> {
   }];
 }
 
-def NVVM_Tcgen05WaitOp : NVVM_Op<"tcgen05.wait"> {
+def NVVM_Tcgen05WaitOp : NVVM_Op<"tcgen05.wait", [NVVMRequiresSM<100, "true", "false">]> {
   let summary = "Tcgen05 wait operations";
   let description = [{
     The `tcgen05.wait<load>` causes the executing thread to block until
@@ -2787,7 +2785,7 @@ def NVVM_Tcgen05WaitOp : NVVM_Op<"tcgen05.wait"> {
   }];
 }
 
-def NVVM_Tcgen05CommitOp : NVVM_Op<"tcgen05.commit"> {
+def NVVM_Tcgen05CommitOp : NVVM_Op<"tcgen05.commit", [NVVMRequiresSM<100, "true", "false">]> {
   let summary = "Tcgen05 commit operations";
   let description = [{
     The `tcgen05.commit` makes the mbarrier object, specified by
@@ -2825,7 +2823,7 @@ def NVVM_Tcgen05CommitOp : NVVM_Op<"tcgen05.commit"> {
   }];
 }
 
-def NVVM_Tcgen05ShiftOp : NVVM_Op<"tcgen05.shift"> {
+def NVVM_Tcgen05ShiftOp : NVVM_Op<"tcgen05.shift", [NVVMRequiresSM<100, "true", "false">]> {
   let summary = "Tcgen05 shift operation";
   let description = [{
     The `tcgen05.shift` is an asynchronous instruction which initiates
@@ -2891,7 +2889,7 @@ def Tcgen05CpSrcFormatAttr : EnumAttr<NVVM_Dialect, Tcgen05CpSrcFormat, "tcgen05
   let assemblyFormat = "`<` $value `>`";
 }
 
-def NVVM_Tcgen05CpOp : NVVM_Op<"tcgen05.cp"> {
+def NVVM_Tcgen05CpOp : NVVM_Op<"tcgen05.cp", [NVVMRequiresSM<100, "true", "false">]> {
   let summary = "Tcgen05 copy operation";
   let description = [{
     Instruction tcgen05.cp initiates an asynchronous copy operation from
@@ -2961,7 +2959,7 @@ def Tcgen05LdStShapeAttr: EnumAttr<NVVM_Dialect, Tcgen05LdStShape, "tcgen05_ldst
 // NVVM tcgen05.ld Op
 //===----------------------------------------------------------------------===//
 
-def NVVM_Tcgen05LdOp : NVVM_Op<"tcgen05.ld"> {
+def NVVM_Tcgen05LdOp : NVVM_Op<"tcgen05.ld", [NVVMRequiresSM<100, "true", "false">]> {
   let summary = "tensor memory load instructions";
   let arguments = (ins
     // Attributes
@@ -3051,7 +3049,7 @@ def NVVM_Tcgen05LdOp : NVVM_Op<"tcgen05.ld"> {
 // NVVM tcgen05.st Op
 //===----------------------------------------------------------------------===//
 
-def NVVM_Tcgen05StOp : NVVM_Op<"tcgen05.st"> {
+def NVVM_Tcgen05StOp : NVVM_Op<"tcgen05.st", [NVVMRequiresSM<100, "true", "false">]> {
   let summary = "tensor memory store instructions";
   let arguments = (ins
     // Attributes
diff --git a/mlir/include/mlir/Dialect/LLVMIR/NVVMTraits.h b/mlir/include/mlir/Dialect/LLVMIR/NVVMTraits.h
@@ -21,14 +21,21 @@ namespace mlir {
 
 namespace NVVM {
 
+// Structure to store and check compatibility of SM versions.
 struct NVVMCheckSMVersion {
   int archVersion;
   bool archAccelerated;
+  bool exactMatch;
 
-  NVVMCheckSMVersion() {}
-  NVVMCheckSMVersion(StringRef smVersion) { parse(smVersion); }
-  NVVMCheckSMVersion(int archVersion, bool archAccelerated)
-      : archVersion(archVersion), archAccelerated(archAccelerated) {}
+  NVVMCheckSMVersion()
+      : archVersion(0), archAccelerated(false), exactMatch(false) {}
+  NVVMCheckSMVersion(StringRef smVersion, bool exactMatch = false)
+      : exactMatch(exactMatch) {
+    parse(smVersion);
+  }
+  NVVMCheckSMVersion(int archVersion, bool archAccelerated, bool exactMatch)
+      : archVersion(archVersion), archAccelerated(archAccelerated),
+        exactMatch(exactMatch) {}
 
   // Parses the SM version string and sets the archVersion (integer) and
   // the archAccelerated flag.
@@ -40,11 +47,12 @@ struct NVVMCheckSMVersion {
   }
 
   bool isCompatible(const NVVMCheckSMVersion &targetSM) const {
-    // for arch-conditional SMs, they should exactly match to be valid
-    if (archAccelerated || targetSM.archAccelerated)
+    if (exactMatch)
       return (*this) == targetSM;
-
-    return archVersion <= targetSM.archVersion;
+    // Otherwise any target at least as new is accepted; an arch-accelerated
+    // requirement additionally needs an arch-accelerated target.
+    return archVersion <= targetSM.archVersion &&
+           (!archAccelerated || targetSM.archAccelerated);
   }
 
   bool operator==(const NVVMCheckSMVersion &other) const {
@@ -61,16 +69,18 @@ namespace mlir {
 
 namespace OpTrait {
 
-template <int Version, bool ArchAccelerated = false>
+template <int MinVersion, bool ArchAccelerated = false, bool ExactMatch = false>
 class NVVMRequiresSM {
 public:
   template <typename ConcreteOp>
-  class Impl : public OpTrait::TraitBase<
-                   ConcreteOp, NVVMRequiresSM<Version, ArchAccelerated>::Impl>,
-               public mlir::NVVM::RequiresSMInterface::Trait<ConcreteOp> {
+  class Impl
+      : public OpTrait::TraitBase<
+            ConcreteOp,
+            NVVMRequiresSM<MinVersion, ArchAccelerated, ExactMatch>::Impl>,
+        public mlir::NVVM::RequiresSMInterface::Trait<ConcreteOp> {
   public:
     const NVVM::NVVMCheckSMVersion getRequiredMinSMVersion() const {
-      return NVVM::NVVMCheckSMVersion(Version, ArchAccelerated);
+      return NVVM::NVVMCheckSMVersion(MinVersion, ArchAccelerated, ExactMatch);
     }
   };
 };
diff --git a/mlir/include/mlir/Dialect/LLVMIR/NVVMTraits.td b/mlir/include/mlir/Dialect/LLVMIR/NVVMTraits.td
@@ -27,8 +27,12 @@ def RequiresSMInterface: OpInterface<"RequiresSMInterface"> {
   ];
 }
 
-class NVVMRequiresSM<int Version, string ArchAccelerated = "false"> :
+class NVVMRequiresSM<int minVersion, string isArchAccelerated = "false",
+                    string exactMatch = "false"> :
   ParamNativeOpTrait<"NVVMRequiresSM",
-                    !cast<string>(Version) # "," # ArchAccelerated>;
+                    !cast<string>(minVersion) # "," # isArchAccelerated # ","
+                      # exactMatch>;
+
+def NVVMRequiresSM90a : NVVMRequiresSM<90, "true", "true">;
 
 #endif //NVVM_TRAITS
diff --git a/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp b/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp
@@ -1567,8 +1567,12 @@ LogicalResult NVVMTargetAttr::verifyTarget(Operation *gpuModule) {
   if (!gpuModuleOp)
     return emitError(gpuModule->getLoc(),
                      "NVVM target attribute must be attached to a GPU module");
-
+  
   NVVMCheckSMVersion targetSMVersion(getChip());
+  if (targetSMVersion.archVersion < 20)
+    return emitError(gpuModule->getLoc(),
+                     "Minimum NVVM target SM version is sm_20");
+
   gpuModuleOp->walk([&](Operation *op) {
     if (auto reqOp = llvm::dyn_cast<NVVM::RequiresSMInterface>(op)) {
       NVVMCheckSMVersion requirement = reqOp.getRequiredMinSMVersion();
diff --git a/mlir/test/Dialect/LLVMIR/nvvm-check-targetSM.mlir b/mlir/test/Dialect/LLVMIR/nvvm-check-targetSM.mlir
@@ -13,10 +13,19 @@ gpu.module @check_valid_SM_greater_2 [#nvvm.target<chip = "sm_90">] {
   test.nvvm_requires_sm_80
 }
 
-gpu.module @check_valid_SM_arch_acc [#nvvm.target<chip = "sm_90a">] {
+gpu.module @check_valid_SM_arch_acc_exact_1 [#nvvm.target<chip = "sm_90a">] {
   test.nvvm_requires_sm_90a
 }
 
+gpu.module @check_valid_SM_arch_acc_atleast_1 [#nvvm.target<chip = "sm_90a">] {
+  test.nvvm_requires_sm_atleast_90_aa
+}
+
+gpu.module @check_valid_SM_arch_acc_atleast_2 [#nvvm.target<chip = "sm_100a">] {
+  test.nvvm_requires_sm_atleast_90_aa
+}
+
+
 gpu.module @disable_verify_target1 [#nvvm.target<chip = "sm_90", verifyTarget = false>] {
   test.nvvm_requires_sm_90a
 }
@@ -25,7 +34,9 @@ gpu.module @disable_verify_target2 [#nvvm.target<chip = "sm_70", verifyTarget =
   test.nvvm_requires_sm_80
 }
 
-
+gpu.module @disable_verify_target3 [#nvvm.target<chip = "sm_90", verifyTarget = false>] {
+  test.nvvm_requires_sm_atleast_90_aa
+}
 
 // -----
 
@@ -43,14 +54,28 @@ gpu.module @check_invalid_SM_lesser_2 [#nvvm.target<chip = "sm_75">] {
 
 // -----
 
-gpu.module @check_invalid_SM_arch_acc_1 [#nvvm.target<chip = "sm_90">] {
+gpu.module @check_invalid_SM_arch_acc_exact_1 [#nvvm.target<chip = "sm_90">] {
   // expected-error @below {{is not supported on sm_90}}
   test.nvvm_requires_sm_90a
 }
 
 // -----
 
-gpu.module @check_invalid_SM_arch_acc_2 [#nvvm.target<chip = "sm_80">] {
+gpu.module @check_invalid_SM_arch_acc_exact_2 [#nvvm.target<chip = "sm_80">] {
   // expected-error @below {{is not supported on sm_80}}
   test.nvvm_requires_sm_90a
 }
+
+// -----
+
+gpu.module @check_invalid_SM_arch_acc_atleast_1 [#nvvm.target<chip = "sm_80">] {
+  // expected-error @below {{is not supported on sm_80}}
+  test.nvvm_requires_sm_atleast_90_aa
+}
+
+// -----
+
+gpu.module @check_invalid_SM_arch_acc_atleast_2 [#nvvm.target<chip = "sm_90">] {
+  // expected-error @below {{is not supported on sm_90}}
+  test.nvvm_requires_sm_atleast_90_aa
+}
diff --git a/mlir/test/lib/Dialect/Test/TestOps.td b/mlir/test/lib/Dialect/Test/TestOps.td
@@ -2739,8 +2739,14 @@ def TestNVVMRequiresSMOp : TEST_Op<"nvvm_requires_sm_80",
   let assemblyFormat = "attr-dict";
 }
 
-def TestNVVMRequiresSMArchCondOp : TEST_Op<"nvvm_requires_sm_90a",
-                                          [NVVMRequiresSM<90, "true">]> {
+def TestNVVMRequiresAtleastSMArchCondOp : 
+    TEST_Op<"nvvm_requires_sm_atleast_90_aa", [NVVMRequiresSM<90, "true">]> {
+  let arguments = (ins );
+  let assemblyFormat = "attr-dict";
+}
+
+def TestNVVMRequiresExactSMArchCondOp : TEST_Op<"nvvm_requires_sm_90a",
+                                          [NVVMRequiresSM90a]> {
   let arguments = (ins );
   let assemblyFormat = "attr-dict";
 }