[LLVM][NVPTX] Add NVPTX codegen support for fence.proxy.tensormap (llvm#100748)

schwarzschild-radius · gonzalobg · web-flow · commit bfef7ccf03dc · 2024-08-07T20:25:45.000+05:30
This commit adds LLVM Intrinsics and NVPTX codegen support for
`fence.proxy.tensormap` with lit tests under fence-proxy-tensormap.ll.
Also, added Intrinsics documentation in NVPTXUsage.rst

---------

Co-authored-by: gonzalobg &lt;65027571+gonzalobg@users.noreply.github.com&gt;
diff --git a/llvm/docs/NVPTXUsage.rst b/llvm/docs/NVPTXUsage.rst
@@ -251,6 +251,41 @@ Overview:
 The '``@llvm.nvvm.barrier0()``' intrinsic emits a PTX ``bar.sync 0``
 instruction, equivalent to the ``__syncthreads()`` call in CUDA.
 
+Membar/Fences
+-------------
+
+
+'``llvm.nvvm.fence.proxy.tensormap_generic.*``'
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+
+.. code-block:: llvm
+
+  declare void @llvm.nvvm.fence.proxy.tensormap_generic.release.cta()
+  declare void @llvm.nvvm.fence.proxy.tensormap_generic.release.cluster()
+  declare void @llvm.nvvm.fence.proxy.tensormap_generic.release.gpu()
+  declare void @llvm.nvvm.fence.proxy.tensormap_generic.release.sys()
+
+  declare void @llvm.nvvm.fence.proxy.tensormap_generic.acquire.cta(ptr %addr, i32 %size)
+  declare void @llvm.nvvm.fence.proxy.tensormap_generic.acquire.cluster(ptr %addr, i32 %size)
+  declare void @llvm.nvvm.fence.proxy.tensormap_generic.acquire.gpu(ptr %addr, i32 %size)
+  declare void @llvm.nvvm.fence.proxy.tensormap_generic.acquire.sys(ptr %addr, i32 %size)
+
+Overview:
+"""""""""
+
+The ``@llvm.nvvm.fence.proxy.tensormap_generic.*`` is a uni-directional fence used to establish ordering between a prior memory access performed via the generic `proxy<https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#proxies>_` and a subsequent memory access performed via the tensormap proxy. ``nvvm.fence.proxy.tensormap_generic.release`` can form a release sequence that synchronizes with an acquire sequence that contains the ``nvvm.fence.proxy.tensormap_generic.acquire`` proxy fence. The following table describes the mapping between LLVM Intrinsic and the PTX instruction:
+
+  ====================================================== =========================================================
+  NVVM Intrinsic                                         PTX Instruction
+  ====================================================== =========================================================
+  ``@llvm.nvvm.fence.proxy.tensormap_generic.release.*`` ``fence.proxy.tensormap::generic.release.*``
+  ``@llvm.nvvm.fence.proxy.tensormap_generic.acquire.*`` ``fence.proxy.tensormap::generic.acquire.* [addr], size``
+  ====================================================== =========================================================
+
+The address operand ``addr`` and the operand ``size`` together specify the memory range ``[addr, addr+size)`` on which the ordering guarantees on the memory accesses across the proxies is to be provided. The only supported value for the ``size`` operand is ``128`` and must be an immediate. Generic Addressing is used unconditionally, and the address specified by the operand addr must fall within the ``.global`` state space. Otherwise, the behavior is undefined. For more information, see `PTX ISA <https://docs.nvidia.com/cuda/parallel-thread-execution/#parallel-synchronization-and-communication-instructions-membar>`_.
 
 Other Intrinsics
 ----------------
diff --git a/llvm/include/llvm/IR/IntrinsicsNVVM.td b/llvm/include/llvm/IR/IntrinsicsNVVM.td
@@ -1418,6 +1418,20 @@ let TargetPrefix = "nvvm" in {
   def int_nvvm_fence_sc_cluster:
       Intrinsic<[], [], [IntrNoCallback]>;
 
+// Proxy fence (uni-directional)
+foreach scope = ["cta", "cluster", "gpu", "sys"] in {
+
+  def int_nvvm_fence_proxy_tensormap_generic_release_ # scope:
+        Intrinsic<[], [], [IntrNoCallback],
+        "llvm.nvvm.fence.proxy.tensormap_generic.release." # scope>;
+
+  def int_nvvm_fence_proxy_tensormap_generic_acquire_ # scope:
+        Intrinsic<[], [llvm_ptr_ty, llvm_i32_ty],
+                  [IntrNoCallback, IntrArgMemOnly, ImmArg<ArgIndex<1>>],
+                  "llvm.nvvm.fence.proxy.tensormap_generic.acquire." # scope>;
+
+}
+
 // Async Copy
 def int_nvvm_cp_async_mbarrier_arrive :
     ClangBuiltin<"__nvvm_cp_async_mbarrier_arrive">,
diff --git a/llvm/lib/IR/Verifier.cpp b/llvm/lib/IR/Verifier.cpp
@@ -6329,6 +6329,14 @@ void Verifier::visitIntrinsicCall(Intrinsic::ID ID, CallBase &Call) {
           "llvm.threadlocal.address operand isThreadLocal() must be true");
     break;
   }
+  case Intrinsic::nvvm_fence_proxy_tensormap_generic_acquire_cta:
+  case Intrinsic::nvvm_fence_proxy_tensormap_generic_acquire_cluster:
+  case Intrinsic::nvvm_fence_proxy_tensormap_generic_acquire_gpu:
+  case Intrinsic::nvvm_fence_proxy_tensormap_generic_acquire_sys: {
+    unsigned size = cast<ConstantInt>(Call.getArgOperand(1))->getZExtValue();
+    Check(size == 128, " The only supported value for size operand is 128");
+    break;
+  }
   };
 
   // Verify that there aren't any unmediated control transfers between funclets.
diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
@@ -335,6 +335,48 @@ def INT_FENCE_SC_CLUSTER:
        MEMBAR<"fence.sc.cluster;", int_nvvm_fence_sc_cluster>,
        Requires<[hasPTX<78>, hasSM<90>]>;
 
+// Proxy fence (uni-directional)
+// fence.proxy.tensormap.release variants
+
+class FENCE_PROXY_TENSORMAP_GENERIC_RELEASE<string Scope, Intrinsic Intr> :
+        NVPTXInst<(outs), (ins),
+                  "fence.proxy.tensormap::generic.release." # Scope # ";", [(Intr)]>,
+        Requires<[hasPTX<83>, hasSM<90>]>;
+
+def INT_FENCE_PROXY_TENSORMAP_GENERIC_RELEASE_CTA:
+      FENCE_PROXY_TENSORMAP_GENERIC_RELEASE<"cta",
+        int_nvvm_fence_proxy_tensormap_generic_release_cta>;
+def INT_FENCE_PROXY_TENSORMAP_GENERIC_RELEASE_CLUSTER:
+      FENCE_PROXY_TENSORMAP_GENERIC_RELEASE<"cluster",
+        int_nvvm_fence_proxy_tensormap_generic_release_cluster>;
+def INT_FENCE_PROXY_TENSORMAP_GENERIC_RELEASE_GPU:
+      FENCE_PROXY_TENSORMAP_GENERIC_RELEASE<"gpu",
+        int_nvvm_fence_proxy_tensormap_generic_release_gpu>;
+def INT_FENCE_PROXY_TENSORMAP_GENERIC_RELEASE_SYS:
+      FENCE_PROXY_TENSORMAP_GENERIC_RELEASE<"sys",
+        int_nvvm_fence_proxy_tensormap_generic_release_sys>;
+
+// fence.proxy.tensormap.acquire variants
+
+class FENCE_PROXY_TENSORMAP_GENERIC_ACQUIRE<string Scope, Intrinsic Intr> :
+        NVPTXInst<(outs), (ins Int64Regs:$addr),
+                  "fence.proxy.tensormap::generic.acquire." # Scope # " [$addr], 128;",
+                  [(Intr Int64Regs:$addr, (i32 128))]>,
+        Requires<[hasPTX<83>, hasSM<90>]>;
+
+def INT_FENCE_PROXY_TENSORMAP_GENERIC_ACQUIRE_CTA :
+      FENCE_PROXY_TENSORMAP_GENERIC_ACQUIRE<"cta",
+        int_nvvm_fence_proxy_tensormap_generic_acquire_cta>;
+def INT_FENCE_PROXY_TENSORMAP_GENERIC_ACQUIRE_CLUSTER :
+      FENCE_PROXY_TENSORMAP_GENERIC_ACQUIRE<"cluster",
+        int_nvvm_fence_proxy_tensormap_generic_acquire_cluster>;
+def INT_FENCE_PROXY_TENSORMAP_GENERIC_ACQUIRE_GPU :
+      FENCE_PROXY_TENSORMAP_GENERIC_ACQUIRE<"gpu",
+        int_nvvm_fence_proxy_tensormap_generic_acquire_gpu>;
+def INT_FENCE_PROXY_TENSORMAP_GENERIC_ACQUIRE_SYS :
+      FENCE_PROXY_TENSORMAP_GENERIC_ACQUIRE<"sys",
+        int_nvvm_fence_proxy_tensormap_generic_acquire_sys>;
+
 //-----------------------------------
 // Async Copy Functions
 //-----------------------------------
diff --git a/llvm/test/CodeGen/NVPTX/fence-proxy-tensormap.ll b/llvm/test/CodeGen/NVPTX/fence-proxy-tensormap.ll
@@ -0,0 +1,36 @@
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_90 -mattr=+ptx83 | FileCheck --check-prefixes=CHECK %s
+; RUN: %if ptxas-12.3 %{ llc < %s -march=nvptx64 -mcpu=sm_90 -mattr=+ptx83 | %ptxas-verify -arch=sm_90 %}
+
+; CHECK-LABEL: test_fence_proxy_tensormap_generic_release
+define void @test_fence_proxy_tensormap_generic_release() {
+  ; CHECK: fence.proxy.tensormap::generic.release.cta;
+  call void @llvm.nvvm.fence.proxy.tensormap_generic.release.cta();
+
+  ; CHECK: fence.proxy.tensormap::generic.release.cluster;
+  call void @llvm.nvvm.fence.proxy.tensormap_generic.release.cluster();
+
+  ; CHECK: fence.proxy.tensormap::generic.release.gpu;
+  call void @llvm.nvvm.fence.proxy.tensormap_generic.release.gpu();
+
+  ; CHECK: fence.proxy.tensormap::generic.release.sys;
+  call void @llvm.nvvm.fence.proxy.tensormap_generic.release.sys();
+
+  ret void
+}
+
+; CHECK-LABEL: test_fence_proxy_tensormap_generic_acquire
+define void @test_fence_proxy_tensormap_generic_acquire(ptr addrspace(0) %addr) {
+  ; CHECK: fence.proxy.tensormap::generic.acquire.cta [%rd{{[0-9]+}}], 128;
+  call void @llvm.nvvm.fence.proxy.tensormap_generic.acquire.cta(ptr addrspace(0) %addr, i32 128);
+
+  ; CHECK: fence.proxy.tensormap::generic.acquire.cluster [%rd{{[0-9]+}}], 128;
+  call void @llvm.nvvm.fence.proxy.tensormap_generic.acquire.cluster(ptr addrspace(0) %addr, i32 128);
+
+  ; CHECK: fence.proxy.tensormap::generic.acquire.gpu [%rd{{[0-9]+}}], 128;
+  call void @llvm.nvvm.fence.proxy.tensormap_generic.acquire.gpu(ptr addrspace(0) %addr, i32 128);
+
+  ; CHECK: fence.proxy.tensormap::generic.acquire.sys [%rd{{[0-9]+}}], 128;
+  call void @llvm.nvvm.fence.proxy.tensormap_generic.acquire.sys(ptr addrspace(0) %addr, i32 128);
+
+  ret void
+}