Commit 4d33082

[mlir][nvgpu] NVGPU Tutorials (#87065)
I have a tutorial at EuroLLVM 2024 ([Zero to Hero: Programming Nvidia Hopper Tensor Core with MLIR's NVGPU Dialect](https://llvm.swoogo.com/2024eurollvm/session/2086997/zero-to-hero-programming-nvidia-hopper-tensor-core-with-mlir's-nvgpu-dialect)). For that, I implemented tutorial codes in Python. The focus is the nvgpu dialect and how to use its advanced features. I thought it might be useful to upstream this.

The tutorial codes are as follows:

- **Ch0.py:** Hello World
- **Ch1.py:** 2D Saxpy
- **Ch2.py:** 2D Saxpy using TMA
- **Ch3.py:** GEMM 128x128x64 using Tensor Core and TMA
- **Ch4.py:** Multistage performant GEMM using Tensor Core and TMA
- **Ch5.py:** Warp Specialized GEMM using Tensor Core and TMA

I might implement one more chapter:

- **Ch6.py:** Warp Specialized Persistent ping-pong GEMM

This PR also introduces the nvdsl class, making IR building in the tutorial easier.
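As a taste of the style these helpers enable, the sketch below is condensed from Ch0.py in this commit; the decorator behavior (building `func.func` / `gpu.launch` and JIT-compiling on call) is as described in that file's comments.

```python
from mlir.dialects import gpu
from tools.nvdsl import *  # NVDSL helpers introduced by this PR


@NVDSL.mlir_func  # builds an MLIR func.func; `alpha` becomes an `index` argument
def main(alpha):
    @NVDSL.mlir_gpu_launch(grid=(1, 1, 1), block=(4, 1, 1))  # builds a gpu.launch
    def kernel():
        tidx = gpu.thread_id(gpu.Dimension.x)
        # `+` on NVDSL values emits arith.addi
        gpu.printf("GPU thread %llu has %llu\n", [tidx, alpha + tidx])

    kernel()  # emit the launch into the function body


main(100)  # JIT-compiles and runs the generated IR
```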
1 parent 506c84a commit 4d33082

File tree

10 files changed: +1490 -0 lines changed


mlir/test/Examples/NVGPU/Ch0.py

Lines changed: 50 additions & 0 deletions
@@ -0,0 +1,50 @@
# RUN: env SUPPORT_LIB=%mlir_cuda_runtime \
# RUN:   %PYTHON %s | FileCheck %s

# ===----------------------------------------------------------------------===//
# Chapter 0 : Hello World
# ===----------------------------------------------------------------------===//
#
# This program demonstrates Hello World:
# 1. Build MLIR function with arguments
# 2. Build MLIR GPU kernel
# 3. Print from a GPU thread
# 4. Pass arguments, JIT compile and run the MLIR function
#
# ===----------------------------------------------------------------------===//


from mlir.dialects import gpu
from tools.nvdsl import *


# 1. The decorator generates a MLIR func.func.
# Everything inside the Python function becomes the body of the func.
# The decorator also translates `alpha` to an `index` type.
@NVDSL.mlir_func
def main(alpha):
    # 2. The decorator generates a MLIR gpu.launch.
    # Everything inside the Python function becomes the body of the gpu.launch.
    # This allows for late outlining of the GPU kernel, enabling optimizations
    # like constant folding from host to device.
    @NVDSL.mlir_gpu_launch(grid=(1, 1, 1), block=(4, 1, 1))
    def kernel():
        tidx = gpu.thread_id(gpu.Dimension.x)
        # + operator generates arith.addi
        myValue = alpha + tidx
        # Print from a GPU thread
        gpu.printf("GPU thread %llu has %llu\n", [tidx, myValue])

    # 3. Call the GPU kernel
    kernel()


alpha = 100
# 4. The `mlir_func` decorator JIT compiles the IR and executes the MLIR function.
main(alpha)


# CHECK: GPU thread 0 has 100
# CHECK: GPU thread 1 has 101
# CHECK: GPU thread 2 has 102
# CHECK: GPU thread 3 has 103

mlir/test/Examples/NVGPU/Ch1.py

Lines changed: 66 additions & 0 deletions
@@ -0,0 +1,66 @@
# RUN: env SUPPORT_LIB=%mlir_cuda_runtime \
# RUN:   %PYTHON %s | FileCheck %s

# ===----------------------------------------------------------------------===//
# Chapter 1 : 2D Saxpy
# ===----------------------------------------------------------------------===//
#
# This program demonstrates 2D Saxpy:
# 1. Use the GPU dialect to allocate and copy memory between host and GPU
# 2. Compute the 2D SAXPY kernel using operator overloading
# 3. Pass numpy arrays to MLIR as memref arguments
# 4. Verify the MLIR program against a reference computation in Python
#
# ===----------------------------------------------------------------------===//


from mlir import ir
from mlir.dialects import gpu, memref
from tools.nvdsl import *
import numpy as np


@NVDSL.mlir_func
def saxpy(x, y, alpha):
    # 1. Use MLIR GPU dialect to allocate and copy memory
    token_ty = ir.Type.parse("!gpu.async.token")
    t1 = gpu.wait(token_ty, [])
    x_dev, t2 = gpu.alloc(x.type, token_ty, [t1], [], [])
    y_dev, t3 = gpu.alloc(y.type, token_ty, [t2], [], [])
    t4 = gpu.memcpy(token_ty, [t3], x_dev, x)
    t5 = gpu.memcpy(token_ty, [t4], y_dev, y)
    t6 = gpu.wait(token_ty, [t5])

    # 2. Compute 2D SAXPY kernel
    @NVDSL.mlir_gpu_launch(grid=(M, 1, 1), block=(N, 1, 1))
    def saxpy_kernel():
        bidx = gpu.block_id(gpu.Dimension.x)
        tidx = gpu.thread_id(gpu.Dimension.x)
        x_val = memref.load(x_dev, [bidx, tidx])
        y_val = memref.load(y_dev, [bidx, tidx])

        # SAXPY: y[i] += a * x[i];
        y_val += x_val * alpha

        memref.store(y_val, y_dev, [bidx, tidx])

    saxpy_kernel()

    t7 = gpu.memcpy(token_ty, [t6], y, y_dev)
    gpu.wait(token_ty, [t7])


# 3. Pass numpy arrays to MLIR
M = 256
N = 32
alpha = 2.0
x = np.random.randn(M, N).astype(np.float32)
y = np.ones((M, N), np.float32)
saxpy(x, y, alpha)

# 4. Verify MLIR with reference computation
ref = np.ones((M, N), np.float32)
ref += x * alpha
np.testing.assert_allclose(y, ref, rtol=5e-03, atol=1e-01)
print("PASS")
# CHECK-NOT: Mismatched elements

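The launch shape in Ch1.py maps one thread block per row and one thread per column, so each GPU thread owns exactly one element of the 256x32 arrays. A minimal pure-Python sketch of that index mapping (names illustrative, not part of the tutorial files):

```python
# Sketch of the Ch1.py index mapping: with grid=(M, 1, 1) and block=(N, 1, 1),
# thread (bidx, tidx) computes y[bidx, tidx] += alpha * x[bidx, tidx].
import numpy as np

M, N, alpha = 256, 32, 2.0
x = np.random.randn(M, N).astype(np.float32)
y = np.ones((M, N), np.float32)

for bidx in range(M):      # one thread block per row
    for tidx in range(N):  # one thread per column
        y[bidx, tidx] += x[bidx, tidx] * alpha
```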
mlir/test/Examples/NVGPU/Ch2.py

Lines changed: 93 additions & 0 deletions
@@ -0,0 +1,93 @@
# RUN: env SUPPORT_LIB=%mlir_cuda_runtime \
# RUN:   %PYTHON %s | FileCheck %s

# ===----------------------------------------------------------------------===//
# Chapter 2 : 2D Saxpy with TMA
# ===----------------------------------------------------------------------===//
#
# This program demonstrates 2D Saxpy. It is the same as Chapter 1,
# but it loads data using TMA (Tensor Memory Accelerator).
#
# This chapter demonstrates:
# 1. Compute 2D SAXPY in the same way as Ch1.py, but load data using TMA
# 2. Create and initialize one asynchronous transactional barrier (mbarrier)
# 3. Thread 0 issues the TMA load request for each thread block
# 4. Each thread block loads a <1x32xf32> tile for x and one for y
# 5. Wait for completion of the TMA load with the mbarrier
#
# ===----------------------------------------------------------------------===//

from mlir import ir
from mlir.dialects import nvgpu, scf, arith, memref, vector, gpu
from tools.nvdsl import *
from mlir import runtime as rt
from mlir.extras import types as T
import numpy as np


@NVDSL.mlir_func
def saxpy(x, y, alpha):
    token_ty = ir.Type.parse("!gpu.async.token")
    t1 = gpu.wait(token_ty, [])
    x_dev, t2 = gpu.alloc(x.type, token_ty, [t1], [], [])
    y_dev, t3 = gpu.alloc(y.type, token_ty, [t2], [], [])
    t4 = gpu.memcpy(token_ty, [t3], x_dev, x)
    t5 = gpu.memcpy(token_ty, [t4], y_dev, y)
    t6 = gpu.wait(token_ty, [t5])

    x_tma = TMA([1, N], x.type)
    y_tma = TMA([1, N], y.type)
    x_tma.create_descriptor(x_dev)
    y_tma.create_descriptor(y_dev)
    sz_x = get_type_size(x_tma.tma_memref)
    sz_y = get_type_size(y_tma.tma_memref)
    sz = sz_x + sz_y

    @NVDSL.mlir_gpu_launch(grid=(M, 1, 1), block=(N, 1, 1), smem=sz)
    def saxpy_tma_kernel():
        bidx = gpu.block_id(gpu.Dimension.x)
        tidx = gpu.thread_id(gpu.Dimension.x)
        isThread0 = tidx == 0

        # 1. Create and initialize asynchronous transactional barrier (mbarrier)
        mbar_group = Mbarriers(number_of_barriers=1)
        mbar_group[0].init(1, predicate=isThread0)

        # 2. Execute Tensor Memory Accelerator (TMA) Load
        x_smem = get_dynamic_shared_memory([1, N], T.f32())
        y_smem = get_dynamic_shared_memory([1, N], T.f32(), offset=sz_x)
        x_tma.load(x_smem, mbar_group[0], coords=[0, bidx], predicate=isThread0)
        y_tma.load(y_smem, mbar_group[0], coords=[0, bidx], predicate=isThread0)
        mbar_group[0].arrive(txcount=sz, predicate=isThread0)

        # 3. Wait for completion of TMA load with mbarrier
        mbar_group[0].try_wait()

        x_val = memref.load(x_smem, [const(0), tidx])
        y_val = memref.load(y_smem, [const(0), tidx])

        # SAXPY: y[i] += a * x[i];
        y_val += x_val * alpha

        memref.store(y_val, y_dev, [bidx, tidx])

    saxpy_tma_kernel()

    t7 = gpu.memcpy(token_ty, [t6], y, y_dev)
    gpu.wait(token_ty, [t7])


# 3. Pass numpy arrays to MLIR
M = 256
N = 32
alpha = 2.0
x = np.random.randn(M, N).astype(np.float32)
y = np.ones((M, N), np.float32)
saxpy(x, y, alpha)

# 4. Verify MLIR with reference computation
ref = np.ones((M, N), np.float32)
ref += x * alpha
np.testing.assert_allclose(y, ref, rtol=5e-03, atol=1e-01)
print("PASS")
# CHECK-NOT: Mismatched elements

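The mbarrier in Ch2.py completes only after the expected number of bytes has arrived, so the `txcount` passed to `arrive` has to equal the bytes delivered by the two TMA loads. A quick worked check of that arithmetic (plain Python, names illustrative):

```python
import numpy as np

# Each thread block loads one <1x32xf32> tile for x and one for y.
N = 32
f32_bytes = np.dtype(np.float32).itemsize  # 4
sz_x = 1 * N * f32_bytes                   # 128 bytes for the x tile
sz_y = 1 * N * f32_bytes                   # 128 bytes for the y tile
sz = sz_x + sz_y                           # 256 bytes: txcount and dynamic smem size
assert sz == 256
```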
mlir/test/Examples/NVGPU/Ch3.py

Lines changed: 129 additions & 0 deletions
@@ -0,0 +1,129 @@
# RUN: env SUPPORT_LIB=%mlir_cuda_runtime \
# RUN:   %PYTHON %s | FileCheck %s

# ===----------------------------------------------------------------------===//
# Chapter 3 : GEMM 128x128x64 with Tensor Core
# ===----------------------------------------------------------------------===//
#
# This program demonstrates a 128x128x64 GEMM (matrix multiplication)
#
# This chapter demonstrates:
# 1. Execute a TMA load for the two input matrices
# 2. Perform Tensor Core GEMM 128x128x64 by a warpgroup
# 3. Store the fragmented accumulator registers to global memory by the warpgroup
#
# ===----------------------------------------------------------------------===//


from mlir import ir
from mlir.dialects import nvgpu, scf, arith, memref, vector, gpu
from tools.nvdsl import *
from mlir.extras import types as T
import numpy as np


def tma_load(
    mbar_group: Mbarriers,
    a_tma: TMA,
    b_tma: TMA,
    p,
):
    """
    TMA loads two input matrices from global memory to shared memory.
    It performs the following operations:

       - tma.load a_shared_memory[0] at coordinate [0, 0]  (Loads 128x64)
       - tma.load b_shared_memory[0] at coordinate [0, 0]  (Loads 64x64)
       - tma.load b_shared_memory[1] at coordinate [64, 0] (Loads 64x64)

       mbarrier.arrive ta_count = 128x64xf16 + 64x128xf16
    """

    size_tma_a = get_type_size(a_tma.tma_memref)
    size_tma_b = get_type_size(b_tma.tma_memref)
    ta_count = size_tma_a + (size_tma_b * 2)

    off_b = size_tma_a
    off_b2 = off_b + size_tma_b
    a_elem_ty = a_tma.tma_memref.element_type
    b_elem_ty = b_tma.tma_memref.element_type
    a = get_dynamic_shared_memory(a_tma.tma_memref.shape, a_elem_ty)
    b1 = get_dynamic_shared_memory(b_tma.tma_memref.shape, b_elem_ty, off_b)
    b2 = get_dynamic_shared_memory(b_tma.tma_memref.shape, b_elem_ty, off_b2)

    mbar_group[0].arrive(ta_count, predicate=p)

    a_tma.load(a, mbar_group[0], coords=[0, 0], predicate=p)
    b_tma.load(b1, mbar_group[0], coords=[0, 0], predicate=p)
    b_tma.load(b2, mbar_group[0], coords=[64, 0], predicate=p)


@NVDSL.mlir_func
def gemm_128_128_64(a, b, d):
    token_ty = ir.Type.parse("!gpu.async.token")
    t1 = gpu.wait(token_ty, [])
    a_dev, t2 = gpu.alloc(a.type, token_ty, [t1], [], [])
    b_dev, t3 = gpu.alloc(b.type, token_ty, [t2], [], [])
    d_dev, t4 = gpu.alloc(d.type, token_ty, [t3], [], [])
    t5 = gpu.memcpy(token_ty, [t4], a_dev, a)
    t6 = gpu.memcpy(token_ty, [t5], b_dev, b)
    t7 = gpu.wait(token_ty, [t6])

    sw = nvgpu.TensorMapSwizzleKind.SWIZZLE_128B
    a_tma = TMA([128, 64], a.type, swizzle=sw)
    b_tma = TMA([64, 64], b.type, swizzle=sw)
    a_tma.create_descriptor(a_dev)
    b_tma.create_descriptor(b_dev)
    a_size = get_type_size(a.type)
    b_size = get_type_size(b.type)
    smem_size_in_bytes = a_size + b_size

    @NVDSL.mlir_gpu_launch(grid=(1, 1, 1), block=(128, 1, 1), smem=smem_size_in_bytes)
    def gemm_tma_kernel():
        tidx = gpu.thread_id(gpu.Dimension.x)

        mbar_group = Mbarriers(number_of_barriers=1)
        isThread0 = tidx == 0

        mbar_group[0].init(1, predicate=isThread0)
        a_tma.prefetch(predicate=isThread0)
        b_tma.prefetch(predicate=isThread0)

        a_smem = get_dynamic_shared_memory((M, K), T.f16())
        b_smem = get_dynamic_shared_memory((K, N), T.f16(), offset=a_size)

        # 1. TMA load for the two input matrices
        tma_load(mbar_group, a_tma, b_tma, isThread0)

        # 2. All threads wait for TMA load completion
        mbar_group[0].try_wait()

        # 3. Perform Tensor Core GEMM 128x128x64 by a warpgroup
        A = WGMMAMatrix(WGMMAType.Descriptor, [M, K], desc=a_tma, smem=a_smem)
        B = WGMMAMatrix(WGMMAType.Descriptor, [K, N], desc=b_tma, smem=b_smem)
        D = WGMMAMatrix(WGMMAType.Accumulator, shape=[M, N], ty=T.f32())

        # Matrix Multiply
        D += A @ B

        # 4. Store the fragmented accumulator registers to global memory by the warpgroup
        D.store_accumulator(d_dev)

    gemm_tma_kernel()

    t8 = gpu.memcpy(token_ty, [t7], d, d_dev)
    gpu.wait(None, [t8])


# Pass arguments from Python to MLIR
M = 128
N = 128
K = 64
a = np.random.randn(M, K).astype(np.float16)
b = np.random.randn(K, N).astype(np.float16)
d = np.zeros((M, N), np.float32)
gemm_128_128_64(a, b, d)

ref_d = a.astype(np.float16) @ b.astype(np.float16)
np.testing.assert_allclose(d, ref_d, rtol=5e-03, atol=1e-01)
print("PASS")
# CHECK-NOT: Mismatched elements

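The shared-memory offsets and the mbarrier transaction count in Ch3.py all derive from the tile sizes: A is one 128x64 f16 tile and B is brought in as two 64x64 f16 tiles. A quick worked check of those numbers (plain Python, names illustrative):

```python
import numpy as np

f16_bytes = np.dtype(np.float16).itemsize  # 2
size_tma_a = 128 * 64 * f16_bytes          # 16384 bytes for the A tile
size_tma_b = 64 * 64 * f16_bytes           # 8192 bytes per B tile
ta_count = size_tma_a + 2 * size_tma_b     # 32768 bytes expected by the mbarrier

off_b = size_tma_a                         # b1 placed right after A in shared memory
off_b2 = off_b + size_tma_b                # b2 placed right after b1

smem_size_in_bytes = 128 * 64 * f16_bytes + 64 * 128 * f16_bytes  # a_size + b_size
assert ta_count == smem_size_in_bytes == 32768
```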