 from mlir import runtime as rt


-def generate_matmul(input_type=np.float16,
-                    output_type=np.float32,
-                    M=4096,
-                    N=4096,
-                    K=4096,
-                    BLOCK_M=128,
-                    BLOCK_N=128,
-                    BLOCK_K=64,
-                    use_warp_specilization=True,
-                    saveIR=False,
-                    max_num_stages=3):
-    with matmulBuilder.ir.Context() as ctx, matmulBuilder.ir.Location.unknown(
-    ):
+def generate_matmul(
+    input_type=np.float16,
+    output_type=np.float32,
+    M=4096,
+    N=4096,
+    K=4096,
+    BLOCK_M=128,
+    BLOCK_N=128,
+    BLOCK_K=64,
+    use_warp_specilization=True,
+    saveIR=False,
+    max_num_stages=3,
+):
+    with matmulBuilder.ir.Context() as ctx, matmulBuilder.ir.Location.unknown():
         if use_warp_specilization:
             mlir_nvgpu_module = matmulBuilder.generate_matmul_ws(
-                input_type, output_type, M, N, K, BLOCK_M, BLOCK_N, BLOCK_K,
-                max_num_stages)
+                input_type,
+                output_type,
+                M,
+                N,
+                K,
+                BLOCK_M,
+                BLOCK_N,
+                BLOCK_K,
+                max_num_stages,
+            )
         else:
             mlir_nvgpu_module = matmulBuilder.generate_matmul_multistage(
-                input_type, output_type, M, N, K, BLOCK_M, BLOCK_N, BLOCK_K,
-                max_num_stages)
+                input_type,
+                output_type,
+                M,
+                N,
+                K,
+                BLOCK_M,
+                BLOCK_N,
+                BLOCK_K,
+                max_num_stages,
+            )

         mlir_nvgpu_module.operation.verify()

         # Save generated IR
         if saveIR:
             # print(mlir_nvgpu_module)
             original_stdout = sys.stdout
-            with open('gemm.mlir', 'w') as f:
+            with open("gemm.mlir", "w") as f:
                 sys.stdout = f
                 print(mlir_nvgpu_module)
                 sys.stdout = original_stdout
@@ -123,43 +140,77 @@ def generate_matmul(input_type=np.float16,
         options = f"cubin-chip=sm_90a cubin-features=+ptx80 opt-level=3"
         support_lib = os.getenv("SUPPORT_LIB")
         if not os.path.exists(support_lib):
-            raise FileNotFoundError(errno.ENOENT, os.strerror(errno.ENOENT),
-                                    support_lib)
-        compiler = nvgpucompiler.NvgpuCompiler(options,
-                                               opt_level=3,
-                                               shared_libs=[support_lib])
+            raise FileNotFoundError(
+                errno.ENOENT, os.strerror(errno.ENOENT), support_lib
+            )
+        compiler = nvgpucompiler.NvgpuCompiler(
+            options, opt_level=3, shared_libs=[support_lib]
+        )

         # Compile
         engine = compiler.compile_and_jit(mlir_nvgpu_module)
         return engine


-def matmul(input_type=np.float16,
-           output_type=np.float32,
-           M=128,
-           N=128,
-           K=128,
-           BLOCK_M=128,
-           BLOCK_N=128,
-           BLOCK_K=64,
-           use_warp_specilization=True,
-           saveIR=False,
-           max_num_stages=3,
-           print_results=False,
-           no_verify=False):
+def matmul(
+    input_type=np.float16,
+    output_type=np.float32,
+    M=128,
+    N=128,
+    K=128,
+    BLOCK_M=128,
+    BLOCK_N=128,
+    BLOCK_K=64,
+    use_warp_specilization=True,
+    saveIR=False,
+    max_num_stages=3,
+    print_results=False,
+    no_verify=False,
+):
     # Print the configuration
     ity = "f16" if input_type == np.float16 else "f32"
     oty = "f16" if output_type == np.float16 else "f32"
     gemmty = "Warp Specilization" if use_warp_specilization else "Multistage"
-    print("===-- Running GEMM " + gemmty + " " + oty + " += " + ity + " * " +
-          ity + ", Size " + str(M) + "x" + str(N) + "x" + str(K) + ", Tile " +
-          str(BLOCK_M) + "x" + str(BLOCK_N) + "x" + str(BLOCK_K) +
-          ", stages " + str(max_num_stages) + " --===")
+    print(
+        "===-- Running GEMM "
+        + gemmty
+        + " "
+        + oty
+        + " += "
+        + ity
+        + " * "
+        + ity
+        + ", Size "
+        + str(M)
+        + "x"
+        + str(N)
+        + "x"
+        + str(K)
+        + ", Tile "
+        + str(BLOCK_M)
+        + "x"
+        + str(BLOCK_N)
+        + "x"
+        + str(BLOCK_K)
+        + ", stages "
+        + str(max_num_stages)
+        + " --==="
+    )

     # Build IR and compile
-    engine = generate_matmul(input_type, output_type, M, N, K, BLOCK_M,
-                             BLOCK_N, BLOCK_K, use_warp_specilization, saveIR,
-                             max_num_stages)
+    engine = generate_matmul(
+        input_type,
+        output_type,
+        M,
+        N,
+        K,
+        BLOCK_M,
+        BLOCK_N,
+        BLOCK_K,
+        use_warp_specilization,
+        saveIR,
+        max_num_stages,
+    )

     # Allocate matrices and invoke the matmul
     c = np.zeros((M, N), output_type)
@@ -168,13 +219,17 @@ def matmul(input_type=np.float16,
     mem_a = ctypes.pointer(ctypes.pointer(rt.get_ranked_memref_descriptor(a)))
     mem_b = ctypes.pointer(ctypes.pointer(rt.get_ranked_memref_descriptor(b)))
     mem_c = ctypes.pointer(ctypes.pointer(rt.get_ranked_memref_descriptor(c)))
-    kernelName = "mlir_matmul_warpspecialized" if use_warp_specilization else "mlir_matmul_multistage"
+    kernelName = (
+        "mlir_matmul_warpspecialized"
+        if use_warp_specilization
+        else "mlir_matmul_multistage"
+    )

     # Launch the MLIR generated kernel
     engine.invoke(kernelName, mem_a, mem_b, mem_c)

     float_formatter = "{:.2f}".format
-    np.set_printoptions(formatter={'float_kind': float_formatter})
+    np.set_printoptions(formatter={"float_kind": float_formatter})

     if print_results:
         print(c)
@@ -190,18 +245,22 @@ def matmul(input_type=np.float16,


 # GEMM Multistage f32 += f16 * f16
-matmul(np.float16,
-       np.float32,
-       128,
-       128,
-       4096,
-       max_num_stages=3,
-       use_warp_specilization=False)
+matmul(
+    np.float16,
+    np.float32,
+    128,
+    128,
+    4096,
+    max_num_stages=3,
+    use_warp_specilization=False,
+)
 # GEMM Warp Specilized f32 += f16 * f16
-matmul(np.float16,
-       np.float32,
-       256,
-       1024,
-       512,
-       max_num_stages=3,
-       use_warp_specilization=True)
+matmul(
+    np.float16,
+    np.float32,
+    256,
+    1024,
+    512,
+    max_num_stages=3,
+    use_warp_specilization=True,
+)
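For readers skimming the commit: at runtime the driver above hands each NumPy matrix to the JIT-compiled kernel as a pointer to a pointer to a ranked memref descriptor, which is why every array is wrapped twice with ctypes. Below is a minimal end-to-end sketch of that calling convention plus a host-side reference check. It reuses the script's own generate_matmul() and the "mlir_matmul_multistage" entry point from the diff; the SUPPORT_LIB path and the comparison tolerance are placeholders of mine, not values from the commit.

import ctypes
import os

import numpy as np
from mlir import runtime as rt

# Placeholder path; point this at the MLIR CUDA runtime wrappers on your system.
os.environ["SUPPORT_LIB"] = "/path/to/libmlir_cuda_runtime.so"

M, N, K = 128, 128, 4096
a = np.random.randn(M, K).astype(np.float16)
b = np.random.randn(K, N).astype(np.float16)
c = np.zeros((M, N), np.float32)

# Build the IR and JIT-compile it with the helper defined in the diff;
# BLOCK_M/BLOCK_N/BLOCK_K keep their defaults (128x128x64 tiles).
engine = generate_matmul(
    np.float16, np.float32, M, N, K, use_warp_specilization=False, max_num_stages=3
)

# ExecutionEngine ABI: each argument is a pointer to a pointer to a ranked
# memref descriptor that wraps the NumPy buffer in place (no copy).
mem_a = ctypes.pointer(ctypes.pointer(rt.get_ranked_memref_descriptor(a)))
mem_b = ctypes.pointer(ctypes.pointer(rt.get_ranked_memref_descriptor(b)))
mem_c = ctypes.pointer(ctypes.pointer(rt.get_ranked_memref_descriptor(c)))
engine.invoke("mlir_matmul_multistage", mem_a, mem_b, mem_c)

# f32 += f16 * f16 reference on the host; the tolerance is illustrative only.
ref = a.astype(np.float32) @ b.astype(np.float32)
assert np.allclose(c, ref, rtol=5e-2)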