Commit ad767a6

format
1 parent 09fb9c8 commit ad767a6

2 files changed (+44, -75 lines)


mlir/test/Integration/GPU/CUDA/sm90/python/matmul.py

Lines changed: 40 additions & 68 deletions
@@ -252,118 +252,90 @@ def matmul(
 
     print("PASS ")
 
+
 # Takes longer time to run
 def test_long():
-    for stages in range(1,7):
+    for stages in range(1, 7):
         for M in [128, 512, 1024, 4096, 8192]:
             for N in [128, 512, 1024, 4096, 8192]:
                 for K in [64, 128, 512, 1024, 4096, 8192]:
                     matmul(
                         np.float16,
                         np.float32,
-                        M,N,
+                        M,
+                        N,
                         K,
                         max_num_stages=stages,
                         use_warp_specialization=False,
                         no_verify=True,
-                        saveIR=True
                     )
                     matmul(
                         np.float16,
                         np.float32,
-                        M,N,
+                        M,
+                        N,
                         K,
                         max_num_stages=stages,
-                        use_warp_specialization=True,
+                        use_warp_specialization=True,
                     )
 
+
 def test_short():
     for stages in [1, 3]:
         for M in [128, 512]:
-            for N in [128, 512]:
-                for K in [64, 512]:
+            for N in [128]:
+                for K in [64, 256]:
                     matmul(
                         np.float16,
                         np.float32,
-                        M,N,
+                        M,
+                        N,
                         K,
                         max_num_stages=stages,
                         use_warp_specialization=False,
-                        no_verify=True,
-                        saveIR=True
                     )
                     matmul(
                         np.float16,
                         np.float32,
-                        M,N,
+                        M,
+                        N,
                         K,
                         max_num_stages=stages,
-                        use_warp_specialization=True,
+                        use_warp_specialization=True,
                     )
 
+
 # CHECK: ===-- Running GEMM Multistage f32 += f16 * f16, Size 128x128x64, Tile 128x128x64, stages 1 --===
-# CHECK: PASS
+# CHECK: PASS
 # CHECK: ===-- Running GEMM Warp specialization f32 += f16 * f16, Size 128x128x64, Tile 128x128x64, stages 1 --===
-# CHECK: PASS
-# CHECK: ===-- Running GEMM Multistage f32 += f16 * f16, Size 128x128x512, Tile 128x128x64, stages 1 --===
-# CHECK: PASS
-# CHECK: ===-- Running GEMM Warp specialization f32 += f16 * f16, Size 128x128x512, Tile 128x128x64, stages 1 --===
-# CHECK: PASS
-# CHECK: ===-- Running GEMM Multistage f32 += f16 * f16, Size 128x512x64, Tile 128x128x64, stages 1 --===
-# CHECK: PASS
-# CHECK: ===-- Running GEMM Warp specialization f32 += f16 * f16, Size 128x512x64, Tile 128x128x64, stages 1 --===
-# CHECK: PASS
-# CHECK: ===-- Running GEMM Multistage f32 += f16 * f16, Size 128x512x512, Tile 128x128x64, stages 1 --===
-# CHECK: PASS
-# CHECK: ===-- Running GEMM Warp specialization f32 += f16 * f16, Size 128x512x512, Tile 128x128x64, stages 1 --===
-# CHECK: PASS
+# CHECK: PASS
+# CHECK: ===-- Running GEMM Multistage f32 += f16 * f16, Size 128x128x256, Tile 128x128x64, stages 1 --===
+# CHECK: PASS
+# CHECK: ===-- Running GEMM Warp specialization f32 += f16 * f16, Size 128x128x256, Tile 128x128x64, stages 1 --===
+# CHECK: PASS
 # CHECK: ===-- Running GEMM Multistage f32 += f16 * f16, Size 512x128x64, Tile 128x128x64, stages 1 --===
-# CHECK: PASS
+# CHECK: PASS
 # CHECK: ===-- Running GEMM Warp specialization f32 += f16 * f16, Size 512x128x64, Tile 128x128x64, stages 1 --===
-# CHECK: PASS
-# CHECK: ===-- Running GEMM Multistage f32 += f16 * f16, Size 512x128x512, Tile 128x128x64, stages 1 --===
-# CHECK: PASS
-# CHECK: ===-- Running GEMM Warp specialization f32 += f16 * f16, Size 512x128x512, Tile 128x128x64, stages 1 --===
-# CHECK: PASS
-# CHECK: ===-- Running GEMM Multistage f32 += f16 * f16, Size 512x512x64, Tile 128x128x64, stages 1 --===
-# CHECK: PASS
-# CHECK: ===-- Running GEMM Warp specialization f32 += f16 * f16, Size 512x512x64, Tile 128x128x64, stages 1 --===
-# CHECK: PASS
-# CHECK: ===-- Running GEMM Multistage f32 += f16 * f16, Size 512x512x512, Tile 128x128x64, stages 1 --===
-# CHECK: PASS
-# CHECK: ===-- Running GEMM Warp specialization f32 += f16 * f16, Size 512x512x512, Tile 128x128x64, stages 1 --===
-# CHECK: PASS
+# CHECK: PASS
+# CHECK: ===-- Running GEMM Multistage f32 += f16 * f16, Size 512x128x256, Tile 128x128x64, stages 1 --===
+# CHECK: PASS
+# CHECK: ===-- Running GEMM Warp specialization f32 += f16 * f16, Size 512x128x256, Tile 128x128x64, stages 1 --===
+# CHECK: PASS
 # CHECK: ===-- Running GEMM Multistage f32 += f16 * f16, Size 128x128x64, Tile 128x128x64, stages 1 --===
-# CHECK: PASS
+# CHECK: PASS
 # CHECK: ===-- Running GEMM Warp specialization f32 += f16 * f16, Size 128x128x64, Tile 128x128x64, stages 1 --===
-# CHECK: PASS
-# CHECK: ===-- Running GEMM Multistage f32 += f16 * f16, Size 128x128x512, Tile 128x128x64, stages 3 --===
-# CHECK: PASS
-# CHECK: ===-- Running GEMM Warp specialization f32 += f16 * f16, Size 128x128x512, Tile 128x128x64, stages 3 --===
-# CHECK: PASS
-# CHECK: ===-- Running GEMM Multistage f32 += f16 * f16, Size 128x512x64, Tile 128x128x64, stages 2 --===
-# CHECK: PASS
-# CHECK: ===-- Running GEMM Warp specialization f32 += f16 * f16, Size 128x512x64, Tile 128x128x64, stages 2 --===
-# CHECK: PASS
-# CHECK: ===-- Running GEMM Multistage f32 += f16 * f16, Size 128x512x512, Tile 128x128x64, stages 3 --===
-# CHECK: PASS
-# CHECK: ===-- Running GEMM Warp specialization f32 += f16 * f16, Size 128x512x512, Tile 128x128x64, stages 3 --===
-# CHECK: PASS
+# CHECK: PASS
+# CHECK: ===-- Running GEMM Multistage f32 += f16 * f16, Size 128x128x256, Tile 128x128x64, stages 3 --===
+# CHECK: PASS
+# CHECK: ===-- Running GEMM Warp specialization f32 += f16 * f16, Size 128x128x256, Tile 128x128x64, stages 3 --===
+# CHECK: PASS
 # CHECK: ===-- Running GEMM Multistage f32 += f16 * f16, Size 512x128x64, Tile 128x128x64, stages 2 --===
-# CHECK: PASS
+# CHECK: PASS
 # CHECK: ===-- Running GEMM Warp specialization f32 += f16 * f16, Size 512x128x64, Tile 128x128x64, stages 2 --===
-# CHECK: PASS
-# CHECK: ===-- Running GEMM Multistage f32 += f16 * f16, Size 512x128x512, Tile 128x128x64, stages 3 --===
-# CHECK: PASS
-# CHECK: ===-- Running GEMM Warp specialization f32 += f16 * f16, Size 512x128x512, Tile 128x128x64, stages 3 --===
-# CHECK: PASS
-# CHECK: ===-- Running GEMM Multistage f32 += f16 * f16, Size 512x512x64, Tile 128x128x64, stages 3 --===
-# CHECK: PASS
-# CHECK: ===-- Running GEMM Warp specialization f32 += f16 * f16, Size 512x512x64, Tile 128x128x64, stages 3 --===
-# CHECK: PASS
-# CHECK: ===-- Running GEMM Multistage f32 += f16 * f16, Size 512x512x512, Tile 128x128x64, stages 3 --===
-# CHECK: PASS
-# CHECK: ===-- Running GEMM Warp specialization f32 += f16 * f16, Size 512x512x512, Tile 128x128x64, stages 3 --===
+# CHECK: PASS
+# CHECK: ===-- Running GEMM Multistage f32 += f16 * f16, Size 512x128x256, Tile 128x128x64, stages 3 --===
+# CHECK: PASS
+# CHECK: ===-- Running GEMM Warp specialization f32 += f16 * f16, Size 512x128x256, Tile 128x128x64, stages 3 --===
 # CHECK: PASS
 
-test_short()
+test_short()
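
For reference, the trimmed test_short sweep reduces to pairs of calls like the one below: a minimal sketch assuming this test file's matmul() helper with positional M, N, K arguments (only the first configuration of the new matrix is shown), run once with the multistage pipeline and once with warp specialization. Each run emits the "===-- Running GEMM ... --===" banner and PASS line that the CHECK directives above match.

    import numpy as np

    # First point of the new test_short matrix: stages=1, M=128, N=128, K=64.
    matmul(np.float16, np.float32, 128, 128, 64,
           max_num_stages=1, use_warp_specialization=False)
    matmul(np.float16, np.float32, 128, 128, 64,
           max_num_stages=1, use_warp_specialization=True)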

mlir/test/Integration/GPU/CUDA/sm90/python/tools/matmulBuilder.py

Lines changed: 4 additions & 7 deletions
@@ -117,11 +117,7 @@ def make_kernel_name(
     num_stages=3,
     use_warp_specialization=False,
 ):
-    kernelName = (
-        "warpspecialized"
-        if use_warp_specialization
-        else "multistage"
-    )
+    kernelName = "warpspecialized" if use_warp_specialization else "multistage"
     return (
         kernelName
         + "_"
@@ -140,6 +136,7 @@ def make_kernel_name(
         + str(num_stages)
     )
 
+
 def generate_matmul_ws(
     input_type=np.float16,
     output_type=np.float32,
@@ -644,7 +641,7 @@ def generate_matmul_ws(
         t8 = gpu.wait(token_ty, [launch_op])
         t9 = gpu.memcpy(token_ty, [t8], c_host, c_device)
         gpu.dealloc(token_ty, [t8], a_device)
-        gpu.dealloc(token_ty, [t8], b_device)
+        gpu.dealloc(token_ty, [t8], b_device)
         gpu.wait(token_ty, [t9])
         gpu.dealloc(token_ty, [t8], c_device)
         func.ReturnOp([])
@@ -1162,7 +1159,7 @@ def generate_matmul_multistage(
         t8 = gpu.wait(token_ty, [launch_op])
         t9 = gpu.memcpy(token_ty, [t8], c_host, c_device)
         gpu.dealloc(token_ty, [t8], a_device)
-        gpu.dealloc(token_ty, [t8], b_device)
+        gpu.dealloc(token_ty, [t8], b_device)
         gpu.wait(token_ty, [t9])
         gpu.dealloc(token_ty, [t8], c_device)
         func.ReturnOp([])
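
Both generate_matmul_ws and generate_matmul_multistage end the host-side function with the same asynchronous teardown that the last two hunks touch. A commented sketch of that pattern, using only the names visible in the diff context (the surrounding MLIR builder setup is assumed):

    t8 = gpu.wait(token_ty, [launch_op])                # async token: kernel launch has completed
    t9 = gpu.memcpy(token_ty, [t8], c_host, c_device)   # copy the result C back to the host
    gpu.dealloc(token_ty, [t8], a_device)               # A and B depend only on the launch token
    gpu.dealloc(token_ty, [t8], b_device)
    gpu.wait(token_ty, [t9])                            # block until the copy has finished
    gpu.dealloc(token_ty, [t8], c_device)               # then free the output buffer
    func.ReturnOp([])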
