intel
diff --git a/‎include/gc/Transforms/Passes.td
Lines changed: 30 additions & 0 deletions b/‎include/gc/Transforms/Passes.td
Lines changed: 30 additions & 0 deletions
diff --git a/‎lib/gc/Transforms/GPU/CMakeLists.txt
Lines changed: 1 addition & 0 deletions b/‎lib/gc/Transforms/GPU/CMakeLists.txt
Lines changed: 1 addition & 0 deletions
@@ -120,6 +120,36 @@ def GpuToGpuOcl : Pass<"gpu-to-gpuocl", "ModuleOp"> {
            "Call finish() after each kernel launch.">
     ];
 }
+
+def GpuTilingAndFusion : Pass<"gpu-tiling", "func::FuncOp"> {
+  let summary = "GPU tiling and fusion path.";
+  let description = [{
+    This pass tiles linalg operations and creates two nested scf.forall loops. When converting to gpu.launch,
+    the inner loop is mapped to the block sizes and the outer one to the grid sizes. The tile size calculation
+    is based on the GPU device properties retrieved from the DLTI attributes. If the DLTI attributes are not
+    specified, the values of the pass options are used instead.
+  }];
+  let options = [
+    Option<"numEus", "num-eus", "size_t",
+           /*default=*/"448",
+           "Number of Execution Units.">,
+    Option<"numEusPerSlice", "num-eus-per-slice", "size_t",
+           /*default=*/"8",
+           "Number of Execution Units per slice.">,
+    Option<"numThreadsPerEu", "num-threads-per-eu", "size_t",
+           /*default=*/"8",
+           "Number of threads per Execution Unit.">,
+    Option<"cacheSize", "cache-size", "size_t",
+           /*default=*/"131072",
+           "Execution Unit cache size.">,
+    Option<"vectorWidth", "vector-width", "size_t",
+           /*default=*/"512",
+           "The maximum width of EU's vector registers.">,
+    Option<"workGroupSize", "work-group-size", "size_t",
+           /*default=*/"64",
+           "The maximum workgroup size.">
+    ];
+}
 #endif // GC_USE_IMEX
 
 def IterativeTilingAndFusion : Pass<"iterative-tiling-and-fusion",
 
@@ -13,6 +13,7 @@ set_property(GLOBAL APPEND PROPERTY IMEX_LIBS ${IMEX_LIBS})
 gc_add_mlir_library(GcGpuPasses
   AddContextArg.cpp
   AllocsToSLM.cpp
+  GpuTilingAndFusion.cpp
   GpuToGpuOcl.cpp
   LinalgToXeGPU.cpp
   Pipeline.cpp