intel
diff --git a/‎include/gc/Transforms/Passes.td
Lines changed: 29 additions & 0 deletions b/‎include/gc/Transforms/Passes.td
Lines changed: 29 additions & 0 deletions
diff --git a/‎lib/gc/Transforms/GPU/CMakeLists.txt
Lines changed: 2 additions & 0 deletions b/‎lib/gc/Transforms/GPU/CMakeLists.txt
Lines changed: 2 additions & 0 deletions
diff --git a/‎lib/gc/Transforms/GPU/GpuLoopTiling.cpp
Lines changed: 74 additions & 0 deletions b/‎lib/gc/Transforms/GPU/GpuLoopTiling.cpp
Lines changed: 74 additions & 0 deletions
@@ -120,6 +120,35 @@ def GpuToGpuOcl : Pass<"gpu-to-gpuocl", "ModuleOp"> {
            "Call finish() after each kernel launch.">
     ];
 }
+
+// Declares the `gpu-tiling` pass, anchored on func::FuncOp. Both options
+// describe the target Execution Unit (EU).
+def GpuTilingAndFusion : Pass<"gpu-tiling", "func::FuncOp"> {
+  let summary = "GPU tiling and fusion pass.";
+  let description = [{
+    This pass tiles linalg operations and wraps them into foreach loops.
+    The tile size calculation is based on the Execution Unit (EU) cache size
+    and the number of threads per EU.
+  }];
+  let options = [
+    // NOTE(review): the unit of the default (131072) is presumably bytes;
+    // confirm against the code that consumes `euMem`.
+    Option<"euMem", "eu-mem", "size_t",
+           /*default=*/"131072",
+           "Execution Unit cache size.">,
+    Option<"euThreads", "eu-threads", "size_t",
+           /*default=*/"8",
+           "Number of threads per Execution Unit.">
+    ];
+}
+
+// Declares the `gpu-loop-tiling` pass, anchored on func::FuncOp.
+def GpuLoopTiling : Pass<"gpu-loop-tiling", "func::FuncOp"> {
+  let summary = "Create nested parallel loops to be mapped to GPU.";
+  let description = [{
+    This pass tiles the loops that were created by the GpuTilingAndFusion pass
+    and then converted to parallel loops.
+    Each tile of the outer loop is divided by the number of threads per
+    Execution Unit (EU).
+  }];
+  let options = [
+    Option<"euThreads", "eu-threads", "size_t",
+           /*default=*/"8",
+           "Number of threads per Execution Unit.">
+    ];
+}
 #endif // GC_USE_IMEX
 
 def IterativeTilingAndFusion : Pass<"iterative-tiling-and-fusion",
 
@@ -13,6 +13,8 @@ set_property(GLOBAL APPEND PROPERTY IMEX_LIBS ${IMEX_LIBS})
 gc_add_mlir_library(GcGpuPasses
   AddContextArg.cpp
   AllocsToSLM.cpp
+  GpuLoopTiling.cpp
+  GpuTilingAndFusion.cpp
   GpuToGpuOcl.cpp
   LinalgToXeGPU.cpp
   Pipeline.cpp
 
@@ -0,0 +1,74 @@
+//===-- GpuLoopTiling.cpp - GPU parallel loop tiling ------------*- C++ -*-===//
+//
+// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "mlir/Conversion/LLVMCommon/ConversionTarget.h"
+#include "mlir/Conversion/LLVMCommon/Pattern.h"
+#include "mlir/Conversion/Passes.h"
+#include "mlir/Dialect/Affine/IR/AffineOps.h"
+#include "mlir/Dialect/Affine/Utils.h"
+#include "mlir/Dialect/Func/IR/FuncOps.h"
+#include "mlir/Dialect/GPU/Transforms/Passes.h"
+#include "mlir/Dialect/Linalg/IR/Linalg.h"
+#include "mlir/Dialect/Linalg/Transforms/Transforms.h"
+#include "mlir/Dialect/SCF/Transforms/Transforms.h"
+#include "mlir/Transforms/RegionUtils.h"
+
+#include "./GpuUtils.h"
+#include "gc/Utils/Log.h"
+
+using namespace mlir;
+// using namespace mlir::gc::gpu;
+
+// Select the GpuLoopTiling sections of the TableGen-generated pass header:
+// GEN_PASS_DECL_* enables the declarations and GEN_PASS_DEF_* the definition.
+// NOTE(review): presumably this is what provides gc::GpuLoopTilingOptions and
+// gc::impl::GpuLoopTilingBase used by the pass below; confirm in the .inc.
+namespace mlir::gc {
+#define GEN_PASS_DECL_GPULOOPTILING
+#define GEN_PASS_DEF_GPULOOPTILING
+#include "gc/Transforms/Passes.h.inc"
+} // namespace mlir::gc
+
+namespace {
+
+/// Pass that tiles the outermost `scf.parallel` loops of a function.
+///
+/// For every loop dimension whose step getConstIdxValue() resolves to a
+/// non-zero value, the tile size is ceil(step / threads-per-EU). All other
+/// dimensions fall back to kDefaultTileSize.
+struct GpuLoopTiling final : GpuPass<GpuLoopTiling>,
+                             gc::impl::GpuLoopTilingBase<GpuLoopTiling> {
+  friend GpuPass;
+  explicit GpuLoopTiling() : GpuLoopTiling(gc::GpuLoopTilingOptions{}) {}
+  explicit GpuLoopTiling(const gc::GpuLoopTilingOptions &opts)
+      : GpuPass(), GpuLoopTilingBase(opts) {}
+
+  /// Tiles each top-level parallel loop, then simplifies the regions of the
+  /// function. A simplification failure is only logged; it does not fail the
+  /// pass.
+  void runOnOperation() override {
+    IRRewriter rewriter(&getContext());
+    // Deliberately not named `euThreads`, so that the pass option of the same
+    // name is not shadowed.
+    const auto threadsPerEu = static_cast<int64_t>(getEuThreads(rewriter));
+    getOperation().walk<WalkOrder::PreOrder>([&](scf::ParallelOp loop) {
+      if (!loop->getParentOfType<scf::ParallelOp>()) {
+        tile(loop, threadsPerEu);
+      }
+      // Always skip: nested parallel loops must not be tiled here, and
+      // (presumably) tileParallelLoop replaces `loop`, which a pre-order walk
+      // only tolerates when the callback returns skip().
+      return WalkResult::skip();
+    });
+    if (failed(simplifyRegions(rewriter, getOperation()->getRegions()))) {
+      gcLogD("Failed to simplify regions");
+    }
+  }
+
+private:
+  /// Tile size used for dimensions whose step is not a known constant.
+  static constexpr int64_t kDefaultTileSize = 32;
+
+  /// Tiles `loop` with per-dimension sizes of ceil(step / euThreads).
+  ///
+  /// \param loop      the parallel loop to tile.
+  /// \param euThreads number of threads per Execution Unit; values below 1
+  ///                  are treated as 1 to avoid a division by zero.
+  static void tile(scf::ParallelOp loop, int64_t euThreads) {
+    const int64_t threads = euThreads > 0 ? euThreads : 1;
+    auto steps = loop.getStep();
+    SmallVector<int64_t> tileSizes;
+    tileSizes.reserve(steps.size());
+
+    for (auto step : steps) {
+      if (auto v = getConstIdxValue(step)) {
+        // Exact integer ceil-division; avoids the round trip through double.
+        const auto stepValue = static_cast<int64_t>(v);
+        tileSizes.push_back(stepValue / threads +
+                            (stepValue % threads > 0 ? 1 : 0));
+      } else {
+        tileSizes.push_back(kDefaultTileSize);
+      }
+    }
+
+    // NOTE(review): the flag is `noMinMaxBounds` in upstream MLIR's
+    // scf::tileParallelLoop; confirm this call resolves to that overload.
+    tileParallelLoop(loop, tileSizes, /*noMinMaxBounds=*/false);
+  }
+};
+} // namespace