intel
diff --git a/‎include/gc/Transforms/Passes.td
Lines changed: 22 additions & 11 deletions b/‎include/gc/Transforms/Passes.td
Lines changed: 22 additions & 11 deletions
diff --git a/‎lib/gc/Transforms/GPU/GpuLoopTiling.cpp
Lines changed: 17 additions & 22 deletions b/‎lib/gc/Transforms/GPU/GpuLoopTiling.cpp
Lines changed: 17 additions & 22 deletions
@@ -124,29 +124,40 @@ def GpuToGpuOcl : Pass<"gpu-to-gpuocl", "ModuleOp"> {
 def GpuTilingAndFusion : Pass<"gpu-tiling", "func::FuncOp"> {
   let summary = "GPU tiling and fusion path.";
   let description = [{
-    This path tiles linalg operations and wraps into foreach loops.
-    The tiles calculation is based on the Execution Unit cache size and the number of threads per EU.
+    This pass tiles linalg operations and creates an inner loop that is mapped to the block sizes when converting
+    to gpu.launch. The tile size calculation is based on the GPU device properties retrieved from the DLTI attributes.
+    If the DLTI attributes are not specified, the values of the pass options are used instead.
   }];
   let options = [
-    Option<"euMem", "eu-mem", "size_t",
+    Option<"numEus", "num-eus", "size_t",
+           /*default=*/"448",
+           "Number of Execution Units.">,
+    Option<"numEusPerSlice", "num-eus-per-slice", "size_t",
+           /*default=*/"8",
+           "Number of Execution Units per slice.">,
+    Option<"numThreadsPerEu", "num-threads-per-eu", "size_t",
+           /*default=*/"8",
+           "Number of threads per Execution Unit.">,
+    Option<"cacheSize", "cache-size", "size_t",
            /*default=*/"131072",
            "Execution Unit cache size.">,
-    Option<"euThreads", "eu-threads", "size_t",
-           /*default=*/"8",
-           "Number of threads per EU.">
+    Option<"vectorWidth", "vector-width", "size_t",
+           /*default=*/"512",
+           "The maximum width of EU's vector registers.">
     ];
 }
 
 def GpuLoopTiling : Pass<"gpu-loop-tiling", "func::FuncOp"> {
   let summary = "Create nested parallel loops to be mapped to GPU.";
   let description = [{
-    This path tiles the loops created by the GpuTilingAndFusion pass and converted to parallel loops.
-    Each tile of the outer loop is divided by the number of threads per EU.
+    This pass tiles the loops created by the GpuTilingAndFusion pass and converted to parallel loops. The tile
+    size calculation is based on the max_work_group_size DLTI attribute. If the attribute is not specified,
+    the value of the work-group-size pass option is used instead.
   }];
   let options = [
-    Option<"euThreads", "eu-threads", "size_t",
-           /*default=*/"8",
-           "Number of threads per Execution Unit.">
+    Option<"workGroupSize", "work-group-size", "size_t",
+           /*default=*/"64",
+           "The maximum workgroup size.">
     ];
 }
 #endif // GC_USE_IMEX
 
@@ -22,7 +22,7 @@
 #include "gc/Utils/Log.h"
 
 using namespace mlir;
-// using namespace mlir::gc::gpu;
+using namespace mlir::gc;
 
 namespace mlir::gc {
 #define GEN_PASS_DECL_GPULOOPTILING
@@ -41,34 +41,29 @@ struct GpuLoopTiling final : GpuPass<GpuLoopTiling>,
 
   void runOnOperation() override {
     IRRewriter rewriter(&getContext());
-    auto euThreads = static_cast<double>(getEuThreads(rewriter));
-    getOperation().walk<WalkOrder::PreOrder>([&](scf::ParallelOp loop) {
+    auto wgSize = getWorkGroupSize(rewriter);
+    getOperation().walk<WalkOrder::PreOrder>([wgSize](scf::ParallelOp loop) {
       if (!loop->getParentOfType<scf::ParallelOp>()) {
-        tile(loop, euThreads);
+        SmallVector<int64_t> tiles;
+        auto steps = loop.getStep();
+        tiles.reserve(steps.size());
+
+        for (auto step : steps) {
+          if (auto v = getConstIdxValue(step)) {
+            tiles.push_back(v);
+          } else {
+            tiles.push_back(32);
+          }
+        }
+
+        adjustTiles(wgSize, tiles);
+        tileParallelLoop(loop, tiles, false);
       }
       return WalkResult::skip();
     });
     if (failed(simplifyRegions(rewriter, getOperation()->getRegions()))) {
       gcLogD("Failed to simplify regions");
     }
   }
-
-private:
-  static void tile(scf::ParallelOp loop, double euThreads) {
-    SmallVector<int64_t> tileSizes;
-    auto steps = loop.getStep();
-    tileSizes.reserve(steps.size());
-
-    for (auto step : steps) {
-      if (auto v = getConstIdxValue(step)) {
-        tileSizes.push_back(static_cast<int64_t>(
-            std::ceil(static_cast<double>(v) / euThreads)));
-      } else {
-        tileSizes.push_back(32);
-      }
-    }
-
-    tileParallelLoop(loop, tileSizes, false);
-  }
 };
 } // namespace