// -----

#map = affine_map<(d0) -> (d0 * 32)>
#map1 = affine_map<(d0) -> (d0 * 16)>
module {
  /// CHECK-LABEL: @fuse_mlp_vnni
  func.func @fuse_mlp_vnni(%arg0: tensor<128x1024xbf16>, %arg1: tensor<1024x512xbf16>, %arg2: tensor<512xbf16>) -> tensor<128x512xbf16> attributes {llvm.emit_c_interface} {
    %c2 = arith.constant 2 : index
    %c64 = arith.constant 64 : index
    %c0 = arith.constant 0 : index
    %cst = arith.constant dense<0.000000e+00> : tensor<128x512xbf16>
    /// CHECK: tensor.empty
    %0 = tensor.empty() : tensor<128x512xbf16>
    /// CHECK: tensor.empty
    %1 = tensor.empty() : tensor<16x64x16x32xbf16>
    // Pack the weight into blocked layout (16x32 tiles, outer dims permuted).
    %pack = tensor.pack %arg1 outer_dims_perm = [1, 0] inner_dims_pos = [0, 1] inner_tiles = [16, 32] into %1 : tensor<1024x512xbf16> -> tensor<16x64x16x32xbf16>
    /// CHECK: tensor.empty
    %2 = tensor.empty() : tensor<16x64x8x32x2xbf16>
    // Second pack introduces the VNNI inner dimension (tile of 2 on dim 2).
    %pack_0 = tensor.pack %pack inner_dims_pos = [2] inner_tiles = [2] into %2 : tensor<16x64x16x32xbf16> -> tensor<16x64x8x32x2xbf16>
    /// CHECK: %[[FINAL_RESULT:.*]]:3 = scf.forall (%{{.*}}) in (16)
    %3 = scf.forall (%arg3) in (16) shared_outs(%arg4 = %0) -> (tensor<128x512xbf16>) {
      %9 = affine.apply #map(%arg3)
      %extracted_slice = tensor.extract_slice %arg4[0, %9] [128, 32] [1, 1] : tensor<128x512xbf16> to tensor<128x32xbf16>
      /// CHECK: tensor.empty
      %10 = tensor.empty() : tensor<128x32xf32>
      /// CHECK: linalg.copy
      %11 = linalg.copy ins(%extracted_slice : tensor<128x32xbf16>) outs(%10 : tensor<128x32xf32>) -> tensor<128x32xf32>
      /// CHECK: %[[TMP_RESULT:.*]]:2 = scf.for
      %12:2 = scf.for %arg5 = %c0 to %c64 step %c2 iter_args(%arg6 = %11, %arg7 = %extracted_slice) -> (tensor<128x32xf32>, tensor<128x32xbf16>) {
        %14 = affine.apply #map1(%arg5)
        %extracted_slice_1 = tensor.extract_slice %arg0[0, %14] [128, 32] [1, 1] : tensor<128x1024xbf16> to tensor<128x32xbf16>
        /// CHECK: %[[PACK_OUT:.*]] = tensor.pack
        /// CHECK: %[[PACK_OUT_VNNI:.*]] = tensor.pack %[[PACK_OUT]]
        %extracted_slice_2 = tensor.extract_slice %pack_0[%arg3, %arg5, 0, 0, 0] [1, 2, 8, 32, 2] [1, 1, 1, 1, 1] : tensor<16x64x8x32x2xbf16> to tensor<1x2x8x32x2xbf16>
        /// CHECK: %[[COLLAPSE_OUT:.*]] = tensor.collapse_shape %[[PACK_OUT_VNNI]]
        %collapsed = tensor.collapse_shape %extracted_slice_2 [[0, 1], [2], [3], [4]] : tensor<1x2x8x32x2xbf16> into tensor<2x8x32x2xbf16>
        /// CHECK: %[[EXPAND_OUT:.*]] = tensor.expand_shape
        %expanded = tensor.expand_shape %extracted_slice_1 [[0], [1, 2]] output_shape [128, 2, 16] : tensor<128x32xbf16> into tensor<128x2x16xbf16>
        %15 = tensor.empty() : tensor<2x128x16xbf16>
        /// CHECK: %[[TRANSPOSE_OUT:.*]] = linalg.transpose ins(%[[EXPAND_OUT]] :
        %transposed = linalg.transpose ins(%expanded : tensor<128x2x16xbf16>) outs(%15 : tensor<2x128x16xbf16>) permutation = [1, 0, 2]
        /// CHECK: %[[MATMUL_OUT:.*]] = linalgx.batch_reduce_matmul_vnni ins(%[[TRANSPOSE_OUT]], %[[COLLAPSE_OUT]] :
        %16 = linalgx.batch_reduce_matmul_vnni ins(%transposed, %collapsed : tensor<2x128x16xbf16>, tensor<2x8x32x2xbf16>) outs(%arg6 : tensor<128x32xf32>) -> tensor<128x32xf32>
        // On the last K iteration, down-convert the f32 accumulator to bf16.
        %17 = arith.addi %arg5, %c2 : index
        %18 = arith.cmpi sge, %17, %c64 : index
        /// CHECK: %[[IF_RESULT:.*]] = scf.if
        %19 = scf.if %18 -> (tensor<128x32xbf16>) {
          %20 = linalg.copy ins(%16 : tensor<128x32xf32>) outs(%arg7 : tensor<128x32xbf16>) -> tensor<128x32xbf16>
          scf.yield %20 : tensor<128x32xbf16>
        } else {
          scf.yield %arg7 : tensor<128x32xbf16>
        }
        /// CHECK: scf.yield %[[MATMUL_OUT]], %[[IF_RESULT]] :
        scf.yield %16, %19 : tensor<128x32xf32>, tensor<128x32xbf16>
      }
      /// CHECK: %[[BROADCAST_OUT:.*]] = linalg.broadcast
      /// CHECK: %[[ADD_OUT:.*]] = linalg.add ins(%[[TMP_RESULT]]#1, %[[BROADCAST_OUT]] :
      /// CHECK: %[[MAX_OUT:.*]] = linalg.max ins(%[[ADD_OUT]],
      %13 = affine.apply #map(%arg3)
      scf.forall.in_parallel {
        /// CHECK: tensor.parallel_insert_slice %[[MAX_OUT]]
        /// CHECK: tensor.parallel_insert_slice
        /// CHECK: tensor.parallel_insert_slice
        tensor.parallel_insert_slice %12#1 into %arg4[0, %13] [128, 32] [1, 1] : tensor<128x32xbf16> into tensor<128x512xbf16>
      }
    }
    // Epilogue: bias broadcast + add + ReLU (max with zero); the pass under
    // test is expected to fuse these into the forall (see CHECK lines above).
    %4 = tensor.empty() : tensor<128x512xbf16>
    %broadcasted = linalg.broadcast ins(%arg2 : tensor<512xbf16>) outs(%4 : tensor<128x512xbf16>) dimensions = [0]
    %5 = tensor.empty() : tensor<128x512xbf16>
    %6 = linalg.add ins(%3, %broadcasted : tensor<128x512xbf16>, tensor<128x512xbf16>) outs(%5 : tensor<128x512xbf16>) -> tensor<128x512xbf16>
    %7 = tensor.empty() : tensor<128x512xbf16>
    %8 = linalg.max ins(%6, %cst : tensor<128x512xbf16>, tensor<128x512xbf16>) outs(%7 : tensor<128x512xbf16>) -> tensor<128x512xbf16>
    /// CHECK: return %[[FINAL_RESULT]]#2
    return %8 : tensor<128x512xbf16>
  }
}

// -----

98
175
#map = affine_map <(d0 ) -> (d0 * 128 )>
99
176
module {
100
177
/// CHECK-LABEL: @fuse_multiple_consumer
0 commit comments