
Commit b08accd

correct the way to broadcast a scalar value
1 parent 549029b commit b08accd
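
This commit fixes how the Winograd output transform materializes its scalar normalization factor. tensor.from_elements builds a tensor from exactly one scalar operand per result element, so a single scalar operand cannot produce the retRows x retCols tensor the old code asked for; the rewrite now broadcasts the scalar with a linalg.generic whose input indexing map has no results, so every output iteration reads the same scalar. A minimal sketch of the IR the new lowering emits, with the constant and shapes taken from the updated test below (the map aliases and %s11, standing in for the preceding matmul result, are illustrative names):

    #scalar   = affine_map<(d0, d1) -> ()>        // zero-result map: every iteration reads the one scalar input
    #identity = affine_map<(d0, d1) -> (d0, d1)>  // identity map: one write per output element
    %cst = arith.constant 1.024000e+03 : f32
    %empty = tensor.empty() : tensor<4x4xf32>
    %bcast = linalg.generic
        {indexing_maps = [#scalar, #identity],
         iterator_types = ["parallel", "parallel"]}
        ins(%cst : f32) outs(%empty : tensor<4x4xf32>) {
    ^bb0(%in: f32, %out: f32):
      linalg.yield %in : f32
    } -> tensor<4x4xf32>
    // Elementwise scale of the matmul result %s11 (hypothetical name).
    %scaled = linalg.mul ins(%bcast, %s11 : tensor<4x4xf32>, tensor<4x4xf32>)
                         outs(%empty : tensor<4x4xf32>) -> tensor<4x4xf32>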

2 files changed: 43 additions & 15 deletions

mlir/lib/Dialect/Linalg/Transforms/WinogradConv2D.cpp

Lines changed: 35 additions & 13 deletions
@@ -686,21 +686,43 @@ Value outputTransform(RewriterBase &rewriter, Location loc, Value value,
       matmulRetValue = matmulOp.getResult(0);
     }
 
-    // Multiply scalar factor.
-    Value scalarFactor = builder.create<arith::ConstantOp>(
-        loc, FloatAttr::get(elementType, leftScalarFactor * rightScalarFactor));
-    auto matmulType = RankedTensorType::get({retRows, retCols}, elementType);
-    auto init = builder.create<tensor::EmptyOp>(loc, matmulType.getShape(),
-                                                elementType);
-    Value broadcastedScalar =
-        builder.create<tensor::FromElementsOp>(loc, matmulType, scalarFactor);
-    auto scaledMatmul = builder.create<linalg::MulOp>(
-        loc, matmulType, ValueRange{broadcastedScalar, matmulRetValue},
-        ValueRange{init});
+    if (leftScalarFactor * rightScalarFactor != 1) {
+      // Multiply scalar factor.
+      Value scalarFactor = builder.create<arith::ConstantOp>(
+          loc,
+          FloatAttr::get(elementType, leftScalarFactor * rightScalarFactor));
+      auto matmulType = RankedTensorType::get({retRows, retCols}, elementType);
+      auto init = builder.create<tensor::EmptyOp>(loc, matmulType.getShape(),
+                                                  elementType);
+
+      auto identityAffineMap = rewriter.getMultiDimIdentityMap(2);
+      SmallVector<AffineMap> affineMaps = {
+          AffineMap::get(2, 0, init.getContext()), identityAffineMap};
+      auto broadcastedScalar =
+          rewriter
+              .create<linalg::GenericOp>(
+                  loc, matmulType, ValueRange{scalarFactor}, ValueRange{init},
+                  affineMaps,
+                  llvm::ArrayRef<utils::IteratorType>{
+                      utils::IteratorType::parallel,
+                      utils::IteratorType::parallel},
+                  [&](OpBuilder &nestedBuilder, Location nestedLoc,
+                      ValueRange args) {
+                    nestedBuilder.create<linalg::YieldOp>(nestedLoc, args[0]);
+                  })
+              .getResult(0);
+
+      matmulRetValue = builder
+                           .create<linalg::MulOp>(
+                               loc, matmulType,
+                               ValueRange{broadcastedScalar, matmulRetValue},
+                               ValueRange{init})
+                           .getResult(0);
+    }
 
     // Insert (H, W) to (N, H, W, F).
-    Value combinedVal = insert2DData(builder, loc, scaledMatmul.getResult(0),
-                                     args[0], NIter, FIter, retRows, retCols,
+    Value combinedVal = insert2DData(builder, loc, matmulRetValue, args[0],
+                                     NIter, FIter, retRows, retCols,
                                      /*outLoopIdx=*/0,
                                      /*inLoopIdx=*/3, /*heightIdx=*/1,
                                      /*widthIdx=*/2, /*destSize=*/4);
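
Two details of the replacement worth noting: AffineMap::get(2, 0, init.getContext()) constructs the zero-result map affine_map<(d0, d1) -> ()> that drives the broadcast, and the scaling is now guarded so it is skipped entirely when leftScalarFactor * rightScalarFactor == 1, with matmulRetValue threaded through to insert2DData in both cases. For contrast, the intended use of tensor.from_elements supplies one operand per result element (a minimal example; %a through %d are hypothetical f32 values):

    %t = tensor.from_elements %a, %b, %c, %d : tensor<2x2xf32>

which is why the deleted single-operand form was not a valid broadcast.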

mlir/test/Dialect/Linalg/winograd-conv2d-rewrite.mlir

Lines changed: 8 additions & 2 deletions
@@ -14,9 +14,11 @@ func.func @conv2d_4x4_3x3(%arg0: tensor<2x6x6x5xf32>, %arg1: tensor<2x3x3x5xf32>
   return %6 : tensor<2x4x4x2xf32>
 }
 
+// CHECK: #[[$MAP0:.+]] = affine_map<(d0, d1) -> ()>
+// CHECK: #[[$MAP1:.+]] = affine_map<(d0, d1) -> (d0, d1)>
 // CHECK-LABEL: func.func @conv2d_4x4_3x3
 // CHECK-SAME: (%[[ARG0:.*]]: tensor<2x6x6x5xf32>, %[[ARG1:.*]]: tensor<2x3x3x5xf32>, %[[ARG2:.*]]: tensor<2x4x4x2xf32>) -> tensor<2x4x4x2xf32> {
-// CHECK-DAG: %[[CST:.*]] = arith.constant dense<1.024000e+03> : tensor<4x4xf32>
+// CHECK-DAG: %[[CST:.*]] = arith.constant 1.024000e+03 : f32
 // CHECK-DAG: %[[CST_0:.*]] = arith.constant dense<{{\[}}[1.250000e-01, 0.000000e+00, 0.000000e+00, 0.000000e+00], [2.500000e-01, -2.500000e-01, 2.500000e-01, -2.500000e-01], [2.500000e-01, 2.500000e-01, 2.500000e-01, 2.500000e-01], [1.250000e-01, -2.500000e-01, 5.000000e-01, -1.000000e+00], [1.250000e-01, 2.500000e-01, 5.000000e-01, 1.000000e+00], [0.000000e+00, 0.000000e+00, 0.000000e+00, 5.000000e-01]]> : tensor<6x4xf32>
 // CHECK-DAG: %[[CST_1:.*]] = arith.constant dense<{{\[}}[1.250000e-01, 2.500000e-01, 2.500000e-01, 1.250000e-01, 1.250000e-01, 0.000000e+00], [0.000000e+00, -2.500000e-01, 2.500000e-01, -2.500000e-01, 2.500000e-01, 0.000000e+00], [0.000000e+00, 2.500000e-01, 2.500000e-01, 5.000000e-01, 5.000000e-01, 0.000000e+00], [0.000000e+00, -2.500000e-01, 2.500000e-01, -1.000000e+00, 1.000000e+00, 5.000000e-01]]> : tensor<4x6xf32>
 // CHECK-DAG: %[[CST_2:.*]] = arith.constant dense<{{\[}}[2.500000e-01, 0.000000e+00, 0.000000e+00, 0.000000e+00, 0.000000e+00, 0.000000e+00], [0.000000e+00, 2.500000e-01, -2.500000e-01, 2.500000e-01, -2.500000e-01, 2.500000e-01], [-3.125000e-01, -2.500000e-01, -2.500000e-01, -1.250000e-01, -1.250000e-01, 0.000000e+00], [0.000000e+00, -6.250000e-02, 6.250000e-02, -2.500000e-01, 2.500000e-01, -3.125000e-01], [6.250000e-02, 6.250000e-02, 6.250000e-02, 1.250000e-01, 1.250000e-01, 0.000000e+00], [0.000000e+00, 0.000000e+00, 0.000000e+00, 0.000000e+00, 0.000000e+00, 6.250000e-02]]> : tensor<6x6xf32>
@@ -73,7 +75,11 @@ func.func @conv2d_4x4_3x3(%arg0: tensor<2x6x6x5xf32>, %arg1: tensor<2x3x3x5xf32>
 // CHECK-NEXT: %[[S10:.*]] = tensor.empty() : tensor<4x4xf32>
 // CHECK-NEXT: %[[S11:.*]] = linalg.matmul ins(%[[S9]], %[[CST_0]] : tensor<4x6xf32>, tensor<6x4xf32>) outs(%[[S10]] : tensor<4x4xf32>) -> tensor<4x4xf32>
 // CHECK-NEXT: %[[S12:.*]] = tensor.empty() : tensor<4x4xf32>
-// CHECK-NEXT: %[[S13:.*]] = linalg.mul ins(%cst, %[[S11]] : tensor<4x4xf32>, tensor<4x4xf32>) outs(%[[S12]] : tensor<4x4xf32>) -> tensor<4x4xf32>
+// CHECK-NEXT: %[[BROADCAST:.*]] = linalg.generic {indexing_maps = [#[[$MAP0]], #[[$MAP1]]], iterator_types = ["parallel", "parallel"]} ins(%[[CST]] : f32) outs(%[[S12]] : tensor<4x4xf32>) {
+// CHECK-NEXT: ^bb0(%[[IN:.*]]: f32, %[[OUT:.*]]: f32):
+// CHECK-NEXT:   linalg.yield %[[IN]] : f32
+// CHECK-NEXT: } -> tensor<4x4xf32>
+// CHECK-NEXT: %[[S13:.*]] = linalg.mul ins(%[[BROADCAST]], %[[S11]] : tensor<4x4xf32>, tensor<4x4xf32>) outs(%[[S12]] : tensor<4x4xf32>) -> tensor<4x4xf32>
 // CHECK-NEXT: %[[S14:.*]] = tensor.empty() : tensor<1x4x4x1xf32>
 // CHECK-NEXT: %[[INSERTED_SLICE:.*]] = tensor.insert_slice %[[S13]] into %[[S14]][0, 0, 0, 0] [1, 4, 4, 1] [1, 1, 1, 1] : tensor<4x4xf32> into tensor<1x4x4x1xf32>
 // CHECK-NEXT: %[[INSERTED_SLICE_8:.*]] = tensor.insert_slice %[[INSERTED_SLICE]] into %[[ARG6]][%[[ARG3]], 0, 0, %[[ARG5]]] [1, 4, 4, 1] [1, 1, 1, 1] : tensor<1x4x4x1xf32> into tensor<2x4x4x2xf32>
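
Together these checks pin down the corrected lowering: the factor now stays a plain f32 arith.constant instead of a dense<1.024000e+03> tensor, the linalg.generic broadcasts it via the #[[$MAP0]]/#[[$MAP1]] affine-map aliases into a tensor<4x4xf32>, and only the broadcast result feeds the elementwise linalg.mul against the matmul output %[[S11]].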
