
Commit 0f9a31e

Author: Ferdinand Lemaire (committed)
Add relu_nc and test for unfusing
1 parent 9f69638 commit 0f9a31e

4 files changed, +87 -5 lines changed

mlir/include/mlir/Dialect/Linalg/IR/LinalgNamedStructuredOps.yaml

Lines changed: 48 additions & 2 deletions
@@ -2478,7 +2478,7 @@ metadata: !LinalgOpMetadata
     The partial multiplication results are reduced into a 2D output.
 
     Numeric casting is performed on the operands to the inner multiply, promoting
-    them to the same data type as the accumulator/output.
+    them to the same data type as the accumulator/output."
   implements:
   - LinalgContractionOpInterface
 structured_op: !LinalgStructuredOpConfig
@@ -4097,7 +4097,7 @@ structured_op: !LinalgStructuredOpConfig
     kind: input_tensor
     type_var: T1
     shape_map: affine_map<()[s0, s1, s2, s3, s4, s5, s6, s7, s8, s9] -> (s0, s9, s1 *
-      s2 + s3 * s4, s5 * s6 + s7 * s8)>
+      s2 + s3 * s4, s5 * s6 + s7 * s8)>
   - !LinalgOperandDefConfig
     name: K
     kind: input_tensor
@@ -5837,3 +5837,49 @@ structured_op: !LinalgStructuredOpConfig
               scalar_arg: W
             - !ScalarExpression
               scalar_arg: B
+--- !LinalgOpConfig
+metadata: !LinalgOpMetadata
+  name: relu_nc
+  cpp_class_name: ReluNcOp
+  doc: |-
+    Applies the ReLU activation function to every value in the tensor.
+
+    Layout:
+      * Input: NC
+structured_op: !LinalgStructuredOpConfig
+  args:
+  - !LinalgOperandDefConfig
+    name: IFM
+    kind: input_tensor
+    type_var: T1
+    shape_map: affine_map<()[s0, s1] -> (s0, s1)>
+  - !LinalgOperandDefConfig
+    name: OFM
+    kind: output_tensor
+    type_var: T1
+    shape_map: affine_map<()[s0, s1] -> (s0, s1)>
+  indexing_maps: !LinalgIndexingMapsConfig
+    static_indexing_maps:
+    - affine_map<(d0, d1)[s0, s1] -> (d0, d1)>
+    - affine_map<(d0, d1)[s0, s1] -> (d0, d1)>
+  iterator_types:
+  - parallel
+  - parallel
+  assignments:
+  - !ScalarAssign
+    arg: OFM
+    value: !ScalarExpression
+      scalar_fn:
+        kind: binary
+        fn_name: max_signed
+        operands:
+        - !ScalarExpression
+          scalar_arg: IFM
+        - !ScalarExpression
+          scalar_fn:
+            kind: type
+            fn_name: cast_signed
+            type_var: T1
+            operands:
+            - !ScalarExpression
+              scalar_const: '0.000000e+00 : f64'
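
For orientation, here is a minimal usage sketch (not part of the commit) of the named op this YAML entry generates: linalg.relu_nc applied to a rank-2 NC tensor, mirroring the ins/outs syntax exercised by the test added below. The function name and the 4x16 shape are illustrative only.

func.func @relu_nc_example(%ifm: tensor<4x16xf32>) -> tensor<4x16xf32> {
  // Allocate an uninitialized output tensor of the same NC shape.
  %init = tensor.empty() : tensor<4x16xf32>
  // Elementwise max(x, 0) over the whole tensor, written to the output.
  %ofm = linalg.relu_nc ins(%ifm : tensor<4x16xf32>)
                        outs(%init : tensor<4x16xf32>) -> tensor<4x16xf32>
  return %ofm : tensor<4x16xf32>
}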

mlir/lib/Dialect/Linalg/Transforms/Unfuse.cpp

Lines changed: 3 additions & 2 deletions
@@ -712,7 +712,7 @@ struct LinearReluLowering : OpRewritePattern<LinearReluOp> {
 
    Value linearResult = unfuseLinear<LinearReluOp>(op, rewriter);
 
-    rewriter.replaceOpWithNewOp<Relu2DNchwOp>(
+    rewriter.replaceOpWithNewOp<ReluNcOp>(
        op,
        /*resultTensorTypes=*/linearResult.getType(),
        /*inputs=*/linearResult,
@@ -734,7 +734,8 @@ struct LinalgUnfusePass : public impl::LinalgUnfuseBase<LinalgUnfusePass> {
        Conv2DTensorAddLreluAveragePoolLowering,
        Conv2DActivationMaxpoolOpLowering<Conv2DLreluMaxpoolOp>,
        Conv2DActivationMaxpoolOpLowering<Conv2DReluMaxpoolOp>,
-        SoftmaxLowering, GlobalAveragePool2DLowering, LinearLowering>(
+        SoftmaxLowering, GlobalAveragePool2DLowering, LinearLowering,
+        LinearReluLowering>(
        &getContext());
 
    (void)applyPatternsAndFoldGreedily(getOperation().getBody(),

mlir/python/mlir/dialects/linalg/opdsl/ops/core_named_ops.py

Lines changed: 16 additions & 1 deletion
@@ -1396,4 +1396,19 @@ def linear_relu(
   domain(D.W, D.H, D.K)
   # implementation is incorrect the addition of the bias should happen after
   # the multiplication, not on each element
-  O[D.W, D.K] += I[D.W, D.H]*W[D.K, D.H] + B[D.K]
+  O[D.W, D.K] += I[D.W, D.H]*W[D.K, D.H] + B[D.K]
+
+
+@linalg_structured_op
+def relu_nc(
+    IFM=TensorDef(T1, Batch, S.C ),
+    OFM=TensorDef(T1, Batch, S.C, output=True )):
+  """Applies the ReLU activation function to every value in the tensor.
+
+  Layout:
+    * Input: NC
+  """
+  domain(D.b, D.c)
+  OFM[D.b, D.c] = BinaryFn.max_signed(
+      IFM[D.b, D.c], TypeFn.cast_signed(T1, const(0.0))
+  )
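
As a reading aid, here is roughly the linalg.generic that the relu_nc definition above corresponds to once generalized: an elementwise max against a zero constant cast to the element type. This is a hand-written sketch under those assumptions, not tool output; the float max op is spelled arith.maximumf here, while older MLIR releases call it arith.maxf.

#nc = affine_map<(d0, d1) -> (d0, d1)>

func.func @relu_nc_generalized(%ifm: tensor<4x16xf32>,
                               %init: tensor<4x16xf32>) -> tensor<4x16xf32> {
  // cast_signed(T1, 0.0) from the op definition becomes a plain f32 zero here.
  %zero = arith.constant 0.0 : f32
  %ofm = linalg.generic
      {indexing_maps = [#nc, #nc], iterator_types = ["parallel", "parallel"]}
      ins(%ifm : tensor<4x16xf32>) outs(%init : tensor<4x16xf32>) {
  ^bb0(%in: f32, %out: f32):
    // BinaryFn.max_signed on a float element type: max(x, 0).
    %m = arith.maximumf %in, %zero : f32
    linalg.yield %m : f32
  } -> tensor<4x16xf32>
  return %ofm : tensor<4x16xf32>
}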

mlir/test/Dialect/Linalg/unfuse.mlir

Lines changed: 20 additions & 0 deletions
@@ -448,6 +448,26 @@ func.func @unfuse_linear(%input: tensor<1x2048xf32>, %weights: tensor<1000x2048x
 // CHECK: %[[bias2dshape:.+]] = tensor.empty() : tensor<1x1000xf32>
 // CHECK: %[[bias2d:.+]] = linalg.broadcast_1d_to_2d ins(%arg2 : tensor<1000xf32>) outs(%2 : tensor<1x1000xf32>) -> tensor<1x1000xf32>
 // CHECK: %[[out:.+]] = linalg.matmul ins(%[[input]], %[[tweights]] : tensor<1x2048xf32>, tensor<2048x1000xf32>) outs(%[[bias2d]] : tensor<1x1000xf32>) -> tensor<1x1000xf32
+// CHECK: return %[[out]]
+
+  return %result : tensor<1x1000xf32>
+}
+
+// -----
+
+// CHECK: func.func @unfuse_linearRelu
+// CHECK-SAME: %[[input:.+]]: tensor<1x2048xf32>, %[[weights:.+]]: tensor<1000x2048xf32>, %[[bias:.+]]: tensor<1000xf32>
+func.func @unfuse_linearRelu(%input: tensor<1x2048xf32>, %weights: tensor<1000x2048xf32>, %bias: tensor<1000xf32>) -> tensor<1x1000xf32> {
+  %zero = arith.constant 0.0 : f32
+  %init = tensor.splat %zero : tensor<1x1000xf32>
+  %result = linalg.linear_relu ins(%input, %weights, %bias: tensor<1x2048xf32>, tensor<1000x2048xf32>, tensor<1000xf32>) outs(%init: tensor<1x1000xf32>) -> tensor<1x1000xf32>
+
+// CHECK: %[[tweightshape:.+]] = tensor.empty() : tensor<2048x1000xf32>
+// CHECK: %[[tweights:.+]] = linalg.transpose2d ins(%arg1 : tensor<1000x2048xf32>) outs(%0 : tensor<2048x1000xf32>) -> tensor<2048x1000xf32>
+// CHECK: %[[bias2dshape:.+]] = tensor.empty() : tensor<1x1000xf32>
+// CHECK: %[[bias2d:.+]] = linalg.broadcast_1d_to_2d ins(%arg2 : tensor<1000xf32>) outs(%2 : tensor<1x1000xf32>) -> tensor<1x1000xf32>
+// CHECK: %[[matmul:.+]] = linalg.matmul ins(%[[input]], %[[tweights]] : tensor<1x2048xf32>, tensor<2048x1000xf32>) outs(%[[bias2d]] : tensor<1x1000xf32>) -> tensor<1x1000xf32
+// CHECK: %[[out:.*]] = linalg.relu_nc ins(%[[matmul]] : tensor<1x1000xf32>) outs(%[[matmul]] : tensor<1x1000xf32>) -> tensor<1x1000xf32>
 // CHECK: return %[[out]]
 
   return %result : tensor<1x1000xf32>