Skip to content

Commit 7bdcc9f

Browse files
committed
[mlir][vectorize] Support affine.apply in SuperVectorize
There is no need to vectorize affine.apply inside a loop being vectorized; instead, it is regenerated in its original scalar form, with its operands replaced by their scalar counterparts produced in the new vector loop (e.g., the new induction variables).
1 parent 375bd22 commit 7bdcc9f

File tree

2 files changed

+96
-4
lines changed

2 files changed

+96
-4
lines changed

mlir/lib/Dialect/Affine/Transforms/SuperVectorize.cpp

Lines changed: 31 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -721,8 +721,7 @@ struct VectorizationState {
721721
/// Example:
722722
/// * 'replaced': induction variable of a loop to be vectorized.
723723
/// * 'replacement': new induction variable in the new vector loop.
724-
void registerValueScalarReplacement(BlockArgument replaced,
725-
BlockArgument replacement);
724+
void registerValueScalarReplacement(Value replaced, Value replacement);
726725

727726
/// Registers the scalar replacement of a scalar result returned from a
728727
/// reduction loop. 'replacement' must be scalar.
@@ -854,8 +853,8 @@ void VectorizationState::registerValueVectorReplacementImpl(Value replaced,
854853
/// Example:
855854
/// * 'replaced': induction variable of a loop to be vectorized.
856855
/// * 'replacement': new induction variable in the new vector loop.
857-
void VectorizationState::registerValueScalarReplacement(
858-
BlockArgument replaced, BlockArgument replacement) {
856+
void VectorizationState::registerValueScalarReplacement(Value replaced,
857+
Value replacement) {
859858
registerValueScalarReplacementImpl(replaced, replacement);
860859
}
861860

@@ -978,6 +977,32 @@ static arith::ConstantOp vectorizeConstant(arith::ConstantOp constOp,
978977
return newConstOp;
979978
}
980979

980+
/// We have no need to vectorize affine.apply. However, we still need to
981+
/// generate it and replace the operands with values in valueScalarReplacement.
982+
static Operation *vectorizeAffineApplyOp(AffineApplyOp applyOp,
983+
VectorizationState &state) {
984+
SmallVector<Value, 8> updatedOperands;
985+
for (Value operand : applyOp.getOperands()) {
986+
Value updatedOperand = operand;
987+
if (state.valueScalarReplacement.contains(operand)) {
988+
updatedOperand = state.valueScalarReplacement.lookupOrDefault(operand);
989+
} else if (state.valueVectorReplacement.contains(operand)) {
990+
LLVM_DEBUG(
991+
dbgs() << "\n[early-vect]+++++ affine.apply on vector operand\n");
992+
return nullptr;
993+
}
994+
updatedOperands.push_back(updatedOperand);
995+
}
996+
997+
auto newApplyOp = state.builder.create<AffineApplyOp>(
998+
applyOp.getLoc(), applyOp.getAffineMap(), updatedOperands);
999+
1000+
// Register the new affine.apply result.
1001+
state.registerValueScalarReplacement(applyOp.getResult(),
1002+
newApplyOp.getResult());
1003+
return newApplyOp;
1004+
}
1005+
9811006
/// Creates a constant vector filled with the neutral elements of the given
9821007
/// reduction. The scalar type of vector elements will be taken from
9831008
/// `oldOperand`.
@@ -1493,6 +1518,8 @@ static Operation *vectorizeOneOperation(Operation *op,
14931518
return vectorizeAffineYieldOp(yieldOp, state);
14941519
if (auto constant = dyn_cast<arith::ConstantOp>(op))
14951520
return vectorizeConstant(constant, state);
1521+
if (auto applyOp = dyn_cast<AffineApplyOp>(op))
1522+
return vectorizeAffineApplyOp(applyOp, state);
14961523

14971524
// Other ops with regions are not supported.
14981525
if (op->getNumRegions() != 0)
Lines changed: 65 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,65 @@
1+
// RUN: mlir-opt %s -affine-super-vectorize="virtual-vector-size=8 test-fastest-varying=0" -split-input-file | FileCheck %s
2+
3+
// CHECK-DAG: #[[$MAP_ID0:map[0-9a-zA-Z_]*]] = affine_map<(d0) -> (d0 mod 12)>
4+
// CHECK-DAG: #[[$MAP_ID1:map[0-9a-zA-Z_]*]] = affine_map<(d0) -> (d0 mod 16)>
5+
6+
// Positive case: both affine.apply ops depend only on loop induction
// variables, so the innermost loop is vectorized (note the `step 8` and the
// vector.transfer_read/write below) and the applies are regenerated in scalar
// form, taking the new induction variables as operands.
// CHECK-LABEL: vec_affine_apply
// CHECK-SAME: (%[[ARG0:.*]]: memref<8x12x16xf32>, %[[ARG1:.*]]: memref<8x24x48xf32>) {
func.func @vec_affine_apply(%arg0: memref<8x12x16xf32>, %arg1: memref<8x24x48xf32>) {
// CHECK: affine.for %[[ARG2:.*]] = 0 to 8 {
// CHECK-NEXT: affine.for %[[ARG3:.*]] = 0 to 24 {
// CHECK-NEXT: affine.for %[[ARG4:.*]] = 0 to 48 step 8 {
// CHECK-NEXT: %[[S0:.*]] = affine.apply #[[$MAP_ID0]](%[[ARG3]])
// CHECK-NEXT: %[[S1:.*]] = affine.apply #[[$MAP_ID1]](%[[ARG4]])
// CHECK-NEXT: %[[CST:.*]] = arith.constant 0.000000e+00 : f32
// CHECK-NEXT: %[[S2:.*]] = vector.transfer_read %[[ARG0]][%[[ARG2]], %[[S0]], %[[S1]]], %[[CST]] : memref<8x12x16xf32>, vector<8xf32>
// CHECK-NEXT: vector.transfer_write %[[S2]], %[[ARG1]][%[[ARG2]], %[[ARG3]], %[[ARG4]]] : vector<8xf32>, memref<8x24x48xf32>
// CHECK-NEXT: }
// CHECK-NEXT: }
// CHECK-NEXT: }
// CHECK-NEXT: return
  affine.for %arg2 = 0 to 8 {
    affine.for %arg3 = 0 to 24 {
      affine.for %arg4 = 0 to 48 {
        %0 = affine.apply affine_map<(d0) -> (d0 mod 12)>(%arg3)
        %1 = affine.apply affine_map<(d0) -> (d0 mod 16)>(%arg4)
        %2 = affine.load %arg0[%arg2, %0, %1] : memref<8x12x16xf32>
        affine.store %2, %arg1[%arg2, %arg3, %arg4] : memref<8x24x48xf32>
      }
    }
  }
  return
}
33+
34+
// Negative case: the last affine.apply (%4) takes %3, which is derived (via
// arith.index_cast) from a loaded value that would become a vector during
// vectorization. An affine.apply cannot take a vector operand, so the whole
// nest is left scalar — the CHECK lines below show no `step 8` and a scalar
// affine.load/affine.store.
// CHECK-LABEL: no_vec_affine_apply
// CHECK-SAME: (%[[ARG0:.*]]: memref<8x12x16xi32>, %[[ARG1:.*]]: memref<8x24x48xi32>) {
func.func @no_vec_affine_apply(%arg0: memref<8x12x16xi32>, %arg1: memref<8x24x48xi32>) {
// CHECK: affine.for %[[ARG2:.*]] = 0 to 8 {
// CHECK-NEXT: affine.for %[[ARG3:.*]] = 0 to 24 {
// CHECK-NEXT: affine.for %[[ARG4:.*]] = 0 to 48 {
// CHECK-NEXT: %[[S0:.*]] = affine.apply #[[$MAP_ID0]](%[[ARG3]])
// CHECK-NEXT: %[[S1:.*]] = affine.apply #[[$MAP_ID1]](%[[ARG4]])
// CHECK-NEXT: %[[S2:.*]] = affine.load %[[ARG0]][%[[ARG2]], %[[S0]], %[[S1]]] : memref<8x12x16xi32>
// CHECK-NEXT: %[[S3:.*]] = arith.index_cast %[[S2]] : i32 to index
// CHECK-NEXT: %[[S4:.*]] = affine.apply #[[$MAP_ID1]](%[[S3]])
// CHECK-NEXT: %[[S5:.*]] = arith.index_cast %[[S4]] : index to i32
// CHECK-NEXT: affine.store %[[S5]], %[[ARG1]][%[[ARG2]], %[[ARG3]], %[[ARG4]]] : memref<8x24x48xi32>
// CHECK-NEXT: }
// CHECK-NEXT: }
// CHECK-NEXT: }
// CHECK-NEXT: return
  affine.for %arg2 = 0 to 8 {
    affine.for %arg3 = 0 to 24 {
      affine.for %arg4 = 0 to 48 {
        %0 = affine.apply affine_map<(d0) -> (d0 mod 12)>(%arg3)
        %1 = affine.apply affine_map<(d0) -> (d0 mod 16)>(%arg4)
        %2 = affine.load %arg0[%arg2, %0, %1] : memref<8x12x16xi32>
        %3 = arith.index_cast %2 : i32 to index
        %4 = affine.apply affine_map<(d0) -> (d0 mod 16)>(%3)
        %5 = arith.index_cast %4 : index to i32
        affine.store %5, %arg1[%arg2, %arg3, %arg4] : memref<8x24x48xi32>
      }
    }
  }
  return
}

0 commit comments

Comments
 (0)