Combine row and column masks and replace scf.if with a single masked load

c-rhodes · c-rhodes · commit ec829d79c7db · 2023-11-07T15:46:08.000Z
diff --git a/mlir/lib/Conversion/ArmSMEToSCF/ArmSMEToSCF.cpp b/mlir/lib/Conversion/ArmSMEToSCF/ArmSMEToSCF.cpp
@@ -255,6 +255,7 @@ struct TileLoadOpWithMaskAndPadZeroConversion
 ///  %pad_1d = arith.constant dense<1> : vector<[4]xi32>
 ///  %num_rows = arith.constant 2 : index
 ///  %num_cols = arith.constant 4 : index
+///  %num_cols_i32 = arith.index_castui %num_cols : index to i32
 ///  %tile_id = arm_sme.get_tile_id : i32
 ///  %tile = arm_sme.cast_tile_to_vector %tile_id : i32 to vector<[4]x[4]xi32>
 ///  %vscale = vector.vscale
@@ -264,14 +265,13 @@ struct TileLoadOpWithMaskAndPadZeroConversion
 ///  %svl_s = arith.muli %min_svl_s, %vscale : index
 ///  scf.for %tile_slice_idx = %c0 to %svl_s step %c1 {
 ///    %row_is_active = arith.cmpi ult %tile_slice_idx, %num_rows : index
-///    %slice = scf.if %row_is_active -> vector<[4]xi32> {
-///      %slice = vector.maskedload %base[%tile_slice_idx, %c0], %num_cols, %pad
-///        : memref<?x?xi32>, vector<[4]xi1>,
-///          vector<[4]xi32> into vector<[4]xi32>
-///      scf.yield %slice : vector<[4]xi32>
-///    } else {
-///      scf.yield %pad_1d : vector<[4]xi32>
-///    }
+///    %row_is_active_i32 = arith.extsi %row_is_active : i1 to i32
+///    %mask = arith.andi %row_is_active_i32, %num_cols_i32 : i32
+///    %mask_index = arith.index_cast %mask : i32 to index
+///    %mask_1d = vector.create_mask %mask_index : vector<[4]xi1>
+///    %slice = vector.maskedload %base[%tile_slice_idx, %c0], %mask_1d, %pad_1d
+///      : memref<?x?xi32>, vector<[4]xi1>,
+///        vector<[4]xi32> into vector<[4]xi32>
 ///    // Insert slice into tile
 ///    arm_sme.move_vector_to_tile_slice %slice, %tile, %tile_slice_idx
 ///      : vector<[4]xi32> into vector<[4]x[4]xi32>
@@ -312,11 +312,8 @@ struct TileLoadOpWithMaskAndPadNonZeroConversion
     auto numRows = createMaskOp.getOperands()[0];
     auto numCols = createMaskOp.getOperands()[1];
 
-    VectorType tileSliceType = VectorType::Builder(tileType).dropDim(0);
-    auto predicateType =
-        VectorType::get(tileType.getDimSize(1), rewriter.getI1Type(), true);
-    auto numColsOp =
-        rewriter.create<vector::CreateMaskOp>(loc, predicateType, numCols);
+    auto numColsI32 = rewriter.create<arith::IndexCastUIOp>(
+        loc, rewriter.getI32Type(), numCols);
 
     // Create 'arm_sme.get_tile' op.
     auto tileId = rewriter.create<arm_sme::GetTileID>(
@@ -343,41 +340,35 @@ struct TileLoadOpWithMaskAndPadNonZeroConversion
 
     auto tileSliceIndex = forOp.getInductionVar();
 
+    // Combine masks.
     auto rowIsActive = rewriter.create<arith::CmpIOp>(
         loc, arith::CmpIPredicate::ult, tileSliceIndex, numRows);
+    auto rowIsActiveI32 = rewriter.create<arith::ExtSIOp>(
+        loc, rewriter.getI32Type(), rowIsActive);
+    auto mask = rewriter.create<arith::AndIOp>(loc, rowIsActiveI32, numColsI32);
+    auto maskIndex =
+        rewriter.create<arith::IndexCastOp>(loc, rewriter.getIndexType(), mask);
+    auto predicateType =
+        VectorType::get(tileType.getDimSize(1), rewriter.getI1Type(), true);
+    auto maskOp1D = rewriter.create<vector::CreateMaskOp>(
+        loc, predicateType, maskIndex.getResult());
 
     SmallVector<Value> memrefIndices;
     getMemrefIndices(tileLoadOp.getIndices(),
                      tileLoadOp.getMemRefType().getRank(), tileSliceIndex,
                      numTileSlices, memrefIndices, loc, rewriter);
 
     // Splat pad into 1-D vector matching type of tile slice.
+    VectorType tileSliceType = VectorType::Builder(tileType).dropDim(0);
     auto pad1DOp = rewriter.create<vector::SplatOp>(loc, tileSliceType, padOp);
 
-    Operation *slice = rewriter.create<scf::IfOp>(
-        loc, rowIsActive,
-        [&](OpBuilder &b, Location loc) {
-          // If the row is active, emit a masked load where the predicate is
-          // 'numCols'. Pad is used for inactive elements, taken from
-          // passthru.
-          auto loadSlice = rewriter.create<vector::MaskedLoadOp>(
-              loc, tileSliceType, tileLoadOp.getBase(), memrefIndices,
-              numColsOp, /*passthru=*/pad1DOp);
-          rewriter.create<scf::YieldOp>(loc, loadSlice->getResult(0));
-        },
-        [&](OpBuilder &b, Location loc) {
-          // Inactive rows are filled with pad.
-          rewriter.create<scf::YieldOp>(loc, pad1DOp.getResult());
-        });
-
-    // TODO: If the load is vertical the transpose can't be done in-flight with
-    // a regular (SVE) maskedload. Propagate layout to
-    // 'arm_sme.move_vector_to_tile_slice' below once it supports layout. This
-    // is currently broken.
+    auto loadSlice = rewriter.create<vector::MaskedLoadOp>(
+        loc, tileSliceType, tileLoadOp.getBase(), memrefIndices, maskOp1D,
+        /*passthru=*/pad1DOp);
 
     // Create 'arm_sme.move_vector_to_tile_slice' to move slice into tile.
     rewriter.create<arm_sme::MoveVectorToTileSliceOp>(
-        loc, tileType, slice->getResult(0), tile, tileSliceIndex,
+        loc, tileType, loadSlice->getResult(0), tile, tileSliceIndex,
         tileLoadOp.getLayout());
 
     rewriter.setInsertionPointAfter(forOp);
diff --git a/mlir/test/Conversion/ArmSMEToSCF/arm-sme-to-scf.mlir b/mlir/test/Conversion/ArmSMEToSCF/arm-sme-to-scf.mlir
@@ -66,20 +66,20 @@ func.func @arm_sme_tile_load_hor_with_mask_and_pad_zero(%src : memref<?x?xi32>)
 // CHECK-DAG:     %[[C1:.*]] = arith.constant 1 : index
 // CHECK-DAG:     %[[C4:.*]] = arith.constant 4 : index
 // CHECK-DAG:     %[[NUM_ROWS:.*]] = arith.constant 3 : index
-// CHECK-DAG:     %[[NUM_COLS:.*]] = vector.create_mask %c2 : vector<[4]xi1>
+// CHECK-DAG:     %[[NUM_COLS:.*]] = arith.constant 2 : index
+// CHECK-DAG:     %[[NUM_COLS_I32:.*]] = arith.index_castui %[[NUM_COLS]] : index to i32
 // CHECK-DAG:     %[[VSCALE:.*]] = vector.vscale
 // CHECK-NEXT:    %[[NUM_TILE_SLICES:.*]] = arith.muli %[[C4]], %[[VSCALE]] : index
 // CHECK-NEXT:    scf.for %[[TILE_SLICE_INDEX:.*]] = %[[C0]] to %[[NUM_TILE_SLICES]] step %[[C1]] {
 // CHECK-NEXT:        %[[ROW_IS_ACTIVE:.*]] = arith.cmpi ult, %[[TILE_SLICE_INDEX]], %[[NUM_ROWS]] : index
+// CHECK-NEXT:        %[[ROW_IS_ACTIVE_SEXT_I32:.*]] = arith.extsi %[[ROW_IS_ACTIVE]] : i1 to i32
+// CHECK-NEXT:        %[[MASK:.*]] = arith.andi %[[ROW_IS_ACTIVE_SEXT_I32]], %[[NUM_COLS_I32]] : i32
+// CHECK-NEXT:        %[[MASK_INDEX:.*]] = arith.index_cast %[[MASK]] : i32 to index
+// CHECK-NEXT:        %[[MASK_1D:.*]] = vector.create_mask %[[MASK_INDEX]] : vector<[4]xi1>
 // CHECK-NEXT:        %[[OFFSET:.*]] = arith.addi %[[C0]], %[[TILE_SLICE_INDEX]] : index
 // CHECK:             %[[PAD_1D:.*]] = vector.splat %[[PAD]] : vector<[4]xi32>
-// CHECK:             %[[SLICE:.*]] = scf.if %[[ROW_IS_ACTIVE]] -> (vector<[4]xi32>) {
-// CHECK:               %[[LOAD_SLICE:.*]] = vector.maskedload %[[SRC]]{{\[}}%[[OFFSET]], %[[C0]]], %[[NUM_COLS]], %[[PAD_1D]] : memref<?x?xi32>, vector<[4]xi1>, vector<[4]xi32> into vector<[4]xi32>
-// CHECK:               scf.yield %[[LOAD_SLICE]] : vector<[4]xi32>
-// CHECK:             } else {
-// CHECK:               scf.yield %[[PAD_1D]] : vector<[4]xi32>
-// CHECK:             }
-// CHECK:             arm_sme.move_vector_to_tile_slice %[[SLICE]], %[[CAST_TILE_TO_VECTOR]], %[[TILE_SLICE_INDEX]] : vector<[4]xi32> into vector<[4]x[4]xi32>
+// CHECK:             %[[LOAD_SLICE:.*]] = vector.maskedload %[[SRC]]{{\[}}%[[OFFSET]], %[[C0]]], %[[MASK_1D]], %[[PAD_1D]] : memref<?x?xi32>, vector<[4]xi1>, vector<[4]xi32> into vector<[4]xi32>
+// CHECK:             arm_sme.move_vector_to_tile_slice %[[LOAD_SLICE]], %[[CAST_TILE_TO_VECTOR]], %[[TILE_SLICE_INDEX]] : vector<[4]xi32> into vector<[4]x[4]xi32>
 func.func @arm_sme_tile_load_hor_with_mask_and_nonzero_pad(%src : memref<?x?xi32>, %pad : i32) {
   %c0 = arith.constant 0 : index
   %c2 = arith.constant 2 : index