llvm
diff --git a/mlir/lib/Dialect/Vector/Transforms/LowerVectorShapeCast.cpp b/mlir/lib/Dialect/Vector/Transforms/LowerVectorShapeCast.cpp
Lines changed: 173 additions & 8 deletions
@@ -54,6 +54,10 @@ class ShapeCastOp2DDownCastRewritePattern
                                 PatternRewriter &rewriter) const override {
     auto sourceVectorType = op.getSourceVectorType();
     auto resultVectorType = op.getResultVectorType();
+
+    if (sourceVectorType.isScalable() || resultVectorType.isScalable())
+      return failure();
+
     if (sourceVectorType.getRank() != 2 || resultVectorType.getRank() != 1)
       return failure();
 
@@ -87,6 +91,10 @@ class ShapeCastOp2DUpCastRewritePattern
                                 PatternRewriter &rewriter) const override {
     auto sourceVectorType = op.getSourceVectorType();
     auto resultVectorType = op.getResultVectorType();
+
+    if (sourceVectorType.isScalable() || resultVectorType.isScalable())
+      return failure();
+
     if (sourceVectorType.getRank() != 1 || resultVectorType.getRank() != 2)
       return failure();
 
@@ -106,6 +114,20 @@ class ShapeCastOp2DUpCastRewritePattern
   }
 };
 
+/// Advances the multi-dimensional index `idx` in place, odometer-style.
+///
+/// `idx[dimIdx]` is increased by `initialStep`. Whenever the updated entry
+/// reaches or exceeds `tp.getDimSize(d)` it is reset to 0 and the next
+/// lower-numbered dimension is increased by 1; this carry repeats down to
+/// dimension 0, which is also reset to 0 if it overflows. Dimensions above
+/// `dimIdx` are never touched.
+static void incIdx(llvm::MutableArrayRef<int64_t> idx, VectorType tp,
+                   int dimIdx, int initialStep = 1) {
+  int step = initialStep;
+  for (int d = dimIdx; d >= 0; d--) {
+    idx[d] += step;
+    if (idx[d] >= tp.getDimSize(d)) {
+      // Overflow: wrap this dimension and carry a single unit into the next
+      // outer dimension (only the starting dimension uses `initialStep`).
+      idx[d] = 0;
+      step = 1;
+    } else {
+      // No overflow, so there is nothing left to carry.
+      break;
+    }
+  }
+}
+
 // We typically should not lower general shape cast operations into data
 // movement instructions, since the assumption is that these casts are
 // optimized away during progressive lowering. For completeness, however,
@@ -121,6 +143,9 @@ class ShapeCastOpRewritePattern : public OpRewritePattern<vector::ShapeCastOp> {
     auto sourceVectorType = op.getSourceVectorType();
     auto resultVectorType = op.getResultVectorType();
 
+    if (sourceVectorType.isScalable() || resultVectorType.isScalable())
+      return failure();
+
     // Special case 2D / 1D lowerings with better implementations.
     // TODO: make is ND / 1D to allow generic ND -> 1D -> MD.
     int64_t srcRank = sourceVectorType.getRank();
@@ -175,21 +200,161 @@ class ShapeCastOpRewritePattern : public OpRewritePattern<vector::ShapeCastOp> {
     rewriter.replaceOp(op, result);
     return success();
   }
+};
+
+/// A shape_cast lowering for scalable vectors with a single trailing scalable
+/// dimension. This is similar to the general shape_cast lowering but makes use
+/// of vector.scalable.insert and vector.scalable.extract to move elements a
+/// subvector at a time.
+///
+/// E.g.:
+/// ```
+/// // Flatten scalable vector
+/// %0 = vector.shape_cast %arg0 : vector<2x1x[4]xi32> to vector<[8]xi32>
+/// ```
+/// is rewritten to:
+/// ```
+/// // Flatten scalable vector
+/// %c = arith.constant dense<0> : vector<[8]xi32>
+/// %0 = vector.extract %arg0[0, 0] : vector<2x1x[4]xi32>
+/// %1 = vector.scalable.insert %0, %c[0] : vector<[4]xi32> into vector<[8]xi32>
+/// %2 = vector.extract %arg0[1, 0] : vector<2x1x[4]xi32>
+/// %3 = vector.scalable.insert %2, %1[4] : vector<[4]xi32> into vector<[8]xi32>
+/// ```
+/// or:
+/// ```
+/// // Un-flatten scalable vector
+/// %0 = vector.shape_cast %arg0 : vector<[8]xi32> to vector<2x1x[4]xi32>
+/// ```
+/// is rewritten to:
+/// ```
+/// // Un-flatten scalable vector
+/// %c = arith.constant dense<0> : vector<2x1x[4]xi32>
+/// %0 = vector.scalable.extract %arg0[0] : vector<[4]xi32> from vector<[8]xi32>
+/// %1 = vector.insert %0, %c [0, 0] : vector<[4]xi32> into vector<2x1x[4]xi32>
+/// %2 = vector.scalable.extract %arg0[4] : vector<[4]xi32> from vector<[8]xi32>
+/// %3 = vector.insert %2, %1 [1, 0] : vector<[4]xi32> into vector<2x1x[4]xi32>
+/// ```
+class ScalableShapeCastOpRewritePattern
+    : public OpRewritePattern<vector::ShapeCastOp> {
+public:
+  using OpRewritePattern::OpRewritePattern;
+
+  /// Rewrites `op` into a chain of extracts/inserts that move one scalable
+  /// subvector at a time from the source into a zero-initialized result.
+  ///
+  /// Returns failure (without creating any IR) unless both the source and
+  /// result types have exactly one scalable dimension in trailing position,
+  /// and the larger of the two trailing sizes is a multiple of the smaller.
+  LogicalResult matchAndRewrite(vector::ShapeCastOp op,
+                                PatternRewriter &rewriter) const override {
+
+    Location loc = op.getLoc();
+    auto sourceVectorType = op.getSourceVectorType();
+    auto resultVectorType = op.getResultVectorType();
+    auto srcRank = sourceVectorType.getRank();
+    auto resRank = resultVectorType.getRank();
+
+    // This can only lower shape_casts where both the source and result types
+    // have a single trailing scalable dimension. This is because there is no
+    // legal representation of other scalable types in LLVM (and likely won't be
+    // soon). There are also (currently) no operations that can index or extract
+    // from >= 2D scalable vectors or scalable vectors of fixed vectors.
+    if (!isTrailingDimScalable(sourceVectorType) ||
+        !isTrailingDimScalable(resultVectorType)) {
+      return failure();
+    }
+
+    // The sizes of the trailing dimension of the source and result vectors, the
+    // size of subvector to move, and the number of elements in the vectors.
+    // These are "min" sizes as they are the size when vscale == 1.
+    auto minSourceTrailingSize = sourceVectorType.getShape().back();
+    auto minResultTrailingSize = resultVectorType.getShape().back();
+    auto minExtractionSize =
+        std::min(minSourceTrailingSize, minResultTrailingSize);
+    int64_t minNumElts = 1;
+    for (auto size : sourceVectorType.getShape())
+      minNumElts *= size;
+
+    // Elements are moved in chunks of `minExtractionSize`, so the larger
+    // trailing dimension must consist of a whole number of chunks. Otherwise
+    // (e.g. vector<3x[4]xi32> to vector<2x[6]xi32>) a chunk would straddle the
+    // end of a row and the generated vector.scalable.insert/extract would not
+    // fit at its position. Bail out before any IR is created.
+    auto minLargerTrailingSize =
+        std::max(minSourceTrailingSize, minResultTrailingSize);
+    if (minLargerTrailingSize % minExtractionSize != 0)
+      return failure();
+
+    // The subvector type to move from the source to the result. Note that this
+    // is a scalable vector. This rewrite will generate code in terms of the
+    // "min" size (vscale == 1 case), that scales to any vscale.
+    auto extractionVectorType = VectorType::get(
+        {minExtractionSize}, sourceVectorType.getElementType(), {true});
+
+    Value result = rewriter.create<arith::ConstantOp>(
+        loc, resultVectorType, rewriter.getZeroAttr(resultVectorType));
+
+    SmallVector<int64_t> srcIdx(srcRank);
+    SmallVector<int64_t> resIdx(resRank);
+
+    // TODO: Try rewriting this with StaticTileOffsetRange (from IndexingUtils)
+    // once D150000 lands.
+    Value currentResultScalableVector;
+    Value currentSourceScalableVector;
+    for (int64_t i = 0; i < minNumElts; i += minExtractionSize) {
+      // 1. Extract a scalable subvector from the source vector.
+      if (!currentSourceScalableVector) {
+        if (srcRank != 1) {
+          currentSourceScalableVector = rewriter.create<vector::ExtractOp>(
+              loc, op.getSource(), llvm::ArrayRef(srcIdx).drop_back());
+        } else {
+          currentSourceScalableVector = op.getSource();
+        }
+      }
+      Value sourceSubVector = currentSourceScalableVector;
+      if (minExtractionSize < minSourceTrailingSize) {
+        sourceSubVector = rewriter.create<vector::ScalableExtractOp>(
+            loc, extractionVectorType, sourceSubVector, srcIdx.back());
+      }
 
-private:
-  static void incIdx(SmallVector<int64_t> &idx, VectorType tp, int64_t r) {
-    assert(0 <= r && r < tp.getRank());
-    if (++idx[r] == tp.getDimSize(r)) {
-      idx[r] = 0;
-      incIdx(idx, tp, r - 1);
+      // 2. Insert the scalable subvector into the result vector.
+      if (!currentResultScalableVector) {
+        if (minExtractionSize == minResultTrailingSize) {
+          currentResultScalableVector = sourceSubVector;
+        } else if (resRank != 1) {
+          currentResultScalableVector = rewriter.create<vector::ExtractOp>(
+              loc, result, llvm::ArrayRef(resIdx).drop_back());
+        } else {
+          currentResultScalableVector = result;
+        }
+      }
+      if (minExtractionSize < minResultTrailingSize) {
+        currentResultScalableVector = rewriter.create<vector::ScalableInsertOp>(
+            loc, sourceSubVector, currentResultScalableVector, resIdx.back());
+      }
+
+      // 3. Update the source and result scalable vectors if needed.
+      if (resIdx.back() + minExtractionSize >= minResultTrailingSize &&
+          currentResultScalableVector != result) {
+        // Finished row of result. Insert complete scalable vector into result
+        // (n-D) vector.
+        result = rewriter.create<vector::InsertOp>(
+            loc, currentResultScalableVector, result,
+            llvm::ArrayRef(resIdx).drop_back());
+        currentResultScalableVector = {};
+      }
+      if (srcIdx.back() + minExtractionSize >= minSourceTrailingSize) {
+        // Finished row of source.
+        currentSourceScalableVector = {};
+      }
+
+      // 4. Increment the insert/extract indices, stepping by minExtractionSize
+      // for the trailing dimensions.
+      incIdx(srcIdx, sourceVectorType, srcRank - 1, minExtractionSize);
+      incIdx(resIdx, resultVectorType, resRank - 1, minExtractionSize);
     }
+
+    rewriter.replaceOp(op, result);
+    return success();
+  }
+
+  /// Returns true if `type` has rank >= 1, its last dimension is scalable, and
+  /// none of its other dimensions are scalable.
+  static bool isTrailingDimScalable(VectorType type) {
+    return type.getRank() >= 1 && type.getScalableDims().back() &&
+           !llvm::is_contained(type.getScalableDims().drop_back(), true);
   }
 };
+
 } // namespace
 
+/// Adds the vector.shape_cast lowering patterns (2-D down-cast, 2-D up-cast,
+/// generic, and scalable-vector) to `patterns`, each constructed with the
+/// context of `patterns` and the given `benefit`.
 void mlir::vector::populateVectorShapeCastLoweringPatterns(
     RewritePatternSet &patterns, PatternBenefit benefit) {
   patterns.add<ShapeCastOp2DDownCastRewritePattern,
-               ShapeCastOp2DUpCastRewritePattern, ShapeCastOpRewritePattern>(
-      patterns.getContext(), benefit);
+               ShapeCastOp2DUpCastRewritePattern, ShapeCastOpRewritePattern,
+               ScalableShapeCastOpRewritePattern>(patterns.getContext(),
+                                                  benefit);
 }