llvm
diff --git a/‎mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp
Lines changed: 3 additions & 0 deletions b/‎mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp
Lines changed: 3 additions & 0 deletions
diff --git a/‎mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp
Lines changed: 15 additions & 9 deletions b/‎mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp
Lines changed: 15 additions & 9 deletions
@@ -2990,6 +2990,9 @@ struct VectorizationPattern : public RewritePattern {
   /// Controls whether to vectorize `tensor.extract` when the input tensor is
   /// rank >= 2.
   bool vectorizeNDExtract = false;
+  /// Controls whether to "flatten" the channel dimension when vectorising 1D
+  /// depthwise convolutions. This should lead to better vectorization for
+  /// tensors with a low number of channel dimensions.
   bool flatten1DDepthwiseConv = false;
 };
 } // namespace
 
@@ -2881,8 +2881,7 @@ struct Conv1DGenerator
         lhsVals.push_back(rewriter.create<vector::ExtractStridedSliceOp>(
             loc, lhs,
             /*offsets=*/ArrayRef<int64_t>{0, w * strideW + kw * dilationW, 0},
-            inOutSliceSizes,
-            inOutStrides));
+            inOutSliceSizes, inOutStrides));
       }
     }
     // Extract rhs slice of size {c} @ [kw].
@@ -2894,35 +2893,39 @@ struct Conv1DGenerator
     for (int64_t w = 0; w < wSize; w += wSizeStep) {
       resVals.push_back(rewriter.create<vector::ExtractStridedSliceOp>(
           loc, res,
-          /*offsets=*/ArrayRef<int64_t>{0, w, 0},
-          inOutSliceSizes,
+          /*offsets=*/ArrayRef<int64_t>{0, w, 0}, inOutSliceSizes,
           inOutStrides));
     }
 
     auto linearIndex = [&](int64_t kw, int64_t w) {
       return kw * (wSize / wSizeStep) + w;
     };
 
-    auto inOutFlattenSliceSizes = SmallVector<int64_t>{nSize, wSizeStep * cSize};
+    auto inOutFlattenSliceSizes =
+        SmallVector<int64_t>{nSize, wSizeStep * cSize};
     auto lhsCastType = VectorType::get(inOutFlattenSliceSizes, lhsEltType);
-    auto resCastType = VectorType::get(inOutFlattenSliceSizes, lhsEltType);
+    auto resCastType = VectorType::get(inOutFlattenSliceSizes, resEltType);
     // Compute contraction: O{n, w, c} += I{n, sw * w + dw * kw, c} * F{c}
     for (int64_t kw = 0; kw < kwSize; ++kw) {
       for (int64_t w = 0; w < wSize; w += wSizeStep) {
         Value lhsVal = lhsVals[linearIndex(kw, w)];
         Value resVal = resVals[w];
         ShapedType filterBCastTy = cast<ShapedType>(resVal.getType());
         if (flatten) {
+          // Flatten the input and output vectors (collapse the channel
+          // dimension)
           lhsVal = rewriter.create<vector::ShapeCastOp>(
               loc, lhsCastType, lhsVals[linearIndex(kw, w)]);
           resVal = rewriter.create<vector::ShapeCastOp>(loc, resCastType,
                                                         resVals[w]);
         }
         resVals[w] = depthwiseConv1dSliceAsMulAcc(
             rewriter, loc, lhsVal, rhsVals[kw], resVal, filterBCastTy, flatten);
-        if (flatten)
+        if (flatten) {
+          // Un-flatten the output vector (restore the channel dimension)
           resVals[w] = rewriter.create<vector::ShapeCastOp>(
               loc, VectorType::get(inOutSliceSizes, resEltType), resVals[w]);
+        }
       }
     }
 
@@ -2970,8 +2973,11 @@ struct Conv1DGenerator
 
     rhs = rewriter.create<vector::BroadcastOp>(
         loc, bcastTy.clone(rhsTy.getElementType()), rhs);
-    if (flatten)
-      rhs = rewriter.create<vector::ShapeCastOp>(loc, resTy, rhs);
+    if (flatten) {
+      // Flatten the channel dimension
+      rhs = rewriter.create<vector::ShapeCastOp>(
+          loc, resTy.clone(rhsTy.getElementType()), rhs);
+    }
 
     rhs = promote(rewriter, loc, rhs, resTy);