
Commit 0f8bab8

Author: Mahesh Ravishankar (committed)
[mlir] Revamp implementation of sub-byte load/store emulation.
When handling sub-byte emulation, the sizes of the converted `memref`s also need to be updated (this was not done by the existing implementation), which adds the complexity of having to linearize the `memref`s as well. Consider a `memref<3x3xi4>` where the `i4` elements are packed: it has an overall size of 5 bytes (36 bits rounded up to a whole number of bytes) and can only be represented by a `memref<5xi8>`. A `memref<3x2xi8>` would instead imply an implicit padding of 4 bits at the end of each row. So linearization is incorporated into the sub-byte load/store emulation.

This patch also updates some of the utility functions to make better use of statically available information using `OpFoldResult` and `makeComposedFoldedAffineApplyOps`.

Reviewed By: hanchung, yzhang93

Differential Revision: https://reviews.llvm.org/D158125
1 parent 6869786 commit 0f8bab8
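
To make the packing arithmetic from the commit message concrete: 3 x 3 i4 elements occupy 36 bits, which rounds up to 5 bytes. Below is a minimal standalone C++ sketch of that computation, as an illustration only; `packedByteSize` is a hypothetical helper and not part of this patch.

#include <cassert>
#include <cstdint>

// Bytes needed to store `numElements` packed elements of `elementBits` bits
// each, rounded up to a whole number of bytes. Hypothetical helper for
// illustration; not part of the patch.
int64_t packedByteSize(int64_t numElements, int64_t elementBits) {
  int64_t totalBits = numElements * elementBits;
  return (totalBits + 7) / 8; // ceilDiv(totalBits, 8)
}

int main() {
  // memref<3x3xi4>: 9 i4 values = 36 bits = 4.5 bytes, stored as memref<5xi8>.
  assert(packedByteSize(3 * 3, 4) == 5);
  // A memref<3x2xi8> layout would use 6 bytes and implicitly pad each row of
  // three i4 values by 4 bits, which is what the linearization avoids.
  return 0;
}

This is the same ceiling division the patch applies when it linearizes and rescales the converted memref sizes.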

File tree

10 files changed: +465, -377 lines


mlir/include/mlir/Dialect/MemRef/Utils/MemRefUtils.h

Lines changed: 31 additions & 30 deletions
@@ -28,36 +28,37 @@ namespace memref {
 /// contiguous chunk of memory.
 bool isStaticShapeAndContiguousRowMajor(MemRefType type);
 
-/// Returns the flattened 1-D memref and linearized offset for narrow type
-/// emulation.
-///
-/// The emulation only works on 1D memref types. To make this work on N-D
-/// memref, we need to linearize the offset.
-///
-/// For example, to emulate i4 to i8, the following op:
-///
-///   %0 = memref.load %arg0[%v0, %v1] :
-///                     memref<?x?xi4, strided<[?, ?], offset: ?>>
-///
-/// can be replaced with
-///
-///   %b, %offset, %sizes:2, %strides:2 = memref.extract_strided_metadata %0
-///
-///   %linearized_offset = %v0 * %stride#0 + %v1 * %stride#1
-///   %linearized_size = %size0 * %size1
-///   %scaled_linear_offset = %linearized_offset / 8 * 4
-///   %scaled_base_offset = %offset / 8 * 4
-///
-///   %linearized = memref.reinterpret_cast %b, offset = [%scaled_base_offset],
-///                     sizes = [%linearized_size], strides = [%stride#1]
-///
-///   %new_load = memref.load %linearized[%scaled_linear_offset] :
-///                     memref<?xi8, strided<[?], offset: ?>>
-std::pair<Value, Value>
-getLinearizeMemRefAndOffset(Location loc, MemRefType sourceType, int srcBits,
-                            int dstBits, SmallVector<Value> indices,
-                            memref::ExtractStridedMetadataOp stridedMetadata,
-                            OpBuilder &builder);
+/// For a `memref` with `offset`, `sizes` and `strides`, returns the
+/// offset and size to use for the linearized `memref`.
+/// - If the linearization is done for emulating load/stores of
+///   element type with bitwidth `srcBits` using element type with
+///   bitwidth `dstBits`, the linearized offset and size are
+///   scaled down by `dstBits`/`srcBits`.
+/// - If `indices` is provided, it represents the position in the
+///   original `memref` being accessed. The method then returns the
+///   index to use in the linearized `memref`. The linearized index
+///   is also scaled down by `dstBits`/`srcBits`. If `indices` is not
+///   provided, 0 is returned for the linearized index.
+struct LinearizedMemRefInfo {
+  OpFoldResult linearizedOffset;
+  OpFoldResult linearizedSize;
+};
+std::pair<LinearizedMemRefInfo, OpFoldResult> getLinearizedMemRefOffsetAndSize(
+    OpBuilder &builder, Location loc, int srcBits, int dstBits,
+    OpFoldResult offset, ArrayRef<OpFoldResult> sizes,
+    ArrayRef<OpFoldResult> strides, ArrayRef<OpFoldResult> indices = {});
+
+/// For a `memref` with `offset` and `sizes`, returns the
+/// offset and size to use for the linearized `memref`, assuming that
+/// the strides are computed from a row-major ordering of the sizes.
+/// - If the linearization is done for emulating load/stores of
+///   element type with bitwidth `srcBits` using element type with
+///   bitwidth `dstBits`, the linearized offset and size are
+///   scaled down by `dstBits`/`srcBits`.
+LinearizedMemRefInfo
+getLinearizedMemRefOffsetAndSize(OpBuilder &builder, Location loc, int srcBits,
+                                 int dstBits, OpFoldResult offset,
+                                 ArrayRef<OpFoldResult> sizes);
 
 } // namespace memref
 } // namespace mlir
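
For intuition, the arithmetic behind the new `getLinearizedMemRefOffsetAndSize` helper can be sketched in plain integers as below. This is an illustrative analogue of the documented behavior (row-major linearization, then scaling by `dstBits`/`srcBits`), not the MLIR implementation itself; the names `LinearizedInfo` and `linearize` are made up for the example, and the round-up of the size mirrors the type converter's `getLinearizedShape` added later in this commit.

#include <cassert>
#include <cstddef>
#include <cstdint>
#include <vector>

// Plain-integer analogue of LinearizedMemRefInfo, for illustration only.
struct LinearizedInfo {
  int64_t linearizedOffset; // offset into the emulated (wider) memref
  int64_t linearizedSize;   // number of wider elements needed
};

// Linearize (offset, sizes, strides, indices) and scale by dstBits/srcBits,
// mirroring the scaling described in the header comment. Illustrative sketch,
// assuming row-major strides and dstBits % srcBits == 0.
LinearizedInfo linearize(int64_t offset, const std::vector<int64_t> &sizes,
                         const std::vector<int64_t> &strides,
                         const std::vector<int64_t> &indices, int srcBits,
                         int dstBits, int64_t *linearizedIndex) {
  assert(dstBits % srcBits == 0);
  int64_t scale = dstBits / srcBits;

  int64_t index = 0, size = 1;
  for (std::size_t d = 0; d < sizes.size(); ++d) {
    index += indices[d] * strides[d];
    size *= sizes[d];
  }
  if (linearizedIndex)
    *linearizedIndex = index / scale; // scaled-down index in the i8 memref
  // Offset and size are scaled down as well; the size is rounded up so the
  // last partially filled wide element is still allocated.
  return {offset / scale, (size + scale - 1) / scale};
}

int main() {
  // memref<3x3xi4> accessed at [2, 1], emulated with i8 (scale = 2):
  int64_t idx = 0;
  LinearizedInfo info =
      linearize(/*offset=*/0, {3, 3}, {3, 1}, {2, 1}, 4, 8, &idx);
  assert(info.linearizedSize == 5); // 9 i4 values fit in 5 bytes
  assert(idx == 3);                 // linear element 7 lives in byte 3
  return 0;
}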

mlir/lib/Dialect/MemRef/Transforms/EmulateNarrowType.cpp

Lines changed: 121 additions & 64 deletions
@@ -35,18 +35,18 @@ using namespace mlir;
 /// `sourceBits` equals to 4 and `targetBits` equals to 8, the x-th element is
 /// located at (x % 2) * 4. Because there are two elements in one i8, and one
 /// element has 4 bits.
-static Value getOffsetForBitwidth(Location loc, Value srcIdx, int sourceBits,
-                                  int targetBits, OpBuilder &builder) {
+static Value getOffsetForBitwidth(Location loc, OpFoldResult srcIdx,
+                                  int sourceBits, int targetBits,
+                                  OpBuilder &builder) {
   assert(targetBits % sourceBits == 0);
-  IntegerType targetType = builder.getIntegerType(targetBits);
-  IntegerAttr idxAttr =
-      builder.getIntegerAttr(targetType, targetBits / sourceBits);
-  auto idx = builder.create<arith::ConstantOp>(loc, targetType, idxAttr);
-  IntegerAttr srcBitsAttr = builder.getIntegerAttr(targetType, sourceBits);
-  auto srcBitsValue =
-      builder.create<arith::ConstantOp>(loc, targetType, srcBitsAttr);
-  auto m = builder.create<arith::RemUIOp>(loc, srcIdx, idx);
-  return builder.create<arith::MulIOp>(loc, targetType, m, srcBitsValue);
+  AffineExpr s0;
+  bindSymbols(builder.getContext(), s0);
+  int scaleFactor = targetBits / sourceBits;
+  OpFoldResult offsetVal = affine::makeComposedFoldedAffineApply(
+      builder, loc, (s0 % scaleFactor) * sourceBits, {srcIdx});
+  Value bitOffset = getValueOrCreateConstantIndexOp(builder, loc, offsetVal);
+  IntegerType dstType = builder.getIntegerType(targetBits);
+  return builder.create<arith::IndexCastOp>(loc, dstType, bitOffset);
 }
 
 namespace {
@@ -61,15 +61,43 @@ struct ConvertMemRefAlloc final : OpConversionPattern<memref::AllocOp> {
   LogicalResult
   matchAndRewrite(memref::AllocOp op, OpAdaptor adaptor,
                   ConversionPatternRewriter &rewriter) const override {
-    Type newTy = getTypeConverter()->convertType(op.getType());
-    if (!newTy) {
+    auto currentType = op.getMemref().getType().cast<MemRefType>();
+    auto newResultType =
+        getTypeConverter()->convertType(op.getType()).dyn_cast<MemRefType>();
+    if (!newResultType) {
       return rewriter.notifyMatchFailure(
          op->getLoc(),
          llvm::formatv("failed to convert memref type: {0}", op.getType()));
     }
 
+    // Special case zero-rank memrefs.
+    if (currentType.getRank() == 0) {
+      rewriter.replaceOpWithNewOp<memref::AllocOp>(
+          op, newResultType, ValueRange{}, adaptor.getSymbolOperands(),
+          adaptor.getAlignmentAttr());
+      return success();
+    }
+
+    Location loc = op.getLoc();
+    OpFoldResult zero = rewriter.getIndexAttr(0);
+    SmallVector<OpFoldResult> indices(currentType.getRank(), zero);
+
+    // Get linearized type.
+    int srcBits = currentType.getElementType().getIntOrFloatBitWidth();
+    int dstBits = newResultType.getElementType().getIntOrFloatBitWidth();
+    SmallVector<OpFoldResult> sizes = op.getMixedSizes();
+
+    memref::LinearizedMemRefInfo linearizedMemRefInfo =
+        memref::getLinearizedMemRefOffsetAndSize(
+            rewriter, loc, srcBits, dstBits, /*offset =*/zero, sizes);
+    SmallVector<Value> dynamicLinearizedSize;
+    if (!newResultType.hasStaticShape()) {
+      dynamicLinearizedSize.push_back(getValueOrCreateConstantIndexOp(
+          rewriter, loc, linearizedMemRefInfo.linearizedSize));
+    }
+
     rewriter.replaceOpWithNewOp<memref::AllocOp>(
-        op, newTy, adaptor.getDynamicSizes(), adaptor.getSymbolOperands(),
+        op, newResultType, dynamicLinearizedSize, adaptor.getSymbolOperands(),
         adaptor.getAlignmentAttr());
     return success();
   }
@@ -109,73 +137,68 @@ struct ConvertMemRefLoad final : OpConversionPattern<memref::LoadOp> {
   LogicalResult
   matchAndRewrite(memref::LoadOp op, OpAdaptor adaptor,
                   ConversionPatternRewriter &rewriter) const override {
-    Type newTy = getTypeConverter()->convertType(op.getMemRefType());
-    if (!newTy) {
-      return rewriter.notifyMatchFailure(
-          op->getLoc(), llvm::formatv("failed to convert memref type: {0}",
-                                      op.getMemRefType()));
-    }
-
-    if (op.getMemRefType() == newTy)
-      return failure();
-
-    auto loc = op.getLoc();
-    auto sourceType = cast<MemRefType>(adaptor.getMemref().getType());
-    unsigned sourceRank = sourceType.getRank();
-    SmallVector<Value> indices = adaptor.getIndices();
-    assert(indices.size() == sourceRank);
-
-    auto srcElementType = sourceType.getElementType();
+    auto convertedType = adaptor.getMemref().getType().cast<MemRefType>();
+    auto convertedElementType = convertedType.getElementType();
     auto oldElementType = op.getMemRefType().getElementType();
     int srcBits = oldElementType.getIntOrFloatBitWidth();
-    int dstBits = srcElementType.getIntOrFloatBitWidth();
+    int dstBits = convertedElementType.getIntOrFloatBitWidth();
     if (dstBits % srcBits != 0) {
      return rewriter.notifyMatchFailure(
          op, "only dstBits % srcBits == 0 supported");
     }
 
-    auto stridedMetadata = rewriter.create<memref::ExtractStridedMetadataOp>(
-        loc, adaptor.getMemref());
-
-    Value newLoad, lastIdx;
-    if (sourceRank == 0) {
-      newLoad = rewriter.create<memref::LoadOp>(
-          loc, srcElementType, adaptor.getMemref(), adaptor.getIndices());
-
-      lastIdx = stridedMetadata.getOffset();
+    Location loc = op.getLoc();
+    // Special case 0-rank memref loads.
+    Value bitsLoad;
+    if (convertedType.getRank() == 0) {
+      bitsLoad = rewriter.create<memref::LoadOp>(loc, adaptor.getMemref(),
+                                                 ValueRange{});
    } else {
-      auto [reinterpret, linearizedOffset] =
-          memref::getLinearizeMemRefAndOffset(loc, sourceType, srcBits, dstBits,
-                                              adaptor.getIndices(),
-                                              stridedMetadata, rewriter);
-
-      newLoad = rewriter.create<memref::LoadOp>(loc, srcElementType,
-                                                reinterpret, linearizedOffset);
-
-      lastIdx = adaptor.getIndices().back();
+      SmallVector<OpFoldResult> indices =
+          getAsOpFoldResult(adaptor.getIndices());
+
+      auto stridedMetadata = rewriter.create<memref::ExtractStridedMetadataOp>(
+          loc, op.getMemRef());
+
+      // Linearize the indices of the original load instruction. Do not account
+      // for the scaling yet. This will be accounted for later.
+      OpFoldResult linearizedIndices;
+      std::tie(std::ignore, linearizedIndices) =
+          memref::getLinearizedMemRefOffsetAndSize(
+              rewriter, loc, srcBits, srcBits,
+              stridedMetadata.getConstifiedMixedOffset(),
+              stridedMetadata.getConstifiedMixedSizes(),
+              stridedMetadata.getConstifiedMixedStrides(), indices);
+
+      AffineExpr s0;
+      bindSymbols(rewriter.getContext(), s0);
+      int64_t scaler = dstBits / srcBits;
+      OpFoldResult scaledLinearizedIndices =
+          affine::makeComposedFoldedAffineApply(
+              rewriter, loc, s0.floorDiv(scaler), {linearizedIndices});
+      Value newLoad = rewriter.create<memref::LoadOp>(
+          loc, adaptor.getMemref(),
+          getValueOrCreateConstantIndexOp(rewriter, loc,
+                                          scaledLinearizedIndices));
+
+      // Get the offset and shift the bits to the rightmost.
+      // Note, currently only the big-endian is supported.
+      Value bitwidthOffset = getOffsetForBitwidth(loc, linearizedIndices,
+                                                  srcBits, dstBits, rewriter);
+      bitsLoad = rewriter.create<arith::ShRSIOp>(loc, newLoad, bitwidthOffset);
    }
 
-    // Get the offset and shift the bits to the rightmost.
-    // Note, currently only the big-endian is supported.
-    auto castLastIdx =
-        rewriter.create<arith::IndexCastUIOp>(loc, srcElementType, lastIdx);
-
-    Value BitwidthOffset =
-        getOffsetForBitwidth(loc, castLastIdx, srcBits, dstBits, rewriter);
-    auto bitsLoad =
-        rewriter.create<arith::ShRSIOp>(loc, newLoad, BitwidthOffset);
-
    // Get the corresponding bits. If the arith computation bitwidth equals
    // to the emulated bitwidth, we apply a mask to extract the low bits.
    // It is not clear if this case actually happens in practice, but we keep
    // the operations just in case. Otherwise, if the arith computation bitwidth
    // is different from the emulated bitwidth we truncate the result.
    Operation *result;
    auto resultTy = getTypeConverter()->convertType(oldElementType);
-    if (resultTy == srcElementType) {
+    if (resultTy == convertedElementType) {
      auto mask = rewriter.create<arith::ConstantOp>(
-          loc, srcElementType,
-          rewriter.getIntegerAttr(srcElementType, (1 << srcBits) - 1));
+          loc, convertedElementType,
+          rewriter.getIntegerAttr(convertedElementType, (1 << srcBits) - 1));
 
      result = rewriter.create<arith::AndIOp>(loc, bitsLoad, mask);
    } else {
@@ -200,6 +223,25 @@ void memref::populateMemRefNarrowTypeEmulationPatterns(
   patterns
       .add<ConvertMemRefAlloc, ConvertMemRefLoad, ConvertMemRefAssumeAlignment>(
           typeConverter, patterns.getContext());
+  memref::populateResolveExtractStridedMetadataPatterns(patterns);
+}
+
+static SmallVector<int64_t> getLinearizedShape(MemRefType ty, int srcBits,
+                                               int dstBits) {
+  if (ty.getRank() == 0)
+    return {};
+
+  int64_t linearizedShape = 1;
+  for (auto shape : ty.getShape()) {
+    if (shape == ShapedType::kDynamic)
+      return {ShapedType::kDynamic};
+    linearizedShape *= shape;
+  }
+  int scale = dstBits / srcBits;
+  // Scale the size to the ceilDiv(linearizedShape, scale)
+  // to accommodate all the values.
+  linearizedShape = (linearizedShape + scale - 1) / scale;
+  return {linearizedShape};
 }
 
 void memref::populateMemRefNarrowTypeEmulationConversions(
@@ -215,11 +257,26 @@ void memref::populateMemRefNarrowTypeEmulationConversions(
     if (width >= loadStoreWidth)
       return ty;
 
+    // Currently only handle the case where the innermost stride is 1; check
+    // that here.
+    SmallVector<int64_t> strides;
+    int64_t offset;
+    if (failed(getStridesAndOffset(ty, strides, offset)))
+      return std::nullopt;
+    if (!strides.empty() && strides.back() != 1)
+      return std::nullopt;
+
     auto newElemTy = IntegerType::get(ty.getContext(), loadStoreWidth,
                                       intTy.getSignedness());
     if (!newElemTy)
       return std::nullopt;
 
-    return ty.cloneWith(std::nullopt, newElemTy);
+    StridedLayoutAttr layoutAttr;
+    if (offset != 0) {
+      layoutAttr = StridedLayoutAttr::get(ty.getContext(), offset,
+                                          ArrayRef<int64_t>{1});
+    }
+
+    return MemRefType::get(getLinearizedShape(ty, width, loadStoreWidth),
+                           newElemTy, layoutAttr, ty.getMemorySpace());
   });
 }
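
The rewritten load above reduces to three steps: load the containing wide element at `linearizedIndex floordiv scale`, shift right by `(linearizedIndex mod scale) * srcBits` (the quantity `getOffsetForBitwidth` computes), and mask off the low `srcBits` bits. Below is a standalone C++ sketch of that arithmetic on a packed byte buffer, as an illustration only; `loadSubByte` is hypothetical and ignores the signed-shift and truncation details the pattern also handles.

#include <cassert>
#include <cstdint>
#include <vector>

// Extract the `srcBits`-wide element at linear index `idx` from a buffer of
// packed `dstBits`-wide words, using the same arithmetic as the rewritten
// load: word = buf[idx / scale], bitOffset = (idx % scale) * srcBits,
// result = (word >> bitOffset) & mask. Illustrative sketch only.
uint8_t loadSubByte(const std::vector<uint8_t> &buf, int64_t idx, int srcBits,
                    int dstBits) {
  assert(dstBits % srcBits == 0);
  int64_t scale = dstBits / srcBits;
  uint8_t word = buf[idx / scale];         // scaled linearized index
  int bitOffset = (idx % scale) * srcBits; // as in getOffsetForBitwidth
  uint8_t mask = (1u << srcBits) - 1;      // extract the low srcBits bits
  return (word >> bitOffset) & mask;
}

int main() {
  // Two i8 bytes holding four packed i4 values: 0x21 -> {1, 2}, 0x43 -> {3, 4}
  // under the (idx % 2) * 4 bit-offset convention used above.
  std::vector<uint8_t> buf = {0x21, 0x43};
  assert(loadSubByte(buf, 0, 4, 8) == 1);
  assert(loadSubByte(buf, 1, 4, 8) == 2);
  assert(loadSubByte(buf, 2, 4, 8) == 3);
  assert(loadSubByte(buf, 3, 4, 8) == 4);
  return 0;
}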

mlir/lib/Dialect/MemRef/Transforms/ExpandStridedMetadata.cpp

Lines changed: 11 additions & 7 deletions
@@ -687,13 +687,17 @@ struct ExtractStridedMetadataOpAllocFolder
 
     auto baseBufferType = cast<MemRefType>(op.getBaseBuffer().getType());
     int64_t offset = 0;
-    if (allocLikeOp.getType() == baseBufferType)
-      results.push_back(allocLikeOp);
-    else
-      results.push_back(rewriter.create<memref::ReinterpretCastOp>(
-          loc, baseBufferType, allocLikeOp, offset,
-          /*sizes=*/ArrayRef<int64_t>(),
-          /*strides=*/ArrayRef<int64_t>()));
+    if (op.getBaseBuffer().use_empty()) {
+      results.push_back(nullptr);
+    } else {
+      if (allocLikeOp.getType() == baseBufferType)
+        results.push_back(allocLikeOp);
+      else
+        results.push_back(rewriter.create<memref::ReinterpretCastOp>(
+            loc, baseBufferType, allocLikeOp, offset,
+            /*sizes=*/ArrayRef<int64_t>(),
+            /*strides=*/ArrayRef<int64_t>()));
+    }
 
     // Offset.
     results.push_back(rewriter.create<arith::ConstantIndexOp>(loc, offset));

0 commit comments