|
| 1 | +Upstream PR: https://github.com/openai/triton/pull/3213 |
| 2 | + |
| 3 | +diff --git a/lib/Conversion/TritonGPUToLLVM/Utility.cpp b/lib/Conversion/TritonGPUToLLVM/Utility.cpp |
| 4 | +--- a/lib/Conversion/TritonGPUToLLVM/Utility.cpp |
| 5 | ++++ b/lib/Conversion/TritonGPUToLLVM/Utility.cpp |
| 6 | +@@ -157,9 +157,10 @@ getSharedMemoryObjectFromStruct(Location |
| 7 | + /*offsets=*/{elems.begin() + 1 + rank, elems.end()}}; |
| 8 | + } |
| 9 | + |
| 10 | +-SmallVector<Value> |
| 11 | +-getStridesFromShapeAndOrder(ArrayRef<int64_t> shape, ArrayRef<unsigned> order, |
| 12 | +- Location loc, ConversionPatternRewriter &rewriter) { |
| 13 | ++SmallVector<Value> getStridesFromShapeAndOrder(ArrayRef<int64_t> shape, |
| 14 | ++ ArrayRef<unsigned> order, |
| 15 | ++ Location loc, |
| 16 | ++ RewriterBase &rewriter) { |
| 17 | + auto rank = shape.size(); |
| 18 | + SmallVector<Value> strides(rank); |
| 19 | + int64_t stride = 1; |
| 20 | +@@ -172,9 +173,8 @@ getStridesFromShapeAndOrder(ArrayRef<int |
| 21 | + |
| 22 | + // Convert an \param index to a multi-dim coordinate given \param shape and |
| 23 | + // \param order. |
| 24 | +-SmallVector<Value> delinearize(ConversionPatternRewriter &rewriter, |
| 25 | +- Location loc, Value linear, |
| 26 | +- ArrayRef<unsigned> shape, |
| 27 | ++SmallVector<Value> delinearize(RewriterBase &rewriter, Location loc, |
| 28 | ++ Value linear, ArrayRef<unsigned> shape, |
| 29 | + ArrayRef<unsigned> order) { |
| 30 | + unsigned rank = shape.size(); |
| 31 | + assert(rank == order.size()); |
| 32 | +@@ -194,9 +194,8 @@ SmallVector<Value> delinearize(Conversio |
| 33 | + return multiDim; |
| 34 | + } |
| 35 | + |
| 36 | +-SmallVector<Value> delinearize(ConversionPatternRewriter &rewriter, |
| 37 | +- Location loc, unsigned linear, |
| 38 | +- ArrayRef<unsigned> shape) { |
| 39 | ++SmallVector<Value> delinearize(RewriterBase &rewriter, Location loc, |
| 40 | ++ unsigned linear, ArrayRef<unsigned> shape) { |
| 41 | + unsigned rank = shape.size(); |
| 42 | + assert(rank > 0); |
| 43 | + SmallVector<Value> multiDim(rank); |
| 44 | +@@ -209,9 +208,8 @@ SmallVector<Value> delinearize(Conversio |
| 45 | + return multiDim; |
| 46 | + } |
| 47 | + |
| 48 | +-SmallVector<Value> delinearize(ConversionPatternRewriter &rewriter, |
| 49 | +- Location loc, Value linear, |
| 50 | +- ArrayRef<unsigned> shape) { |
| 51 | ++SmallVector<Value> delinearize(RewriterBase &rewriter, Location loc, |
| 52 | ++ Value linear, ArrayRef<unsigned> shape) { |
| 53 | + unsigned rank = shape.size(); |
| 54 | + assert(rank > 0); |
| 55 | + SmallVector<Value> multiDim(rank); |
| 56 | +diff --git a/lib/Conversion/TritonGPUToLLVM/Utility.h b/lib/Conversion/TritonGPUToLLVM/Utility.h |
| 57 | +--- a/lib/Conversion/TritonGPUToLLVM/Utility.h |
| 58 | ++++ b/lib/Conversion/TritonGPUToLLVM/Utility.h |
| 59 | +@@ -232,9 +232,10 @@ void createStoreDSmem(Location loc, Patt |
| 60 | + Value ctaId, ArrayRef<Value> values); |
| 61 | + |
| 62 | + /// Helper function to get strides from a given shape and its order |
| 63 | +-SmallVector<Value> |
| 64 | +-getStridesFromShapeAndOrder(ArrayRef<int64_t> shape, ArrayRef<unsigned> order, |
| 65 | +- Location loc, ConversionPatternRewriter &rewriter); |
| 66 | ++SmallVector<Value> getStridesFromShapeAndOrder(ArrayRef<int64_t> shape, |
| 67 | ++ ArrayRef<unsigned> order, |
| 68 | ++ Location loc, |
| 69 | ++ RewriterBase &rewriter); |
| 70 | + struct SharedMemoryObject { |
| 71 | + Value base; // i32 ptr. The start address of the shared memory object after |
| 72 | + // the initial allocation or the last slicing operation. |
| 73 | +@@ -264,7 +265,7 @@ struct SharedMemoryObject { |
| 74 | + |
| 75 | + SharedMemoryObject(Value base, Type baseElemType, ArrayRef<int64_t> shape, |
| 76 | + ArrayRef<unsigned> order, Location loc, |
| 77 | +- ConversionPatternRewriter &rewriter) |
| 78 | ++ RewriterBase &rewriter) |
| 79 | + : base(base), baseElemType(baseElemType) { |
| 80 | + strides = getStridesFromShapeAndOrder(shape, order, loc, rewriter); |
| 81 | + offsets.append(order.size(), i32_val(0)); |
| 82 | +@@ -311,18 +312,15 @@ getSharedMemoryObjectFromStruct(Location |
| 83 | + |
| 84 | + // Convert an \param index to a multi-dim coordinate given \param shape and |
| 85 | + // \param order. |
| 86 | +-SmallVector<Value> delinearize(ConversionPatternRewriter &rewriter, |
| 87 | +- Location loc, Value linear, |
| 88 | +- ArrayRef<unsigned> shape, |
| 89 | ++SmallVector<Value> delinearize(RewriterBase &rewriter, Location loc, |
| 90 | ++ Value linear, ArrayRef<unsigned> shape, |
| 91 | + ArrayRef<unsigned> order); |
| 92 | + |
| 93 | +-SmallVector<Value> delinearize(ConversionPatternRewriter &rewriter, |
| 94 | +- Location loc, unsigned linear, |
| 95 | +- ArrayRef<unsigned> shape); |
| 96 | ++SmallVector<Value> delinearize(RewriterBase &rewriter, Location loc, |
| 97 | ++ unsigned linear, ArrayRef<unsigned> shape); |
| 98 | + |
| 99 | +-SmallVector<Value> delinearize(ConversionPatternRewriter &rewriter, |
| 100 | +- Location loc, Value linear, |
| 101 | +- ArrayRef<unsigned> shape); |
| 102 | ++SmallVector<Value> delinearize(RewriterBase &rewriter, Location loc, |
| 103 | ++ Value linear, ArrayRef<unsigned> shape); |
| 104 | + |
| 105 | + Value linearize(ConversionPatternRewriter &rewriter, Location loc, |
| 106 | + ArrayRef<Value> multiDim, ArrayRef<unsigned> shape, |
| 107 | +@@ -380,22 +378,20 @@ static Value getSharedMemoryBase(Locatio |
| 108 | + |
| 109 | + /* ------------------------------------ */ |
| 110 | + // Returns CTA level thread idx |
| 111 | +-static Value getThreadIdInCTA(ConversionPatternRewriter &rewriter, |
| 112 | +- Location loc) { |
| 113 | ++static Value getThreadIdInCTA(RewriterBase &rewriter, Location loc) { |
| 114 | + Value tid = |
| 115 | + rewriter.create<::mlir::gpu::ThreadIdOp>(loc, ::mlir::gpu::Dimension::x); |
| 116 | + return rewriter.create<arith::IndexCastOp>(loc, i32_ty, tid); |
| 117 | + } |
| 118 | + |
| 119 | + // Returns CTA level thread idx. |
| 120 | +-static Value getThreadId(ConversionPatternRewriter &rewriter, Location loc) { |
| 121 | ++static Value getThreadId(RewriterBase &rewriter, Location loc) { |
| 122 | + Value tid = getThreadIdInCTA(rewriter, loc); |
| 123 | + auto mod = rewriter.getBlock()->getParent()->getParentOfType<ModuleOp>(); |
| 124 | + return tid; |
| 125 | + } |
| 126 | + |
| 127 | +-static Value getClusterCTAId(ConversionPatternRewriter &rewriter, |
| 128 | +- Location loc) { |
| 129 | ++static Value getClusterCTAId(RewriterBase &rewriter, Location loc) { |
| 130 | + return rewriter.create<triton::nvgpu::ClusterCTAIdOp>(loc, |
| 131 | + rewriter.getI32Type()); |
| 132 | + } |
| 133 | +@@ -413,8 +409,8 @@ using ::mlir::triton::gpu::DotOperandEnc |
| 134 | + using ::mlir::triton::gpu::NvidiaMmaEncodingAttr; |
| 135 | + using ::mlir::triton::gpu::SliceEncodingAttr; |
| 136 | + |
| 137 | +-static Value dot(ConversionPatternRewriter &rewriter, Location loc, |
| 138 | +- ArrayRef<Value> offsets, ArrayRef<Value> strides) { |
| 139 | ++static Value dot(RewriterBase &rewriter, Location loc, ArrayRef<Value> offsets, |
| 140 | ++ ArrayRef<Value> strides) { |
| 141 | + assert(offsets.size() == strides.size()); |
| 142 | + Value ret = i32_val(0); |
| 143 | + for (auto [offset, stride] : llvm::zip(offsets, strides)) { |
| 144 | +@@ -428,9 +424,10 @@ static Value dot(ConversionPatternRewrit |
| 145 | + // ----------------------------------------------------------------------- |
| 146 | + |
| 147 | + // Get an index-base for each dimension for a \param blockedLayout. |
| 148 | +-static SmallVector<Value> emitBaseIndexWithinCTAForBlockedLayout( |
| 149 | +- Location loc, ConversionPatternRewriter &rewriter, |
| 150 | +- const BlockedEncodingAttr &blockedLayout, RankedTensorType type) { |
| 151 | ++static SmallVector<Value> |
| 152 | ++emitBaseIndexWithinCTAForBlockedLayout(Location loc, RewriterBase &rewriter, |
| 153 | ++ const BlockedEncodingAttr &blockedLayout, |
| 154 | ++ RankedTensorType type) { |
| 155 | + auto shape = type.getShape(); |
| 156 | + Value threadId = getThreadId(rewriter, loc); |
| 157 | + Value warpSize = i32_val(32); |
| 158 | +@@ -511,9 +508,10 @@ emitOffsetForBlockedLayout(const Blocked |
| 159 | + // Mma layout indices |
| 160 | + // ----------------------------------------------------------------------- |
| 161 | + |
| 162 | +-static SmallVector<Value> emitBaseIndexWithinCTAForMmaLayoutV1( |
| 163 | +- Location loc, ConversionPatternRewriter &rewriter, |
| 164 | +- const NvidiaMmaEncodingAttr &mmaLayout, RankedTensorType type) { |
| 165 | ++static SmallVector<Value> |
| 166 | ++emitBaseIndexWithinCTAForMmaLayoutV1(Location loc, RewriterBase &rewriter, |
| 167 | ++ const NvidiaMmaEncodingAttr &mmaLayout, |
| 168 | ++ RankedTensorType type) { |
| 169 | + auto shape = type.getShape(); |
| 170 | + auto wpt = mmaLayout.getWarpsPerCTA(); |
| 171 | + static constexpr std::array<int, 3> fpw{{2, 2, 1}}; |
| 172 | +@@ -654,9 +652,10 @@ emitOffsetForMmaLayoutV2(const NvidiaMma |
| 173 | + return ret; |
| 174 | + } |
| 175 | + |
| 176 | +-static SmallVector<Value> emitBaseIndexWithinCTAForMmaLayoutV2V3( |
| 177 | +- Location loc, ConversionPatternRewriter &rewriter, |
| 178 | +- const NvidiaMmaEncodingAttr &mmaLayout, RankedTensorType type) { |
| 179 | ++static SmallVector<Value> |
| 180 | ++emitBaseIndexWithinCTAForMmaLayoutV2V3(Location loc, RewriterBase &rewriter, |
| 181 | ++ const NvidiaMmaEncodingAttr &mmaLayout, |
| 182 | ++ RankedTensorType type) { |
| 183 | + auto shape = type.getShape(); |
| 184 | + auto _warpsPerCTA = mmaLayout.getWarpsPerCTA(); |
| 185 | + auto rank = shape.size(); |
| 186 | +@@ -776,9 +775,10 @@ emitOffsetForSliceLayout(const SliceEnco |
| 187 | + // Get offsets / indices for any layout |
| 188 | + // ----------------------------------------------------------------------- |
| 189 | + |
| 190 | +-static SmallVector<Value> |
| 191 | +-emitCTAOffsetForLayout(Location loc, ConversionPatternRewriter &rewriter, |
| 192 | +- Attribute layout, ArrayRef<int64_t> shape) { |
| 193 | ++static SmallVector<Value> emitCTAOffsetForLayout(Location loc, |
| 194 | ++ RewriterBase &rewriter, |
| 195 | ++ Attribute layout, |
| 196 | ++ ArrayRef<int64_t> shape) { |
| 197 | + unsigned rank = shape.size(); |
| 198 | + SmallVector<unsigned> CTAsPerCGA = triton::gpu::getCTAsPerCGA(layout); |
| 199 | + SmallVector<unsigned> CTASplitNum = triton::gpu::getCTASplitNum(layout); |
| 200 | +@@ -806,13 +806,12 @@ emitCTAOffsetForLayout(Location loc, Con |
| 201 | + } |
| 202 | + |
| 203 | + static SmallVector<Value> |
| 204 | +-emitBaseIndexForLayout(Location loc, ConversionPatternRewriter &rewriter, |
| 205 | +- Attribute layout, RankedTensorType type, |
| 206 | +- bool withCTAOffset) { |
| 207 | ++emitBaseIndexForLayout(Location loc, RewriterBase &rewriter, Attribute layout, |
| 208 | ++ RankedTensorType type, bool withCTAOffset) { |
| 209 | + auto shape = type.getShape(); |
| 210 | + |
| 211 | + SmallVector<Value> baseIndex; |
| 212 | +- ConversionPatternRewriter::InsertionGuard guard(rewriter); |
| 213 | ++ RewriterBase::InsertionGuard guard(rewriter); |
| 214 | + SmallVector<Value> result; |
| 215 | + if (auto blockedLayout = layout.dyn_cast<BlockedEncodingAttr>()) { |
| 216 | + result = emitBaseIndexWithinCTAForBlockedLayout(loc, rewriter, |
| 217 | +@@ -866,7 +865,7 @@ emitOffsetForLayout(Attribute layout, Ra |
| 218 | + // Emit indices calculation within each ConversionPattern, and returns a |
| 219 | + // [elemsPerThread X rank] index matrix. |
| 220 | + static SmallVector<SmallVector<Value>> |
| 221 | +-emitIndices(Location loc, ConversionPatternRewriter &rewriter, Attribute layout, |
| 222 | ++emitIndices(Location loc, RewriterBase &rewriter, Attribute layout, |
| 223 | + RankedTensorType type, bool withCTAOffset) { |
| 224 | + // step 1, delinearize threadId to get the base index |
| 225 | + auto multiDimBase = |
| 226 | +@@ -892,7 +891,7 @@ emitIndices(Location loc, ConversionPatt |
| 227 | + DenseMap<unsigned, Value> static getSwizzledSharedPtrs( |
| 228 | + Location loc, unsigned inVec, RankedTensorType srcTy, |
| 229 | + triton::gpu::SharedEncodingAttr resSharedLayout, Type resElemTy, |
| 230 | +- SharedMemoryObject smemObj, ConversionPatternRewriter &rewriter, |
| 231 | ++ SharedMemoryObject smemObj, RewriterBase &rewriter, |
| 232 | + SmallVectorImpl<Value> &offsetVals, SmallVectorImpl<Value> &srcStrides) { |
| 233 | + // This utility computes the pointers for accessing the provided swizzled |
| 234 | + // shared memory layout `resSharedLayout`. More specifically, it computes, |
| 235 | +diff --git a/unittest/Conversion/TritonGPUToLLVM/DumpLayout.cpp b/unittest/Conversion/TritonGPUToLLVM/DumpLayout.cpp |
| 236 | +--- a/unittest/Conversion/TritonGPUToLLVM/DumpLayout.cpp |
| 237 | ++++ b/unittest/Conversion/TritonGPUToLLVM/DumpLayout.cpp |
| 238 | +@@ -78,7 +78,7 @@ private: |
| 239 | + LowerToLLVMOptions option; |
| 240 | + TritonGPUToLLVMTypeConverter typeConverter; |
| 241 | + Block block; |
| 242 | +- ConversionPatternRewriter rewriter; |
| 243 | ++ IRRewriter rewriter; |
| 244 | + Location loc; |
| 245 | + }; |
| 246 | + |
0 commit comments