Skip to content

Commit ae724f5

Browse files
committed
[mlir][AMDGPU] Improve amdgpu.lds_barrier, add warnings
On some architectures (currently gfx90a, gfx94*, and gfx10**), we can implement an LDS barrier using compiler intrinsics instead of inline assembly, improving optimization possibilities and decreasing the fragility of the underlying code. Other AMDGPU chipsets continue to require inline assembly to implement this barrier, as, by default, the LLVM backend will insert waits on global memory (s_waitcnt vmcnt(0)) before barriers in order to ensure memory watchpoints set by debuggers work correctly. Use of amdgpu.lds_barrier, on these architectures, imposes a tradeoff between debuggability and performance. The documentation, as well as the generated inline assembly, have been updated to explicitly call attention to this fact. For chipsets that did not require the inline assembly hack, we move to the s.waitcnt and s.barrier intrinsics, which have been added to the ROCDL dialect. The magic constants used as an argument to the waitcnt intrinsic can be derived from llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
1 parent a300b24 commit ae724f5

File tree

6 files changed

+136
-33
lines changed

6 files changed

+136
-33
lines changed

mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -424,6 +424,11 @@ def AMDGPU_LDSBarrierOp : AMDGPU_Op<"lds_barrier"> {
424424
to complete before execution continues. Therefore, it should be used when
425425
operations on global memory can be issued far in advance of when their results
426426
are used (for example, by writing them to LDS).
427+
428+
WARNING: On architectures that do not support the BackOffBarrier feature,
429+
(those which will implement this barrier by emitting inline assembly),
430+
use of this operation will impede the usability of memory watches (including
431+
breakpoints set on variables) when debugging.
427432
}];
428433
let assemblyFormat = "attr-dict";
429434
}

mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -180,6 +180,23 @@ def ROCDL_GridDimZOp : ROCDL_DeviceFunctionOp<"grid.dim.z",
180180
//===----------------------------------------------------------------------===//
181181
// Synchronization primitives
182182

183+
// Emits the waitcnt instruction. The bitfield's semantics depend
184+
// on the target chipset
185+
def ROCDL_WaitcntOp : ROCDL_Op<"waitcnt">, Arguments<(ins I32:$bitfield)> {
186+
string llvmBuilder = [{
187+
createIntrinsicCall(builder, llvm::Intrinsic::amdgcn_s_waitcnt,
188+
{$bitfield});
189+
}];
190+
let assemblyFormat = "attr-dict $bitfield";
191+
}
192+
193+
def ROCDL_SBarrierOp : ROCDL_Op<"s.barrier"> {
194+
string llvmBuilder = [{
195+
createIntrinsicCall(builder, llvm::Intrinsic::amdgcn_s_barrier);
196+
}];
197+
let assemblyFormat = "attr-dict";
198+
}
199+
183200
def ROCDL_BarrierOp : ROCDL_Op<"barrier"> {
184201
string llvmBuilder = [{
185202
llvm::LLVMContext &llvmContext = builder.getContext();

mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp

Lines changed: 48 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -270,21 +270,55 @@ struct RawBufferOpLowering : public ConvertOpToLLVMPattern<GpuOp> {
270270
};
271271

272272
struct LDSBarrierOpLowering : public ConvertOpToLLVMPattern<LDSBarrierOp> {
273-
using ConvertOpToLLVMPattern<LDSBarrierOp>::ConvertOpToLLVMPattern;
273+
LDSBarrierOpLowering(LLVMTypeConverter &converter, Chipset chipset)
274+
: ConvertOpToLLVMPattern<LDSBarrierOp>(converter), chipset(chipset) {}
275+
276+
Chipset chipset;
274277

275278
LogicalResult
276279
matchAndRewrite(LDSBarrierOp op, LDSBarrierOp::Adaptor adaptor,
277280
ConversionPatternRewriter &rewriter) const override {
278-
auto asmDialectAttr = LLVM::AsmDialectAttr::get(rewriter.getContext(),
279-
LLVM::AsmDialect::AD_ATT);
280-
const char *asmStr = "s_waitcnt lgkmcnt(0)\ns_barrier";
281-
const char *constraints = "";
282-
rewriter.replaceOpWithNewOp<LLVM::InlineAsmOp>(
283-
op,
284-
/*resultTypes=*/TypeRange(), /*operands=*/ValueRange(),
285-
/*asm_string=*/asmStr, constraints, /*has_side_effects=*/true,
286-
/*is_align_stack=*/false, /*asm_dialect=*/asmDialectAttr,
287-
/*operand_attrs=*/ArrayAttr());
281+
bool requiresInlineAsm =
282+
chipset.majorVersion < 9 ||
283+
(chipset.majorVersion == 9 && chipset.minorVersion < 0x0a) ||
284+
(chipset.majorVersion == 11);
285+
286+
if (requiresInlineAsm) {
287+
auto asmDialectAttr = LLVM::AsmDialectAttr::get(rewriter.getContext(),
288+
LLVM::AsmDialect::AD_ATT);
289+
const char *asmStr =
290+
";;;WARNING: BREAKS DEBUG WATCHES\ns_waitcnt lgkmcnt(0)\ns_barrier";
291+
const char *constraints = "";
292+
rewriter.replaceOpWithNewOp<LLVM::InlineAsmOp>(
293+
op,
294+
/*resultTypes=*/TypeRange(), /*operands=*/ValueRange(),
295+
/*asm_string=*/asmStr, constraints, /*has_side_effects=*/true,
296+
/*is_align_stack=*/false, /*asm_dialect=*/asmDialectAttr,
297+
/*operand_attrs=*/ArrayAttr());
298+
return success();
299+
}
300+
constexpr int32_t ldsOnlyBitsGfx6789 = ~(0x1f << 8);
301+
constexpr int32_t ldsOnlyBitsGfx10 = ~(0x3f << 8);
302+
// Left in place in case someone disables the inline ASM path or future
303+
// chipsets use the same bit pattern.
304+
constexpr int32_t ldsOnlyBitsGfx11 = ~(0x3f << 4);
305+
306+
int32_t ldsOnlyBits;
307+
if (chipset.majorVersion == 11)
308+
ldsOnlyBits = ldsOnlyBitsGfx11;
309+
else if (chipset.majorVersion == 10)
310+
ldsOnlyBits = ldsOnlyBitsGfx10;
311+
else if (chipset.majorVersion <= 9)
312+
ldsOnlyBits = ldsOnlyBitsGfx6789;
313+
else
314+
return op.emitOpError(
315+
"don't know how to lower this for chipset major version")
316+
<< chipset.majorVersion;
317+
318+
Location loc = op->getLoc();
319+
Value constant = createI32Constant(rewriter, loc, ldsOnlyBits);
320+
rewriter.create<ROCDL::WaitcntOp>(loc, constant);
321+
rewriter.replaceOpWithNewOp<ROCDL::SBarrierOp>(op);
288322
return success();
289323
}
290324
};
@@ -834,7 +868,6 @@ void mlir::populateAMDGPUToROCDLConversionPatterns(LLVMTypeConverter &converter,
834868
return converter.convertType(t.clone(IntegerType::get(t.getContext(), 16)));
835869
});
836870

837-
patterns.add<LDSBarrierOpLowering>(converter);
838871
patterns
839872
.add<RawBufferOpLowering<RawBufferLoadOp, ROCDL::RawPtrBufferLoadOp>,
840873
RawBufferOpLowering<RawBufferStoreOp, ROCDL::RawPtrBufferStoreOp>,
@@ -848,9 +881,9 @@ void mlir::populateAMDGPUToROCDLConversionPatterns(LLVMTypeConverter &converter,
848881
ROCDL::RawPtrBufferAtomicUminOp>,
849882
RawBufferOpLowering<RawBufferAtomicCmpswapOp,
850883
ROCDL::RawPtrBufferAtomicCmpSwap>,
851-
MFMAOpLowering, WMMAOpLowering, ExtPackedFp8OpLowering,
852-
PackedTrunc2xFp8OpLowering, PackedStochRoundFp8OpLowering>(converter,
853-
chipset);
884+
LDSBarrierOpLowering, MFMAOpLowering, WMMAOpLowering,
885+
ExtPackedFp8OpLowering, PackedTrunc2xFp8OpLowering,
886+
PackedStochRoundFp8OpLowering>(converter, chipset);
854887
}
855888

856889
std::unique_ptr<Pass> mlir::createConvertAMDGPUToROCDLPass() {

mlir/test/Conversion/AMDGPUToROCDL/amdgpu-to-rocdl.mlir

Lines changed: 36 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,13 @@
1-
// RUN: mlir-opt %s -convert-amdgpu-to-rocdl=chipset=gfx908 | FileCheck %s
2-
// RUN: mlir-opt %s -convert-amdgpu-to-rocdl=chipset=gfx1030 | FileCheck %s --check-prefix=RDNA
3-
// RUN: mlir-opt %s -convert-amdgpu-to-rocdl=chipset=gfx1100 | FileCheck %s --check-prefix=RDNA
1+
// RUN: mlir-opt %s -convert-amdgpu-to-rocdl=chipset=gfx908 | FileCheck %s --check-prefixes=CHECK,GFX9,GFX908
2+
// RUN: mlir-opt %s -convert-amdgpu-to-rocdl=chipset=gfx90a | FileCheck %s --check-prefixes=CHECK,GFX9,GFX90A
3+
// RUN: mlir-opt %s -convert-amdgpu-to-rocdl=chipset=gfx1030 | FileCheck %s --check-prefixes=CHECK,GFX10,RDNA
4+
// RUN: mlir-opt %s -convert-amdgpu-to-rocdl=chipset=gfx1100 | FileCheck %s --check-prefixes=CHECK,GFX11,RDNA
45

56
// CHECK-LABEL: func @gpu_gcn_raw_buffer_load_scalar_i32
67
func.func @gpu_gcn_raw_buffer_load_scalar_i32(%buf: memref<i32>) -> i32 {
78
// CHECK: %[[stride:.*]] = llvm.mlir.constant(0 : i16)
89
// CHECK: %[[numRecords:.*]] = llvm.mlir.constant(4 : i32)
9-
// CHECK: %[[flags:.*]] = llvm.mlir.constant(159744 : i32)
10+
// GFX9: %[[flags:.*]] = llvm.mlir.constant(159744 : i32)
1011
// RDNA: %[[flags:.*]] = llvm.mlir.constant(822243328 : i32)
1112
// CHECK: %[[resource:.*]] = rocdl.make.buffer.rsrc %{{.*}}, %[[stride]], %[[numRecords]], %[[flags]] : !llvm.ptr to <8>
1213
// CHECK: %[[ret:.*]] = rocdl.raw.ptr.buffer.load %[[resource]], %{{.*}}, %{{.*}}, %{{.*}} : i32
@@ -19,7 +20,7 @@ func.func @gpu_gcn_raw_buffer_load_scalar_i32(%buf: memref<i32>) -> i32 {
1920
func.func @gpu_gcn_raw_buffer_load_i32(%buf: memref<64xi32>, %idx: i32) -> i32 {
2021
// CHECK: %[[stride:.*]] = llvm.mlir.constant(0 : i16)
2122
// CHECK: %[[numRecords:.*]] = llvm.mlir.constant(256 : i32)
22-
// CHECK: %[[flags:.*]] = llvm.mlir.constant(159744 : i32)
23+
// GFX9: %[[flags:.*]] = llvm.mlir.constant(159744 : i32)
2324
// RDNA: %[[flags:.*]] = llvm.mlir.constant(822243328 : i32)
2425
// CHECK: %[[resource:.*]] = rocdl.make.buffer.rsrc %{{.*}}, %[[stride]], %[[numRecords]], %[[flags]] : !llvm.ptr to <8>
2526
// CHECK: %[[ret:.*]] = rocdl.raw.ptr.buffer.load %[[resource]], %{{.*}}, %{{.*}}, %{{.*}} : i32
@@ -30,11 +31,11 @@ func.func @gpu_gcn_raw_buffer_load_i32(%buf: memref<64xi32>, %idx: i32) -> i32 {
3031

3132
// CHECK-LABEL: func @gpu_gcn_raw_buffer_load_i32_oob_off
3233
func.func @gpu_gcn_raw_buffer_load_i32_oob_off(%buf: memref<64xi32>, %idx: i32) -> i32 {
33-
// CHECK: %[[flags:.*]] = llvm.mlir.constant(159744 : i32)
34+
// GFX9: %[[flags:.*]] = llvm.mlir.constant(159744 : i32)
3435
// RDNA: %[[flags:.*]] = llvm.mlir.constant(553807872 : i32)
35-
// RDNA: %[[resource:.*]] = rocdl.make.buffer.rsrc %{{.*}}, %{{.*}}, %{{.*}}, %[[flags]]
36-
// RDNA: %[[ret:.*]] = rocdl.raw.ptr.buffer.load %[[resource]], %{{.*}}, %{{.*}}, %{{.*}} : i32
37-
// RDNA: return %[[ret]]
36+
// CHECK: %[[resource:.*]] = rocdl.make.buffer.rsrc %{{.*}}, %{{.*}}, %{{.*}}, %[[flags]]
37+
// CHECK: %[[ret:.*]] = rocdl.raw.ptr.buffer.load %[[resource]], %{{.*}}, %{{.*}}, %{{.*}} : i32
38+
// CHECK: return %[[ret]]
3839
%0 = amdgpu.raw_buffer_load {boundsCheck = false} %buf[%idx] : memref<64xi32>, i32 -> i32
3940
func.return %0 : i32
4041
}
@@ -103,7 +104,8 @@ func.func @gpu_gcn_raw_buffer_load_4xf8E4M3FNUZ(%buf: memref<64xf8E4M3FNUZ>, %id
103104
// Since the lowering logic is shared with loads, only bitcasts need to be rechecked
104105
// CHECK-LABEL: func @gpu_gcn_raw_buffer_store_scalar_i32
105106
func.func @gpu_gcn_raw_buffer_store_scalar_i32(%value: i32, %buf: memref<i32>) {
106-
// CHECK: %[[flags:.*]] = llvm.mlir.constant(159744 : i32)
107+
// GFX9: %[[flags:.*]] = llvm.mlir.constant(159744 : i32)
108+
// RDNA: %[[flags:.*]] = llvm.mlir.constant(822243328 : i32)
107109
// CHECK: %[[resource:.*]] = rocdl.make.buffer.rsrc %{{.*}}, %{{.*}}, %{{.*}}, %[[flags]]
108110
// CHECK: rocdl.raw.ptr.buffer.store %{{.*}}, %[[resource]], %{{.*}}, %{{.*}}, %{{.*}} : i32
109111
amdgpu.raw_buffer_store {boundsCheck = true} %value -> %buf[] : i32 -> memref<i32>
@@ -113,7 +115,8 @@ func.func @gpu_gcn_raw_buffer_store_scalar_i32(%value: i32, %buf: memref<i32>) {
113115
// CHECK-LABEL: func @gpu_gcn_raw_buffer_store_i32
114116
func.func @gpu_gcn_raw_buffer_store_i32(%value: i32, %buf: memref<64xi32>, %idx: i32) {
115117
// CHECK: %[[numRecords:.*]] = llvm.mlir.constant(256 : i32)
116-
// CHECK: %[[flags:.*]] = llvm.mlir.constant(159744 : i32)
118+
// GFX9: %[[flags:.*]] = llvm.mlir.constant(159744 : i32)
119+
// RDNA: %[[flags:.*]] = llvm.mlir.constant(822243328 : i32)
117120
// CHECK: %[[resource:.*]] = rocdl.make.buffer.rsrc %{{.*}}, %{{.*}}, %[[numRecords]], %[[flags]]
118121
// CHECK: rocdl.raw.ptr.buffer.store %{{.*}}, %[[resource]], %{{.*}}, %{{.*}}, %{{.*}} : i32
119122
amdgpu.raw_buffer_store {boundsCheck = true} %value -> %buf[%idx] : i32 -> memref<64xi32>, i32
@@ -140,7 +143,8 @@ func.func @gpu_gcn_raw_buffer_store_16xi8(%value: vector<16xi8>, %buf: memref<64
140143
// CHECK-LABEL: func @gpu_gcn_raw_buffer_atomic_fadd_f32
141144
func.func @gpu_gcn_raw_buffer_atomic_fadd_f32(%value: f32, %buf: memref<64xf32>, %idx: i32) {
142145
// CHECK: %[[numRecords:.*]] = llvm.mlir.constant(256 : i32)
143-
// CHECK: %[[flags:.*]] = llvm.mlir.constant(159744 : i32)
146+
// GFX9: %[[flags:.*]] = llvm.mlir.constant(159744 : i32)
147+
// RDNA: %[[flags:.*]] = llvm.mlir.constant(822243328 : i32)
144148
// CHECK: %[[resource:.*]] = rocdl.make.buffer.rsrc %{{.*}}, %{{.*}}, %[[numRecords]], %[[flags]]
145149
// CHECK: rocdl.raw.ptr.buffer.atomic.fadd %{{.*}}, %[[resource]], %{{.*}}, %{{.*}}, %{{.*}} : f32
146150
amdgpu.raw_buffer_atomic_fadd {boundsCheck = true} %value -> %buf[%idx] : f32 -> memref<64xf32>, i32
@@ -150,7 +154,8 @@ func.func @gpu_gcn_raw_buffer_atomic_fadd_f32(%value: f32, %buf: memref<64xf32>,
150154
// CHECK-LABEL: func @gpu_gcn_raw_buffer_atomic_fmax_f32
151155
func.func @gpu_gcn_raw_buffer_atomic_fmax_f32(%value: f32, %buf: memref<64xf32>, %idx: i32) {
152156
// CHECK: %[[numRecords:.*]] = llvm.mlir.constant(256 : i32)
153-
// CHECK: %[[flags:.*]] = llvm.mlir.constant(159744 : i32)
157+
// GFX9: %[[flags:.*]] = llvm.mlir.constant(159744 : i32)
158+
// RDNA: %[[flags:.*]] = llvm.mlir.constant(822243328 : i32)
154159
// CHECK: %[[resource:.*]] = rocdl.make.buffer.rsrc %{{.*}}, %{{.*}}, %[[numRecords]], %[[flags]]
155160
// CHECK: rocdl.raw.ptr.buffer.atomic.fmax %{{.*}}, %[[resource]], %{{.*}}, %{{.*}}, %{{.*}} : f32
156161
amdgpu.raw_buffer_atomic_fmax {boundsCheck = true} %value -> %buf[%idx] : f32 -> memref<64xf32>, i32
@@ -160,7 +165,8 @@ func.func @gpu_gcn_raw_buffer_atomic_fmax_f32(%value: f32, %buf: memref<64xf32>,
160165
// CHECK-LABEL: func @gpu_gcn_raw_buffer_atomic_smax_i32
161166
func.func @gpu_gcn_raw_buffer_atomic_smax_i32(%value: i32, %buf: memref<64xi32>, %idx: i32) {
162167
// CHECK: %[[numRecords:.*]] = llvm.mlir.constant(256 : i32)
163-
// CHECK: %[[flags:.*]] = llvm.mlir.constant(159744 : i32)
168+
// GFX9: %[[flags:.*]] = llvm.mlir.constant(159744 : i32)
169+
// RDNA: %[[flags:.*]] = llvm.mlir.constant(822243328 : i32)
164170
// CHECK: %[[resource:.*]] = rocdl.make.buffer.rsrc %{{.*}}, %{{.*}}, %[[numRecords]], %[[flags]]
165171
// CHECK: rocdl.raw.ptr.buffer.atomic.smax %{{.*}} %[[resource]], %{{.*}}, %{{.*}}, %{{.*}} : i32
166172
amdgpu.raw_buffer_atomic_smax {boundsCheck = true} %value -> %buf[%idx] : i32 -> memref<64xi32>, i32
@@ -170,7 +176,8 @@ func.func @gpu_gcn_raw_buffer_atomic_smax_i32(%value: i32, %buf: memref<64xi32>,
170176
// CHECK-LABEL: func @gpu_gcn_raw_buffer_atomic_umin_i32
171177
func.func @gpu_gcn_raw_buffer_atomic_umin_i32(%value: i32, %buf: memref<64xi32>, %idx: i32) {
172178
// CHECK: %[[numRecords:.*]] = llvm.mlir.constant(256 : i32)
173-
// CHECK: %[[flags:.*]] = llvm.mlir.constant(159744 : i32)
179+
// GFX9: %[[flags:.*]] = llvm.mlir.constant(159744 : i32)
180+
// RDNA: %[[flags:.*]] = llvm.mlir.constant(822243328 : i32)
174181
// CHECK: %[[resource:.*]] = rocdl.make.buffer.rsrc %{{.*}}, %{{.*}}, %[[numRecords]], %[[flags]]
175182
// CHECK: rocdl.raw.ptr.buffer.atomic.umin %{{.*}} %[[resource]], %{{.*}}, %{{.*}}, %{{.*}} : i32
176183
amdgpu.raw_buffer_atomic_umin {boundsCheck = true} %value -> %buf[%idx] : i32 -> memref<64xi32>, i32
@@ -183,7 +190,8 @@ func.func @amdgpu_raw_buffer_atomic_cmpswap_f32(%src : f32, %cmp : f32, %buf : m
183190
// CHECK: %[[srcCast:.*]] = llvm.bitcast %[[src]] : f32 to i32
184191
// CHECK: %[[cmpCast:.*]] = llvm.bitcast %[[cmp]] : f32 to i32
185192
// CHECK: %[[numRecords:.*]] = llvm.mlir.constant(256 : i32)
186-
// CHECK: %[[flags:.*]] = llvm.mlir.constant(159744 : i32)
193+
// GFX9: %[[flags:.*]] = llvm.mlir.constant(159744 : i32)
194+
// RDNA: %[[flags:.*]] = llvm.mlir.constant(822243328 : i32)
187195
// CHECK: %[[resource:.*]] = rocdl.make.buffer.rsrc %{{.*}}, %{{.*}}, %[[numRecords]], %[[flags]]
188196
// CHECK: %[[dst:.*]] = rocdl.raw.ptr.buffer.atomic.cmpswap %[[srcCast]], %[[cmpCast]], %[[resource]], %{{.*}}, %{{.*}}, %{{.*}} : i32
189197
// CHECK: %[[dstCast:.*]] = llvm.bitcast %[[dst]] : i32 to f32
@@ -196,7 +204,8 @@ func.func @amdgpu_raw_buffer_atomic_cmpswap_f32(%src : f32, %cmp : f32, %buf : m
196204
// CHECK-SAME: (%[[src:.*]]: i64, %[[cmp:.*]]: i64, {{.*}})
197205
func.func @amdgpu_raw_buffer_atomic_cmpswap_i64(%src : i64, %cmp : i64, %buf : memref<64xi64>, %idx: i32) -> i64 {
198206
// CHECK: %[[numRecords:.*]] = llvm.mlir.constant(512 : i32)
199-
// CHECK: %[[flags:.*]] = llvm.mlir.constant(159744 : i32)
207+
// GFX9: %[[flags:.*]] = llvm.mlir.constant(159744 : i32)
208+
// RDNA: %[[flags:.*]] = llvm.mlir.constant(822243328 : i32)
200209
// CHECK: %[[resource:.*]] = rocdl.make.buffer.rsrc %{{.*}}, %{{.*}}, %[[numRecords]], %[[flags]]
201210
// CHECK: %[[dst:.*]] = rocdl.raw.ptr.buffer.atomic.cmpswap %[[src]], %[[cmp]], %[[resource]], %{{.*}}, %{{.*}}, %{{.*}} : i64
202211
// CHECK: return %[[dst]]
@@ -206,7 +215,16 @@ func.func @amdgpu_raw_buffer_atomic_cmpswap_i64(%src : i64, %cmp : i64, %buf : m
206215

207216
// CHECK-LABEL: func @lds_barrier
208217
func.func @lds_barrier() {
209-
// CHECK: llvm.inline_asm has_side_effects asm_dialect = att "s_waitcnt lgkmcnt(0)\0As_barrier"
218+
// GFX908: llvm.inline_asm has_side_effects asm_dialect = att
219+
// GFX908-SAME: ";;;WARNING: BREAKS DEBUG WATCHES\0As_waitcnt lgkmcnt(0)\0As_barrier"
220+
// GFX90A: %[[cst:.*]] = llvm.mlir.constant(-7937 : i32) : i32
221+
// GFX90A: rocdl.waitcnt %[[cst]]
222+
// GFX90A-NEXT: rocdl.s.barrier
223+
// GFX10: %[[cst:.*]] = llvm.mlir.constant(-16129 : i32) : i32
224+
// GFX10: rocdl.waitcnt %[[cst]]
225+
// GFX10-NEXT: rocdl.s.barrier
226+
// GFX11: llvm.inline_asm has_side_effects asm_dialect = att
227+
// GFX11-SAME: ";;;WARNING: BREAKS DEBUG WATCHES\0As_waitcnt lgkmcnt(0)\0As_barrier"
210228
amdgpu.lds_barrier
211229
func.return
212230
}

mlir/test/Dialect/LLVMIR/rocdl.mlir

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -351,6 +351,19 @@ llvm.func @rocdl_8bit_floats(%source: i32, %stoch: i32) -> i32 {
351351
llvm.return %source5 : i32
352352
}
353353

354+
llvm.func @rocdl.waitcnt(%arg0 : i32) {
355+
// CHECK-LABEL: rocdl.waitcnt
356+
// CHECK: rocdl.waitcnt
357+
rocdl.waitcnt %arg0
358+
llvm.return
359+
}
360+
361+
llvm.func @rocdl.s.barrier() {
362+
// CHECK-LABEL: rocdl.s.barrier
363+
// CHECK: rocdl.s.barrier
364+
rocdl.s.barrier
365+
llvm.return
366+
}
354367
// -----
355368

356369
// expected-error@below {{attribute attached to unexpected op}}

mlir/test/Target/LLVMIR/rocdl.mlir

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -82,7 +82,24 @@ llvm.func @rocdl.bpermute(%src : i32) -> i32 {
8282
llvm.return %0 : i32
8383
}
8484

85+
llvm.func @rocdl.waitcnt() {
86+
// CHECK-LABEL: rocdl.waitcnt
87+
// CHECK-NEXT: call void @llvm.amdgcn.s.waitcnt(i32 0)
88+
%0 = llvm.mlir.constant(0 : i32) : i32
89+
rocdl.waitcnt %0
90+
llvm.return
91+
}
92+
93+
llvm.func @rocdl.s.barrier() {
94+
// CHECK-LABEL: rocdl.s.barrier
95+
// CHECK-NEXT: call void @llvm.amdgcn.s.barrier()
96+
rocdl.s.barrier
97+
llvm.return
98+
}
99+
100+
85101
llvm.func @rocdl.barrier() {
102+
// CHECK-LABEL: rocdl.barrier
86103
// CHECK: fence syncscope("workgroup") release
87104
// CHECK-NEXT: call void @llvm.amdgcn.s.barrier()
88105
// CHECK-NEXT: fence syncscope("workgroup") acquire

0 commit comments

Comments
 (0)