Skip to content

Commit b05c152

Browse files
authored
[mlir][AMDGPU] Improve amdgpu.lds_barrier, add warnings (#77942)
On some architectures (currently gfx90a, gfx94*, and gfx10**), we can implement an LDS barrier using compiler intrinsics instead of inline assembly, improving optimization possibilities and decreasing the fragility of the underlying code. Other AMDGPU chipsets continue to require inline assembly to implement this barrier, as, by default, the LLVM backend will insert waits on global memory (s_waitcnt vmcnt(0)) before barriers in order to ensure memory watchpoints set by debuggers work correctly. Use of amdgpu.lds_barrier, on these architectures, imposes a tradeoff between debuggability and performance. The documentation, as well as the generated inline assembly, have been updated to explicitly call attention to this fact. For chipsets that did not require the inline assembly hack, we move to the s.waitcnt and s.barrier intrinsics, which have been added to the ROCDL dialect. The magic constants used as an argument to the waitcnt intrinsic can be derived from llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
1 parent 63af858 commit b05c152

File tree

6 files changed

+132
-33
lines changed

6 files changed

+132
-33
lines changed

mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -441,6 +441,11 @@ def AMDGPU_LDSBarrierOp : AMDGPU_Op<"lds_barrier"> {
441441
to complete before execution continues. Therefore, it should be used when
442442
operations on global memory can be issued far in advance of when their results
443443
are used (for example, by writing them to LDS).
444+
445+
WARNING: On architectures that do not support the BackOffBarrier feature,
446+
(those which will implement this barrier by emitting inline assembly),
447+
use of this operation will impede the usability of memory watches (including
448+
breakpoints set on variables) when debugging.
444449
}];
445450
let assemblyFormat = "attr-dict";
446451
}

mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -194,6 +194,23 @@ def ROCDL_GridDimZOp : ROCDL_DeviceFunctionOp<"grid.dim.z",
194194
//===----------------------------------------------------------------------===//
195195
// Synchronization primitives
196196

197+
// Emits the waitcnt instruction. The bitfield's semantics depend
198+
// on the target chipset
199+
def ROCDL_WaitcntOp : ROCDL_Op<"waitcnt">, Arguments<(ins I32Attr:$bitfield)> {
200+
string llvmBuilder = [{
201+
createIntrinsicCall(builder, llvm::Intrinsic::amdgcn_s_waitcnt,
202+
{builder.getInt32($bitfield)});
203+
}];
204+
let assemblyFormat = "attr-dict $bitfield";
205+
}
206+
207+
def ROCDL_SBarrierOp : ROCDL_Op<"s.barrier"> {
208+
string llvmBuilder = [{
209+
createIntrinsicCall(builder, llvm::Intrinsic::amdgcn_s_barrier);
210+
}];
211+
let assemblyFormat = "attr-dict";
212+
}
213+
197214
def ROCDL_BarrierOp : ROCDL_Op<"barrier"> {
198215
string llvmBuilder = [{
199216
llvm::LLVMContext &llvmContext = builder.getContext();

mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp

Lines changed: 47 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -270,21 +270,54 @@ struct RawBufferOpLowering : public ConvertOpToLLVMPattern<GpuOp> {
270270
};
271271

272272
struct LDSBarrierOpLowering : public ConvertOpToLLVMPattern<LDSBarrierOp> {
273-
using ConvertOpToLLVMPattern<LDSBarrierOp>::ConvertOpToLLVMPattern;
273+
LDSBarrierOpLowering(LLVMTypeConverter &converter, Chipset chipset)
274+
: ConvertOpToLLVMPattern<LDSBarrierOp>(converter), chipset(chipset) {}
275+
276+
Chipset chipset;
274277

275278
LogicalResult
276279
matchAndRewrite(LDSBarrierOp op, LDSBarrierOp::Adaptor adaptor,
277280
ConversionPatternRewriter &rewriter) const override {
278-
auto asmDialectAttr = LLVM::AsmDialectAttr::get(rewriter.getContext(),
279-
LLVM::AsmDialect::AD_ATT);
280-
const char *asmStr = "s_waitcnt lgkmcnt(0)\ns_barrier";
281-
const char *constraints = "";
282-
rewriter.replaceOpWithNewOp<LLVM::InlineAsmOp>(
283-
op,
284-
/*resultTypes=*/TypeRange(), /*operands=*/ValueRange(),
285-
/*asm_string=*/asmStr, constraints, /*has_side_effects=*/true,
286-
/*is_align_stack=*/false, /*asm_dialect=*/asmDialectAttr,
287-
/*operand_attrs=*/ArrayAttr());
281+
bool requiresInlineAsm =
282+
chipset.majorVersion < 9 ||
283+
(chipset.majorVersion == 9 && chipset.minorVersion < 0x0a) ||
284+
(chipset.majorVersion == 11);
285+
286+
if (requiresInlineAsm) {
287+
auto asmDialectAttr = LLVM::AsmDialectAttr::get(rewriter.getContext(),
288+
LLVM::AsmDialect::AD_ATT);
289+
const char *asmStr =
290+
";;;WARNING: BREAKS DEBUG WATCHES\ns_waitcnt lgkmcnt(0)\ns_barrier";
291+
const char *constraints = "";
292+
rewriter.replaceOpWithNewOp<LLVM::InlineAsmOp>(
293+
op,
294+
/*resultTypes=*/TypeRange(), /*operands=*/ValueRange(),
295+
/*asm_string=*/asmStr, constraints, /*has_side_effects=*/true,
296+
/*is_align_stack=*/false, /*asm_dialect=*/asmDialectAttr,
297+
/*operand_attrs=*/ArrayAttr());
298+
return success();
299+
}
300+
constexpr int32_t ldsOnlyBitsGfx6789 = ~(0x1f << 8);
301+
constexpr int32_t ldsOnlyBitsGfx10 = ~(0x3f << 8);
302+
// Left in place in case someone disables the inline ASM path or future
303+
// chipsets use the same bit pattern.
304+
constexpr int32_t ldsOnlyBitsGfx11 = ~(0x3f << 4);
305+
306+
int32_t ldsOnlyBits;
307+
if (chipset.majorVersion == 11)
308+
ldsOnlyBits = ldsOnlyBitsGfx11;
309+
else if (chipset.majorVersion == 10)
310+
ldsOnlyBits = ldsOnlyBitsGfx10;
311+
else if (chipset.majorVersion <= 9)
312+
ldsOnlyBits = ldsOnlyBitsGfx6789;
313+
else
314+
return op.emitOpError(
315+
"don't know how to lower this for chipset major version")
316+
<< chipset.majorVersion;
317+
318+
Location loc = op->getLoc();
319+
rewriter.create<ROCDL::WaitcntOp>(loc, ldsOnlyBits);
320+
rewriter.replaceOpWithNewOp<ROCDL::SBarrierOp>(op);
288321
return success();
289322
}
290323
};
@@ -834,7 +867,6 @@ void mlir::populateAMDGPUToROCDLConversionPatterns(LLVMTypeConverter &converter,
834867
return converter.convertType(t.clone(IntegerType::get(t.getContext(), 16)));
835868
});
836869

837-
patterns.add<LDSBarrierOpLowering>(converter);
838870
patterns
839871
.add<RawBufferOpLowering<RawBufferLoadOp, ROCDL::RawPtrBufferLoadOp>,
840872
RawBufferOpLowering<RawBufferStoreOp, ROCDL::RawPtrBufferStoreOp>,
@@ -848,9 +880,9 @@ void mlir::populateAMDGPUToROCDLConversionPatterns(LLVMTypeConverter &converter,
848880
ROCDL::RawPtrBufferAtomicUminOp>,
849881
RawBufferOpLowering<RawBufferAtomicCmpswapOp,
850882
ROCDL::RawPtrBufferAtomicCmpSwap>,
851-
MFMAOpLowering, WMMAOpLowering, ExtPackedFp8OpLowering,
852-
PackedTrunc2xFp8OpLowering, PackedStochRoundFp8OpLowering>(converter,
853-
chipset);
883+
LDSBarrierOpLowering, MFMAOpLowering, WMMAOpLowering,
884+
ExtPackedFp8OpLowering, PackedTrunc2xFp8OpLowering,
885+
PackedStochRoundFp8OpLowering>(converter, chipset);
854886
}
855887

856888
std::unique_ptr<Pass> mlir::createConvertAMDGPUToROCDLPass() {

mlir/test/Conversion/AMDGPUToROCDL/amdgpu-to-rocdl.mlir

Lines changed: 34 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,13 @@
1-
// RUN: mlir-opt %s -convert-amdgpu-to-rocdl=chipset=gfx908 | FileCheck %s
2-
// RUN: mlir-opt %s -convert-amdgpu-to-rocdl=chipset=gfx1030 | FileCheck %s --check-prefix=RDNA
3-
// RUN: mlir-opt %s -convert-amdgpu-to-rocdl=chipset=gfx1100 | FileCheck %s --check-prefix=RDNA
1+
// RUN: mlir-opt %s -convert-amdgpu-to-rocdl=chipset=gfx908 | FileCheck %s --check-prefixes=CHECK,GFX9,GFX908
2+
// RUN: mlir-opt %s -convert-amdgpu-to-rocdl=chipset=gfx90a | FileCheck %s --check-prefixes=CHECK,GFX9,GFX90A
3+
// RUN: mlir-opt %s -convert-amdgpu-to-rocdl=chipset=gfx1030 | FileCheck %s --check-prefixes=CHECK,GFX10,RDNA
4+
// RUN: mlir-opt %s -convert-amdgpu-to-rocdl=chipset=gfx1100 | FileCheck %s --check-prefixes=CHECK,GFX11,RDNA
45

56
// CHECK-LABEL: func @gpu_gcn_raw_buffer_load_scalar_i32
67
func.func @gpu_gcn_raw_buffer_load_scalar_i32(%buf: memref<i32>) -> i32 {
78
// CHECK: %[[stride:.*]] = llvm.mlir.constant(0 : i16)
89
// CHECK: %[[numRecords:.*]] = llvm.mlir.constant(4 : i32)
9-
// CHECK: %[[flags:.*]] = llvm.mlir.constant(159744 : i32)
10+
// GFX9: %[[flags:.*]] = llvm.mlir.constant(159744 : i32)
1011
// RDNA: %[[flags:.*]] = llvm.mlir.constant(822243328 : i32)
1112
// CHECK: %[[resource:.*]] = rocdl.make.buffer.rsrc %{{.*}}, %[[stride]], %[[numRecords]], %[[flags]] : !llvm.ptr to <8>
1213
// CHECK: %[[ret:.*]] = rocdl.raw.ptr.buffer.load %[[resource]], %{{.*}}, %{{.*}}, %{{.*}} : i32
@@ -19,7 +20,7 @@ func.func @gpu_gcn_raw_buffer_load_scalar_i32(%buf: memref<i32>) -> i32 {
1920
func.func @gpu_gcn_raw_buffer_load_i32(%buf: memref<64xi32>, %idx: i32) -> i32 {
2021
// CHECK: %[[stride:.*]] = llvm.mlir.constant(0 : i16)
2122
// CHECK: %[[numRecords:.*]] = llvm.mlir.constant(256 : i32)
22-
// CHECK: %[[flags:.*]] = llvm.mlir.constant(159744 : i32)
23+
// GFX9: %[[flags:.*]] = llvm.mlir.constant(159744 : i32)
2324
// RDNA: %[[flags:.*]] = llvm.mlir.constant(822243328 : i32)
2425
// CHECK: %[[resource:.*]] = rocdl.make.buffer.rsrc %{{.*}}, %[[stride]], %[[numRecords]], %[[flags]] : !llvm.ptr to <8>
2526
// CHECK: %[[ret:.*]] = rocdl.raw.ptr.buffer.load %[[resource]], %{{.*}}, %{{.*}}, %{{.*}} : i32
@@ -30,11 +31,11 @@ func.func @gpu_gcn_raw_buffer_load_i32(%buf: memref<64xi32>, %idx: i32) -> i32 {
3031

3132
// CHECK-LABEL: func @gpu_gcn_raw_buffer_load_i32_oob_off
3233
func.func @gpu_gcn_raw_buffer_load_i32_oob_off(%buf: memref<64xi32>, %idx: i32) -> i32 {
33-
// CHECK: %[[flags:.*]] = llvm.mlir.constant(159744 : i32)
34+
// GFX9: %[[flags:.*]] = llvm.mlir.constant(159744 : i32)
3435
// RDNA: %[[flags:.*]] = llvm.mlir.constant(553807872 : i32)
35-
// RDNA: %[[resource:.*]] = rocdl.make.buffer.rsrc %{{.*}}, %{{.*}}, %{{.*}}, %[[flags]]
36-
// RDNA: %[[ret:.*]] = rocdl.raw.ptr.buffer.load %[[resource]], %{{.*}}, %{{.*}}, %{{.*}} : i32
37-
// RDNA: return %[[ret]]
36+
// CHECK: %[[resource:.*]] = rocdl.make.buffer.rsrc %{{.*}}, %{{.*}}, %{{.*}}, %[[flags]]
37+
// CHECK: %[[ret:.*]] = rocdl.raw.ptr.buffer.load %[[resource]], %{{.*}}, %{{.*}}, %{{.*}} : i32
38+
// CHECK: return %[[ret]]
3839
%0 = amdgpu.raw_buffer_load {boundsCheck = false} %buf[%idx] : memref<64xi32>, i32 -> i32
3940
func.return %0 : i32
4041
}
@@ -103,7 +104,8 @@ func.func @gpu_gcn_raw_buffer_load_4xf8E4M3FNUZ(%buf: memref<64xf8E4M3FNUZ>, %id
103104
// Since the lowering logic is shared with loads, only bitcasts need to be rechecked
104105
// CHECK-LABEL: func @gpu_gcn_raw_buffer_store_scalar_i32
105106
func.func @gpu_gcn_raw_buffer_store_scalar_i32(%value: i32, %buf: memref<i32>) {
106-
// CHECK: %[[flags:.*]] = llvm.mlir.constant(159744 : i32)
107+
// GFX9: %[[flags:.*]] = llvm.mlir.constant(159744 : i32)
108+
// RDNA: %[[flags:.*]] = llvm.mlir.constant(822243328 : i32)
107109
// CHECK: %[[resource:.*]] = rocdl.make.buffer.rsrc %{{.*}}, %{{.*}}, %{{.*}}, %[[flags]]
108110
// CHECK: rocdl.raw.ptr.buffer.store %{{.*}}, %[[resource]], %{{.*}}, %{{.*}}, %{{.*}} : i32
109111
amdgpu.raw_buffer_store {boundsCheck = true} %value -> %buf[] : i32 -> memref<i32>
@@ -113,7 +115,8 @@ func.func @gpu_gcn_raw_buffer_store_scalar_i32(%value: i32, %buf: memref<i32>) {
113115
// CHECK-LABEL: func @gpu_gcn_raw_buffer_store_i32
114116
func.func @gpu_gcn_raw_buffer_store_i32(%value: i32, %buf: memref<64xi32>, %idx: i32) {
115117
// CHECK: %[[numRecords:.*]] = llvm.mlir.constant(256 : i32)
116-
// CHECK: %[[flags:.*]] = llvm.mlir.constant(159744 : i32)
118+
// GFX9: %[[flags:.*]] = llvm.mlir.constant(159744 : i32)
119+
// RDNA: %[[flags:.*]] = llvm.mlir.constant(822243328 : i32)
117120
// CHECK: %[[resource:.*]] = rocdl.make.buffer.rsrc %{{.*}}, %{{.*}}, %[[numRecords]], %[[flags]]
118121
// CHECK: rocdl.raw.ptr.buffer.store %{{.*}}, %[[resource]], %{{.*}}, %{{.*}}, %{{.*}} : i32
119122
amdgpu.raw_buffer_store {boundsCheck = true} %value -> %buf[%idx] : i32 -> memref<64xi32>, i32
@@ -140,7 +143,8 @@ func.func @gpu_gcn_raw_buffer_store_16xi8(%value: vector<16xi8>, %buf: memref<64
140143
// CHECK-LABEL: func @gpu_gcn_raw_buffer_atomic_fadd_f32
141144
func.func @gpu_gcn_raw_buffer_atomic_fadd_f32(%value: f32, %buf: memref<64xf32>, %idx: i32) {
142145
// CHECK: %[[numRecords:.*]] = llvm.mlir.constant(256 : i32)
143-
// CHECK: %[[flags:.*]] = llvm.mlir.constant(159744 : i32)
146+
// GFX9: %[[flags:.*]] = llvm.mlir.constant(159744 : i32)
147+
// RDNA: %[[flags:.*]] = llvm.mlir.constant(822243328 : i32)
144148
// CHECK: %[[resource:.*]] = rocdl.make.buffer.rsrc %{{.*}}, %{{.*}}, %[[numRecords]], %[[flags]]
145149
// CHECK: rocdl.raw.ptr.buffer.atomic.fadd %{{.*}}, %[[resource]], %{{.*}}, %{{.*}}, %{{.*}} : f32
146150
amdgpu.raw_buffer_atomic_fadd {boundsCheck = true} %value -> %buf[%idx] : f32 -> memref<64xf32>, i32
@@ -150,7 +154,8 @@ func.func @gpu_gcn_raw_buffer_atomic_fadd_f32(%value: f32, %buf: memref<64xf32>,
150154
// CHECK-LABEL: func @gpu_gcn_raw_buffer_atomic_fmax_f32
151155
func.func @gpu_gcn_raw_buffer_atomic_fmax_f32(%value: f32, %buf: memref<64xf32>, %idx: i32) {
152156
// CHECK: %[[numRecords:.*]] = llvm.mlir.constant(256 : i32)
153-
// CHECK: %[[flags:.*]] = llvm.mlir.constant(159744 : i32)
157+
// GFX9: %[[flags:.*]] = llvm.mlir.constant(159744 : i32)
158+
// RDNA: %[[flags:.*]] = llvm.mlir.constant(822243328 : i32)
154159
// CHECK: %[[resource:.*]] = rocdl.make.buffer.rsrc %{{.*}}, %{{.*}}, %[[numRecords]], %[[flags]]
155160
// CHECK: rocdl.raw.ptr.buffer.atomic.fmax %{{.*}}, %[[resource]], %{{.*}}, %{{.*}}, %{{.*}} : f32
156161
amdgpu.raw_buffer_atomic_fmax {boundsCheck = true} %value -> %buf[%idx] : f32 -> memref<64xf32>, i32
@@ -160,7 +165,8 @@ func.func @gpu_gcn_raw_buffer_atomic_fmax_f32(%value: f32, %buf: memref<64xf32>,
160165
// CHECK-LABEL: func @gpu_gcn_raw_buffer_atomic_smax_i32
161166
func.func @gpu_gcn_raw_buffer_atomic_smax_i32(%value: i32, %buf: memref<64xi32>, %idx: i32) {
162167
// CHECK: %[[numRecords:.*]] = llvm.mlir.constant(256 : i32)
163-
// CHECK: %[[flags:.*]] = llvm.mlir.constant(159744 : i32)
168+
// GFX9: %[[flags:.*]] = llvm.mlir.constant(159744 : i32)
169+
// RDNA: %[[flags:.*]] = llvm.mlir.constant(822243328 : i32)
164170
// CHECK: %[[resource:.*]] = rocdl.make.buffer.rsrc %{{.*}}, %{{.*}}, %[[numRecords]], %[[flags]]
165171
// CHECK: rocdl.raw.ptr.buffer.atomic.smax %{{.*}} %[[resource]], %{{.*}}, %{{.*}}, %{{.*}} : i32
166172
amdgpu.raw_buffer_atomic_smax {boundsCheck = true} %value -> %buf[%idx] : i32 -> memref<64xi32>, i32
@@ -170,7 +176,8 @@ func.func @gpu_gcn_raw_buffer_atomic_smax_i32(%value: i32, %buf: memref<64xi32>,
170176
// CHECK-LABEL: func @gpu_gcn_raw_buffer_atomic_umin_i32
171177
func.func @gpu_gcn_raw_buffer_atomic_umin_i32(%value: i32, %buf: memref<64xi32>, %idx: i32) {
172178
// CHECK: %[[numRecords:.*]] = llvm.mlir.constant(256 : i32)
173-
// CHECK: %[[flags:.*]] = llvm.mlir.constant(159744 : i32)
179+
// GFX9: %[[flags:.*]] = llvm.mlir.constant(159744 : i32)
180+
// RDNA: %[[flags:.*]] = llvm.mlir.constant(822243328 : i32)
174181
// CHECK: %[[resource:.*]] = rocdl.make.buffer.rsrc %{{.*}}, %{{.*}}, %[[numRecords]], %[[flags]]
175182
// CHECK: rocdl.raw.ptr.buffer.atomic.umin %{{.*}} %[[resource]], %{{.*}}, %{{.*}}, %{{.*}} : i32
176183
amdgpu.raw_buffer_atomic_umin {boundsCheck = true} %value -> %buf[%idx] : i32 -> memref<64xi32>, i32
@@ -183,7 +190,8 @@ func.func @amdgpu_raw_buffer_atomic_cmpswap_f32(%src : f32, %cmp : f32, %buf : m
183190
// CHECK: %[[srcCast:.*]] = llvm.bitcast %[[src]] : f32 to i32
184191
// CHECK: %[[cmpCast:.*]] = llvm.bitcast %[[cmp]] : f32 to i32
185192
// CHECK: %[[numRecords:.*]] = llvm.mlir.constant(256 : i32)
186-
// CHECK: %[[flags:.*]] = llvm.mlir.constant(159744 : i32)
193+
// GFX9: %[[flags:.*]] = llvm.mlir.constant(159744 : i32)
194+
// RDNA: %[[flags:.*]] = llvm.mlir.constant(822243328 : i32)
187195
// CHECK: %[[resource:.*]] = rocdl.make.buffer.rsrc %{{.*}}, %{{.*}}, %[[numRecords]], %[[flags]]
188196
// CHECK: %[[dst:.*]] = rocdl.raw.ptr.buffer.atomic.cmpswap %[[srcCast]], %[[cmpCast]], %[[resource]], %{{.*}}, %{{.*}}, %{{.*}} : i32
189197
// CHECK: %[[dstCast:.*]] = llvm.bitcast %[[dst]] : i32 to f32
@@ -196,7 +204,8 @@ func.func @amdgpu_raw_buffer_atomic_cmpswap_f32(%src : f32, %cmp : f32, %buf : m
196204
// CHECK-SAME: (%[[src:.*]]: i64, %[[cmp:.*]]: i64, {{.*}})
197205
func.func @amdgpu_raw_buffer_atomic_cmpswap_i64(%src : i64, %cmp : i64, %buf : memref<64xi64>, %idx: i32) -> i64 {
198206
// CHECK: %[[numRecords:.*]] = llvm.mlir.constant(512 : i32)
199-
// CHECK: %[[flags:.*]] = llvm.mlir.constant(159744 : i32)
207+
// GFX9: %[[flags:.*]] = llvm.mlir.constant(159744 : i32)
208+
// RDNA: %[[flags:.*]] = llvm.mlir.constant(822243328 : i32)
200209
// CHECK: %[[resource:.*]] = rocdl.make.buffer.rsrc %{{.*}}, %{{.*}}, %[[numRecords]], %[[flags]]
201210
// CHECK: %[[dst:.*]] = rocdl.raw.ptr.buffer.atomic.cmpswap %[[src]], %[[cmp]], %[[resource]], %{{.*}}, %{{.*}}, %{{.*}} : i64
202211
// CHECK: return %[[dst]]
@@ -206,7 +215,14 @@ func.func @amdgpu_raw_buffer_atomic_cmpswap_i64(%src : i64, %cmp : i64, %buf : m
206215

207216
// CHECK-LABEL: func @lds_barrier
208217
func.func @lds_barrier() {
209-
// CHECK: llvm.inline_asm has_side_effects asm_dialect = att "s_waitcnt lgkmcnt(0)\0As_barrier"
218+
// GFX908: llvm.inline_asm has_side_effects asm_dialect = att
219+
// GFX908-SAME: ";;;WARNING: BREAKS DEBUG WATCHES\0As_waitcnt lgkmcnt(0)\0As_barrier"
220+
// GFX90A: rocdl.waitcnt -7937
221+
// GFX90A-NEXT: rocdl.s.barrier
222+
// GFX10: rocdl.waitcnt -16129
223+
// GFX10-NEXT: rocdl.s.barrier
224+
// GFX11: llvm.inline_asm has_side_effects asm_dialect = att
225+
// GFX11-SAME: ";;;WARNING: BREAKS DEBUG WATCHES\0As_waitcnt lgkmcnt(0)\0As_barrier"
210226
amdgpu.lds_barrier
211227
func.return
212228
}

mlir/test/Dialect/LLVMIR/rocdl.mlir

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -363,6 +363,19 @@ llvm.func @rocdl_8bit_floats(%source: i32, %stoch: i32) -> i32 {
363363
llvm.return %source5 : i32
364364
}
365365

366+
llvm.func @rocdl.waitcnt() {
367+
// CHECK-LABEL: rocdl.waitcnt
368+
// CHECK: rocdl.waitcnt 0
369+
rocdl.waitcnt 0
370+
llvm.return
371+
}
372+
373+
llvm.func @rocdl.s.barrier() {
374+
// CHECK-LABEL: rocdl.s.barrier
375+
// CHECK: rocdl.s.barrier
376+
rocdl.s.barrier
377+
llvm.return
378+
}
366379
// -----
367380

368381
// expected-error@below {{attribute attached to unexpected op}}

mlir/test/Target/LLVMIR/rocdl.mlir

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -88,7 +88,23 @@ llvm.func @rocdl.bpermute(%src : i32) -> i32 {
8888
llvm.return %0 : i32
8989
}
9090

91+
llvm.func @rocdl.waitcnt() {
92+
// CHECK-LABEL: rocdl.waitcnt
93+
// CHECK-NEXT: call void @llvm.amdgcn.s.waitcnt(i32 0)
94+
rocdl.waitcnt 0
95+
llvm.return
96+
}
97+
98+
llvm.func @rocdl.s.barrier() {
99+
// CHECK-LABEL: rocdl.s.barrier
100+
// CHECK-NEXT: call void @llvm.amdgcn.s.barrier()
101+
rocdl.s.barrier
102+
llvm.return
103+
}
104+
105+
91106
llvm.func @rocdl.barrier() {
107+
// CHECK-LABEL: rocdl.barrier
92108
// CHECK: fence syncscope("workgroup") release
93109
// CHECK-NEXT: call void @llvm.amdgcn.s.barrier()
94110
// CHECK-NEXT: fence syncscope("workgroup") acquire

0 commit comments

Comments
 (0)