@@ -378,22 +378,74 @@ def NVVM_Barrier0Op : NVVM_Op<"barrier0"> {
378
378
}
379
379
380
380
def NVVM_ClusterArriveOp : NVVM_Op<"cluster.arrive"> {
381
+ let arguments = (ins OptionalAttr<UnitAttr>:$aligned);
382
+
383
+ let summary = "Cluster Barrier Arrive Op";
384
+ let description = [{
385
+ The `cluster.arrive` can be used by the threads within the cluster for synchronization and
386
+ communication. The `cluster.arrive` instruction marks the warps' arrival at the barrier
387
+ without causing the executing thread to wait for other participating threads.
388
+
389
+ The `aligned` attribute, when provided, generates the .aligned version of the PTX instruction.
390
+
391
+ [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-barrier-cluster)
393
+ }];
394
+
+ // Lowering: emit the `_aligned` intrinsic only when the `aligned` unit attribute is present.
381
395
string llvmBuilder = [{
382
- createIntrinsicCall(builder, llvm::Intrinsic::nvvm_barrier_cluster_arrive);
396
+ if ($aligned)
397
+ createIntrinsicCall(builder, llvm::Intrinsic::nvvm_barrier_cluster_arrive_aligned);
398
+ else
399
+ createIntrinsicCall(builder, llvm::Intrinsic::nvvm_barrier_cluster_arrive);
383
400
}];
384
401
let assemblyFormat = "attr-dict";
385
402
}
386
403
387
404
def NVVM_ClusterArriveRelaxedOp : NVVM_Op<"cluster.arrive.relaxed"> {
405
+ let arguments = (ins OptionalAttr<UnitAttr>:$aligned);
406
+
407
+ let summary = "Cluster Barrier Relaxed Arrive Op";
408
+ let description = [{
409
+ The `cluster.arrive` can be used by the threads within the cluster for synchronization and
410
+ communication. The `cluster.arrive` instruction marks the warps' arrival at the barrier
411
+ without causing the executing thread to wait for other participating threads.
412
+
413
+ The `aligned` attribute, when provided, generates the .aligned version of the PTX instruction.
414
+ The .relaxed qualifier on `cluster.arrive` specifies that there are no memory
415
+ ordering and visibility guarantees provided for the memory accesses performed prior to
416
+ `cluster.arrive`.
417
+
418
+ [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-barrier-cluster)
420
+ }];
421
+
+ // Lowering: emit the `_relaxed_aligned` intrinsic only when the `aligned` unit attribute is present.
388
422
string llvmBuilder = [{
389
- createIntrinsicCall(builder, llvm::Intrinsic::nvvm_barrier_cluster_arrive_relaxed);
423
+ if ($aligned)
424
+ createIntrinsicCall(builder, llvm::Intrinsic::nvvm_barrier_cluster_arrive_relaxed_aligned);
425
+ else
426
+ createIntrinsicCall(builder, llvm::Intrinsic::nvvm_barrier_cluster_arrive_relaxed);
390
427
}];
391
428
let assemblyFormat = "attr-dict";
392
429
}
393
430
394
431
def NVVM_ClusterWaitOp : NVVM_Op<"cluster.wait"> {
432
+ let arguments = (ins OptionalAttr<UnitAttr>:$aligned);
433
+
434
+ let summary = "Cluster Barrier Wait Op";
435
+ let description = [{
436
+ The `cluster.wait` causes the executing thread to wait for all non-exited threads
437
+ of the cluster to perform `cluster.arrive`. The `aligned` attribute, when provided,
438
+ generates the .aligned version of the PTX instruction.
439
+
440
+ [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-barrier-cluster)
442
+ }];
443
+
+ // Lowering: emit the `_aligned` intrinsic only when the `aligned` unit attribute is present.
395
444
string llvmBuilder = [{
396
- createIntrinsicCall(builder, llvm::Intrinsic::nvvm_barrier_cluster_wait);
445
+ if ($aligned)
446
+ createIntrinsicCall(builder, llvm::Intrinsic::nvvm_barrier_cluster_wait_aligned);
447
+ else
448
+ createIntrinsicCall(builder, llvm::Intrinsic::nvvm_barrier_cluster_wait);
397
449
}];
398
450
let assemblyFormat = "attr-dict";
399
451
}
0 commit comments