@@ -462,6 +462,143 @@ to left-shift the found bit into the most-significant bit position, otherwise
462
462
the result is the shift amount needed to right-shift the found bit into the
463
463
least-significant bit position. 0xffffffff is returned if no 1 bit is found.
464
464
465
+ TMA family of Intrinsics
466
+ ------------------------
467
+
468
+ '``llvm.nvvm.cp.async.bulk.tensor.g2s.tile.[1-5]d ``'
469
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
470
+
471
+ Syntax:
472
+ """""""
473
+
474
+ .. code-block :: llvm
475
+
476
+ declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.1d(ptr addrspace(3) %dst, ptr addrspace(3) %bar, ptr %tensor_map, i32 %d0, i16 %mc, i64 %ch, i1 %flag_mc, i1 %flag_ch)
477
+ declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.2d(..., i32 %d0, i32 %d1, ...)
478
+ declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.3d(..., i32 %d0, i32 %d1, i32 %d2, ...)
479
+ declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.4d(..., i32 %d0, i32 %d1, i32 %d2, i32 %d3, ...)
480
+ declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.5d(..., i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, ...)
481
+
482
+ Overview:
483
+ """""""""
484
+
485
+ The '``@llvm.nvvm.cp.async.bulk.tensor.g2s.tile.[1-5]d ``' intrinsics
486
+ correspond to the ``cp.async.bulk.tensor.[1-5]d.* `` set of PTX instructions.
487
+ These instructions initiate an asynchronous copy of tensor data from
488
+ global memory to shared::cluster memory (indicated by the ``g2s `` prefix)
489
+ in ``tile `` mode. In tile mode, the multi-dimensional layout of the
490
+ source tensor is preserved at the destination. The dimension of the
491
+ tensor data ranges from 1d to 5d with the coordinates specified
492
+ by the ``i32 %d0 ... i32 %d4 `` arguments.
493
+
494
+ * The last two arguments to these intrinsics are boolean flags
495
+ indicating support for cache_hint and/or multicast modifiers.
496
+ These flag arguments must be compile-time constants. The backend
497
+ looks through these flags and lowers the intrinsics appropriately.
498
+
499
+ * The Nth argument (denoted by ``i1 flag_ch ``) when set, indicates
500
+ a valid cache_hint (``i64 %ch ``) and generates the ``.L2::cache_hint ``
501
+ variant of the PTX instruction.
502
+
503
+ * The [N-1]th argument (denoted by ``i1 flag_mc ``) when set, indicates
504
+ the presence of a multicast mask (``i16 %mc ``) and generates the PTX
505
+ instruction with the ``.multicast::cluster `` modifier.
506
+
507
+ For more information, refer PTX ISA
508
+ `<https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-tensor >`_.
509
+
510
+ '``llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.[3-5]d ``'
511
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
512
+
513
+ Syntax:
514
+ """""""
515
+
516
+ .. code-block :: llvm
517
+
518
+ declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.3d(ptr addrspace(3) %dst, ptr addrspace(3) %bar, ptr %tensor_map, i32 %d0, i32 %d1, i32 %d2, i16 %im2col0, i16 %mc, i64 %ch, i1 %flag_mc, i1 %flag_ch)
519
+ declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.4d(..., i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %im2col0, i16 %im2col1, ...)
520
+ declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.5d(..., i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %im2col0, i16 %im2col1, i16 %im2col2, ...)
521
+
522
+ Overview:
523
+ """""""""
524
+
525
+ The '``@llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.[3-5]d ``' intrinsics
526
+ correspond to the ``cp.async.bulk.tensor.[1-5]d.* `` set of PTX instructions.
527
+ These instructions initiate an asynchronous copy of tensor data from
528
+ global memory to shared::cluster memory (indicated by the ``g2s `` prefix)
529
+ in ``im2col `` mode. In im2col mode, some dimensions of the source tensor
530
+ are unrolled into a single dimensional column at the destination. In this
531
+ mode, the tensor has to be at least three-dimensional. Along with the tensor
532
+ coordinates, im2col offsets are also specified (denoted by
533
+ ``i16 im2col0...i16 %im2col2 ``). The number of im2col offsets is two less
534
+ than the number of dimensions of the tensor operation. The last two arguments
535
+ to these intrinsics are boolean flags, with the same functionality as described
536
+ in the ``tile `` mode intrinsics above.
537
+
538
+ For more information, refer PTX ISA
539
+ `<https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-tensor >`_.
540
+
541
+ '``llvm.nvvm.cp.async.bulk.tensor.s2g.tile.[1-5]d ``'
542
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
543
+
544
+ Syntax:
545
+ """""""
546
+
547
+ .. code-block :: llvm
548
+
549
+ declare void @llvm.nvvm.cp.async.bulk.tensor.s2g.tile.1d(ptr addrspace(3) %src, ptr %tensor_map, i32 %d0, i64 %ch, i1 %flag_ch)
550
+ declare void @llvm.nvvm.cp.async.bulk.tensor.s2g.tile.2d(..., i32 %d0, i32 %d1, ...)
551
+ declare void @llvm.nvvm.cp.async.bulk.tensor.s2g.tile.3d(..., i32 %d0, i32 %d1, i32 %d2, ...)
552
+ declare void @llvm.nvvm.cp.async.bulk.tensor.s2g.tile.4d(..., i32 %d0, i32 %d1, i32 %d2, i32 %d3, ...)
553
+ declare void @llvm.nvvm.cp.async.bulk.tensor.s2g.tile.5d(..., i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, ...)
554
+
555
+ Overview:
556
+ """""""""
557
+
558
+ The '``@llvm.nvvm.cp.async.bulk.tensor.s2g.tile.[1-5]d ``' intrinsics
559
+ correspond to the ``cp.async.bulk.tensor.[1-5]d.* `` set of PTX instructions.
560
+ These instructions initiate an asynchronous copy of tensor data from
561
+ shared::cta to global memory (indicated by the ``s2g `` prefix)
562
+ in ``tile `` mode. The dimension of the tensor data ranges from 1d to 5d
563
+ with the coordinates specified by the ``i32 %d0 ... i32 %d4 `` arguments.
564
+
565
+ * The last argument to these intrinsics is a boolean flag
566
+ indicating support for cache_hint. This flag argument must
567
+ be a compile-time constant. When set, it indicates a valid
568
+ cache_hint (``i64 %ch ``) and generates the ``.L2::cache_hint ``
569
+ variant of the PTX instruction.
570
+
571
+ For more information, refer PTX ISA
572
+ `<https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-tensor >`_.
573
+
574
+ '``llvm.nvvm.cp.async.bulk.tensor.s2g.im2col.[3-5]d ``'
575
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
576
+
577
+ Syntax:
578
+ """""""
579
+
580
+ .. code-block :: llvm
581
+
582
+ declare void @llvm.nvvm.cp.async.bulk.tensor.s2g.im2col.3d(ptr addrspace(3) %src, ptr %tensor_map, i32 %d0, i32 %d1, i32 %d2, i64 %ch, i1 %flag_ch)
583
+ declare void @llvm.nvvm.cp.async.bulk.tensor.s2g.im2col.4d(..., i32 %d0, i32 %d1, i32 %d2, i32 %d3, ...)
584
+ declare void @llvm.nvvm.cp.async.bulk.tensor.s2g.im2col.5d(..., i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, ...)
585
+
586
+ Overview:
587
+ """""""""
588
+
589
+ The '``@llvm.nvvm.cp.async.bulk.tensor.s2g.im2col.[1-5]d ``' intrinsics
590
+ correspond to the ``cp.async.bulk.tensor.[1-5]d.* `` set of PTX instructions.
591
+ These instructions initiate an asynchronous copy of tensor data from
592
+ shared::cta to global memory (indicated by the ``s2g `` prefix)
593
+ in ``im2col `` mode. In this mode, the tensor has to be at least
594
+ three-dimensional. Unlike the ``g2s `` variants, there are no
595
+ im2col_offsets for these intrinsics. The last argument to these
596
+ intrinsics is a boolean flag, with the same functionality as
597
+ described in the ``s2g.tile `` mode intrinsics above.
598
+
599
+ For more information, refer PTX ISA
600
+ `<https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-tensor >`_.
601
+
465
602
Other Intrinsics
466
603
----------------
467
604
0 commit comments