Skip to content

Commit f86b8ba

Browse files
committed
address comments
1 parent 778fdb1 commit f86b8ba

File tree

4 files changed

+124
-81
lines changed

4 files changed

+124
-81
lines changed

llvm/docs/NVPTXUsage.rst

Lines changed: 66 additions & 66 deletions
Original file line numberDiff line numberDiff line change
@@ -632,7 +632,7 @@ Syntax:
632632

633633
.. code-block:: llvm
634634
635-
declare i32 @llvm.nvvm.prmt(i32 %a, i32 %b, i32 %c)
635+
declare i32 @llvm.nvvm.prmt(i32 %lo, i32 %hi, i32 %selector)
636636
637637
Overview:
638638
"""""""""
@@ -644,7 +644,7 @@ Semantics:
644644
""""""""""
645645

646646
The bytes in the first two source operands are numbered from 0 to 7:
647-
{%b, %a} = {{b7, b6, b5, b4}, {b3, b2, b1, b0}}. For each byte in the target
647+
{%hi, %lo} = {{b7, b6, b5, b4}, {b3, b2, b1, b0}}. For each byte in the target
648648
register, a 4-bit selection value is defined.
649649

650650
The 3 lsbs of the selection value specify which of the 8 source bytes should be
@@ -653,7 +653,7 @@ copied, or if the sign (msb of the byte) should be replicated over all 8 bits
653653
of the target position (sign extend of the byte value); msb=0 means copy the
654654
literal value; msb=1 means replicate the sign.
655655

656-
These 4-bit selection values are pulled from the lower 16-bits of the third
656+
These 4-bit selection values are pulled from the lower 16 bits of the %selector
657657
operand, with the least significant selection value corresponding to the least
658658
significant byte of the destination.
659659

@@ -666,13 +666,13 @@ Syntax:
666666

667667
.. code-block:: llvm
668668
669-
declare i32 @llvm.nvvm.prmt.f4e(i32 %a, i32 %b, i32 %c)
670-
declare i32 @llvm.nvvm.prmt.b4e(i32 %a, i32 %b, i32 %c)
669+
declare i32 @llvm.nvvm.prmt.f4e(i32 %lo, i32 %hi, i32 %selector)
670+
declare i32 @llvm.nvvm.prmt.b4e(i32 %lo, i32 %hi, i32 %selector)
671671
672-
declare i32 @llvm.nvvm.prmt.rc8(i32 %a, i32 %c)
673-
declare i32 @llvm.nvvm.prmt.ecl(i32 %a, i32 %c)
674-
declare i32 @llvm.nvvm.prmt.ecr(i32 %a, i32 %c)
675-
declare i32 @llvm.nvvm.prmt.rc16(i32 %a, i32 %c)
672+
declare i32 @llvm.nvvm.prmt.rc8(i32 %lo, i32 %selector)
673+
declare i32 @llvm.nvvm.prmt.ecl(i32 %lo, i32 %selector)
674+
declare i32 @llvm.nvvm.prmt.ecr(i32 %lo, i32 %selector)
675+
declare i32 @llvm.nvvm.prmt.rc16(i32 %lo, i32 %selector)
676676
677677
Overview:
678678
"""""""""
@@ -685,64 +685,64 @@ Semantics:
685685
""""""""""
686686

687687
As with the generic '``llvm.nvvm.prmt``' intrinsic, the bytes in the first one
688-
or two source operands are numbered. The first source operand (%a) is numbered
688+
or two source operands are numbered. The first source operand (%lo) is numbered
689689
{b3, b2, b1, b0}. In the case of the '``f4e``' and '``b4e``' variants, the
690-
second source operand (%b) is numbered {b7, b6, b5, b4}.
691-
692-
Depending on the 2 least significant bits of the final operand, the result of
693-
the permutation is defined as follows:
694-
695-
+------------+---------+--------------+
696-
| Mode | %c[1:0] | Output |
697-
+------------+---------+--------------+
698-
| '``f4e``' | 0 | {3, 2, 1, 0} |
699-
| +---------+--------------+
700-
| | 1 | {4, 3, 2, 1} |
701-
| +---------+--------------+
702-
| | 2 | {5, 4, 3, 2} |
703-
| +---------+--------------+
704-
| | 3 | {6, 5, 4, 3} |
705-
+------------+---------+--------------+
706-
| '``b4e``' | 0 | {5, 6, 7, 0} |
707-
| +---------+--------------+
708-
| | 1 | {6, 7, 0, 1} |
709-
| +---------+--------------+
710-
| | 2 | {7, 0, 1, 2} |
711-
| +---------+--------------+
712-
| | 3 | {0, 1, 2, 3} |
713-
+------------+---------+--------------+
714-
| '``rc8``' | 0 | {0, 0, 0, 0} |
715-
| +---------+--------------+
716-
| | 1 | {1, 1, 1, 1} |
717-
| +---------+--------------+
718-
| | 2 | {2, 2, 2, 2} |
719-
| +---------+--------------+
720-
| | 3 | {3, 3, 3, 3} |
721-
+------------+---------+--------------+
722-
| '``ecl``' | 0 | {3, 2, 1, 0} |
723-
| +---------+--------------+
724-
| | 1 | {3, 2, 1, 1} |
725-
| +---------+--------------+
726-
| | 2 | {3, 2, 2, 2} |
727-
| +---------+--------------+
728-
| | 3 | {3, 3, 3, 3} |
729-
+------------+---------+--------------+
730-
| '``ecr``' | 0 | {0, 0, 0, 0} |
731-
| +---------+--------------+
732-
| | 1 | {1, 1, 1, 0} |
733-
| +---------+--------------+
734-
| | 2 | {2, 2, 1, 0} |
735-
| +---------+--------------+
736-
| | 3 | {3, 2, 1, 0} |
737-
+------------+---------+--------------+
738-
| '``rc16``' | 0 | {1, 0, 1, 0} |
739-
| +---------+--------------+
740-
| | 1 | {3, 2, 3, 2} |
741-
| +---------+--------------+
742-
| | 2 | {1, 0, 1, 0} |
743-
| +---------+--------------+
744-
| | 3 | {3, 2, 3, 2} |
745-
+------------+---------+--------------+
690+
second source operand (%hi) is numbered {b7, b6, b5, b4}.
691+
692+
Depending on the 2 least significant bits of the %selector operand, the result
693+
of the permutation is defined as follows:
694+
695+
+------------+----------------+--------------+
696+
| Mode | %selector[1:0] | Output |
697+
+------------+----------------+--------------+
698+
| '``f4e``' | 0 | {3, 2, 1, 0} |
699+
| +----------------+--------------+
700+
| | 1 | {4, 3, 2, 1} |
701+
| +----------------+--------------+
702+
| | 2 | {5, 4, 3, 2} |
703+
| +----------------+--------------+
704+
| | 3 | {6, 5, 4, 3} |
705+
+------------+----------------+--------------+
706+
| '``b4e``' | 0 | {5, 6, 7, 0} |
707+
| +----------------+--------------+
708+
| | 1 | {6, 7, 0, 1} |
709+
| +----------------+--------------+
710+
| | 2 | {7, 0, 1, 2} |
711+
| +----------------+--------------+
712+
| | 3 | {0, 1, 2, 3} |
713+
+------------+----------------+--------------+
714+
| '``rc8``' | 0 | {0, 0, 0, 0} |
715+
| +----------------+--------------+
716+
| | 1 | {1, 1, 1, 1} |
717+
| +----------------+--------------+
718+
| | 2 | {2, 2, 2, 2} |
719+
| +----------------+--------------+
720+
| | 3 | {3, 3, 3, 3} |
721+
+------------+----------------+--------------+
722+
| '``ecl``' | 0 | {3, 2, 1, 0} |
723+
| +----------------+--------------+
724+
| | 1 | {3, 2, 1, 1} |
725+
| +----------------+--------------+
726+
| | 2 | {3, 2, 2, 2} |
727+
| +----------------+--------------+
728+
| | 3 | {3, 3, 3, 3} |
729+
+------------+----------------+--------------+
730+
| '``ecr``' | 0 | {0, 0, 0, 0} |
731+
| +----------------+--------------+
732+
| | 1 | {1, 1, 1, 0} |
733+
| +----------------+--------------+
734+
| | 2 | {2, 2, 1, 0} |
735+
| +----------------+--------------+
736+
| | 3 | {3, 2, 1, 0} |
737+
+------------+----------------+--------------+
738+
| '``rc16``' | 0 | {1, 0, 1, 0} |
739+
| +----------------+--------------+
740+
| | 1 | {3, 2, 3, 2} |
741+
| +----------------+--------------+
742+
| | 2 | {1, 0, 1, 0} |
743+
| +----------------+--------------+
744+
| | 3 | {3, 2, 3, 2} |
745+
+------------+----------------+--------------+
746746

747747
TMA family of Intrinsics
748748
------------------------

llvm/include/llvm/IR/IntrinsicsNVVM.td

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -751,7 +751,8 @@ let TargetPrefix = "nvvm" in {
751751
DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty]>;
752752

753753
// Note: these variants also have 2 source operands but only one will ever
754-
// be used so we eliminate the other operand in the IR.
754+
// be used so we eliminate the other operand in the IR (0 is used as the
755+
// placeholder in the backend).
755756
foreach mode = ["rc8", "ecl", "ecr", "rc16"] in
756757
def int_nvvm_prmt_ # mode :
757758
DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty]>;

llvm/lib/Target/NVPTX/NVPTXInstrInfo.td

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -238,6 +238,48 @@ def F16X2RT : RegTyInfo<v2f16, Int32Regs, ?, ?, supports_imm = 0>;
238238
def BF16X2RT : RegTyInfo<v2bf16, Int32Regs, ?, ?, supports_imm = 0>;
239239

240240

241+
// This class provides a basic wrapper around an NVPTXInst that abstracts the
242+
// specific syntax of most PTX instructions. It automatically handles the
243+
// construction of the asm string based on the provided dag arguments.
244+
// For example, the following asm-strings would be computed:
245+
//
246+
// * BasicFlagsNVPTXInst<(outs Int32Regs:$dst),
247+
// (ins Int32Regs:$a, Int32Regs:$b), (ins),
248+
// "add.s32">;
249+
// ---> "add.s32 \t$dst, $a, $b;"
250+
//
251+
// * BasicFlagsNVPTXInst<(outs Int32Regs:$d),
252+
// (ins Int32Regs:$a, Int32Regs:$b, Hexu32imm:$c),
253+
// (ins PrmtMode:$mode),
254+
// "prmt.b32${mode}">;
255+
// ---> "prmt.b32${mode} \t$d, $a, $b, $c;"
256+
//
257+
class BasicFlagsNVPTXInst<dag outs_dag, dag ins_dag, dag flags_dag, string asmstr,
258+
list<dag> pattern = []>
259+
: NVPTXInst<
260+
outs_dag,
261+
!con(ins_dag, flags_dag),
262+
!strconcat(
263+
asmstr,
264+
!if(!and(!empty(ins_dag), !empty(outs_dag)), "",
265+
!strconcat(
266+
" \t",
267+
!interleave(
268+
!foreach(i, !range(!size(outs_dag)),
269+
"$" # !getdagname(outs_dag, i)),
270+
"|"),
271+
!if(!or(!empty(ins_dag), !empty(outs_dag)), "", ", "),
272+
!interleave(
273+
!foreach(i, !range(!size(ins_dag)),
274+
"$" # !getdagname(ins_dag, i)),
275+
", "))),
276+
";"),
277+
pattern>;
278+
279+
class BasicNVPTXInst<dag outs, dag insv, string asmstr, list<dag> pattern = []>
280+
: BasicFlagsNVPTXInst<outs, insv, (ins), asmstr, pattern>;
281+
282+
241283
multiclass I3Inst<string op_str, SDPatternOperator op_node, RegTyInfo t,
242284
bit commutative, list<Predicate> requires = []> {
243285
defvar asmstr = op_str # " \t$dst, $a, $b;";

llvm/test/CodeGen/NVPTX/prmt.ll

Lines changed: 14 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44

55
target triple = "nvptx64-nvidia-cuda"
66

7-
define i32 @test_prmt_basic(i32 %a, i32 %b, i32 %c) {
7+
define i32 @test_prmt_basic(i32 %lo, i32 %hi, i32 %selector) {
88
; CHECK-LABEL: test_prmt_basic(
99
; CHECK: {
1010
; CHECK-NEXT: .reg .b32 %r<5>;
@@ -16,11 +16,11 @@ define i32 @test_prmt_basic(i32 %a, i32 %b, i32 %c) {
1616
; CHECK-NEXT: prmt.b32 %r4, %r1, %r2, %r3;
1717
; CHECK-NEXT: st.param.b32 [func_retval0], %r4;
1818
; CHECK-NEXT: ret;
19-
%val = call i32 @llvm.nvvm.prmt(i32 %a, i32 %b, i32 %c)
19+
%val = call i32 @llvm.nvvm.prmt(i32 %lo, i32 %hi, i32 %selector)
2020
ret i32 %val
2121
}
2222

23-
define i32 @test_prmt_f4e(i32 %a, i32 %b, i32 %c) {
23+
define i32 @test_prmt_f4e(i32 %lo, i32 %hi, i32 %selector) {
2424
; CHECK-LABEL: test_prmt_f4e(
2525
; CHECK: {
2626
; CHECK-NEXT: .reg .b32 %r<5>;
@@ -32,11 +32,11 @@ define i32 @test_prmt_f4e(i32 %a, i32 %b, i32 %c) {
3232
; CHECK-NEXT: prmt.b32.f4e %r4, %r1, %r2, %r3;
3333
; CHECK-NEXT: st.param.b32 [func_retval0], %r4;
3434
; CHECK-NEXT: ret;
35-
%val = call i32 @llvm.nvvm.prmt.f4e(i32 %a, i32 %b, i32 %c)
35+
%val = call i32 @llvm.nvvm.prmt.f4e(i32 %lo, i32 %hi, i32 %selector)
3636
ret i32 %val
3737
}
3838

39-
define i32 @test_prmt_b4e(i32 %a, i32 %b, i32 %c) {
39+
define i32 @test_prmt_b4e(i32 %lo, i32 %hi, i32 %selector) {
4040
; CHECK-LABEL: test_prmt_b4e(
4141
; CHECK: {
4242
; CHECK-NEXT: .reg .b32 %r<5>;
@@ -48,11 +48,11 @@ define i32 @test_prmt_b4e(i32 %a, i32 %b, i32 %c) {
4848
; CHECK-NEXT: prmt.b32.b4e %r4, %r1, %r2, %r3;
4949
; CHECK-NEXT: st.param.b32 [func_retval0], %r4;
5050
; CHECK-NEXT: ret;
51-
%val = call i32 @llvm.nvvm.prmt.b4e(i32 %a, i32 %b, i32 %c)
51+
%val = call i32 @llvm.nvvm.prmt.b4e(i32 %lo, i32 %hi, i32 %selector)
5252
ret i32 %val
5353
}
5454

55-
define i32 @test_prmt_rc8(i32 %a, i32 %b) {
55+
define i32 @test_prmt_rc8(i32 %lo, i32 %selector) {
5656
; CHECK-LABEL: test_prmt_rc8(
5757
; CHECK: {
5858
; CHECK-NEXT: .reg .b32 %r<4>;
@@ -63,11 +63,11 @@ define i32 @test_prmt_rc8(i32 %a, i32 %b) {
6363
; CHECK-NEXT: prmt.b32.rc8 %r3, %r1, 0, %r2;
6464
; CHECK-NEXT: st.param.b32 [func_retval0], %r3;
6565
; CHECK-NEXT: ret;
66-
%val = call i32 @llvm.nvvm.prmt.rc8(i32 %a, i32 %b)
66+
%val = call i32 @llvm.nvvm.prmt.rc8(i32 %lo, i32 %selector)
6767
ret i32 %val
6868
}
6969

70-
define i32 @test_prmt_ecl(i32 %a, i32 %b) {
70+
define i32 @test_prmt_ecl(i32 %lo, i32 %selector) {
7171
; CHECK-LABEL: test_prmt_ecl(
7272
; CHECK: {
7373
; CHECK-NEXT: .reg .b32 %r<4>;
@@ -78,11 +78,11 @@ define i32 @test_prmt_ecl(i32 %a, i32 %b) {
7878
; CHECK-NEXT: prmt.b32.ecl %r3, %r1, 0, %r2;
7979
; CHECK-NEXT: st.param.b32 [func_retval0], %r3;
8080
; CHECK-NEXT: ret;
81-
%val = call i32 @llvm.nvvm.prmt.ecl(i32 %a, i32 %b)
81+
%val = call i32 @llvm.nvvm.prmt.ecl(i32 %lo, i32 %selector)
8282
ret i32 %val
8383
}
8484

85-
define i32 @test_prmt_ecr(i32 %a, i32 %b) {
85+
define i32 @test_prmt_ecr(i32 %lo, i32 %selector) {
8686
; CHECK-LABEL: test_prmt_ecr(
8787
; CHECK: {
8888
; CHECK-NEXT: .reg .b32 %r<4>;
@@ -93,11 +93,11 @@ define i32 @test_prmt_ecr(i32 %a, i32 %b) {
9393
; CHECK-NEXT: prmt.b32.ecr %r3, %r1, 0, %r2;
9494
; CHECK-NEXT: st.param.b32 [func_retval0], %r3;
9595
; CHECK-NEXT: ret;
96-
%val = call i32 @llvm.nvvm.prmt.ecr(i32 %a, i32 %b)
96+
%val = call i32 @llvm.nvvm.prmt.ecr(i32 %lo, i32 %selector)
9797
ret i32 %val
9898
}
9999

100-
define i32 @test_prmt_rc16(i32 %a, i32 %b) {
100+
define i32 @test_prmt_rc16(i32 %lo, i32 %selector) {
101101
; CHECK-LABEL: test_prmt_rc16(
102102
; CHECK: {
103103
; CHECK-NEXT: .reg .b32 %r<4>;
@@ -108,6 +108,6 @@ define i32 @test_prmt_rc16(i32 %a, i32 %b) {
108108
; CHECK-NEXT: prmt.b32.rc16 %r3, %r1, 0, %r2;
109109
; CHECK-NEXT: st.param.b32 [func_retval0], %r3;
110110
; CHECK-NEXT: ret;
111-
%val = call i32 @llvm.nvvm.prmt.rc16(i32 %a, i32 %b)
111+
%val = call i32 @llvm.nvvm.prmt.rc16(i32 %lo, i32 %selector)
112112
ret i32 %val
113113
}

0 commit comments

Comments
 (0)