Skip to content

Commit 099bf20

Browse files
authored
[NVPTX] Add idp2a, idp4a intrinsics (#102763)
Add support for `llvm.nvvm.idp2a` and `llvm.nvvm.idp4a` which correspond directly to `dp2a` and `dp4a` PTX instructions.
1 parent 2c12c1e commit 099bf20

File tree

5 files changed

+340
-0
lines changed

5 files changed

+340
-0
lines changed

llvm/docs/NVPTXUsage.rst

Lines changed: 71 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -287,6 +287,77 @@ The ``@llvm.nvvm.fence.proxy.tensormap_generic.*`` is a uni-directional fence us
287287

288288
The address operand ``addr`` and the operand ``size`` together specify the memory range ``[addr, addr+size)`` on which the ordering guarantees on the memory accesses across the proxies are to be provided. The only supported value for the ``size`` operand is ``128``, and it must be an immediate. Generic Addressing is used unconditionally, and the address specified by the operand ``addr`` must fall within the ``.global`` state space. Otherwise, the behavior is undefined. For more information, see `PTX ISA <https://docs.nvidia.com/cuda/parallel-thread-execution/#parallel-synchronization-and-communication-instructions-membar>`_.
289289

290+
Arithmetic Intrinsics
291+
---------------------
292+
293+
'``llvm.nvvm.idp2a.[us].[us]``' Intrinsics
294+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
295+
296+
Syntax:
297+
"""""""
298+
299+
.. code-block:: llvm
300+
301+
declare i32 @llvm.nvvm.idp2a.s.s(i32 %a, i32 %b, i1 immarg %is.hi, i32 %c)
302+
declare i32 @llvm.nvvm.idp2a.s.u(i32 %a, i32 %b, i1 immarg %is.hi, i32 %c)
303+
declare i32 @llvm.nvvm.idp2a.u.s(i32 %a, i32 %b, i1 immarg %is.hi, i32 %c)
304+
declare i32 @llvm.nvvm.idp2a.u.u(i32 %a, i32 %b, i1 immarg %is.hi, i32 %c)
305+
306+
307+
Overview:
308+
"""""""""
309+
310+
The '``llvm.nvvm.idp2a.[us].[us]``' intrinsics perform a 2-element vector dot
311+
product followed by addition. They correspond directly to the ``dp2a`` PTX
312+
instruction.
313+
314+
Semantics:
315+
""""""""""
316+
317+
The 32-bit value in ``%a`` is broken into 2 16-bit values which are extended to
318+
32 bits. For the '``llvm.nvvm.idp2a.u.[us]``' variants zero-extension is used,
319+
while for the '``llvm.nvvm.idp2a.s.[us]``' variants sign-extension is used. Two bytes are
320+
selected from ``%b``: if ``%is.hi`` is true, the most significant bytes are
321+
selected, otherwise the least significant bytes are selected. These bytes are
322+
then extended to 32 bits. For the '``llvm.nvvm.idp2a.[us].u``' variants
323+
zero-extension is used, while for the '``llvm.nvvm.idp2a.[us].s``' variants
324+
sign-extension is used. The dot product of these 2-element vectors is added to
325+
``%c`` to produce the return.
326+
327+
328+
'``llvm.nvvm.idp4a.[us].[us]``' Intrinsics
329+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
330+
331+
Syntax:
332+
"""""""
333+
334+
.. code-block:: llvm
335+
336+
declare i32 @llvm.nvvm.idp4a.s.s(i32 %a, i32 %b, i32 %c)
337+
declare i32 @llvm.nvvm.idp4a.s.u(i32 %a, i32 %b, i32 %c)
338+
declare i32 @llvm.nvvm.idp4a.u.s(i32 %a, i32 %b, i32 %c)
339+
declare i32 @llvm.nvvm.idp4a.u.u(i32 %a, i32 %b, i32 %c)
340+
341+
Overview:
342+
"""""""""
343+
344+
The '``llvm.nvvm.idp4a.[us].[us]``' intrinsics perform a 4-element vector dot
345+
product followed by addition. They correspond directly to the ``dp4a`` PTX
346+
instruction.
347+
348+
Semantics:
349+
""""""""""
350+
351+
Each of the 4 bytes in both ``%a`` and ``%b`` is extended to a 32-bit integer,
352+
forming two ``<4 x i32>`` vectors. For ``%a``, zero-extension is used in the
353+
'``llvm.nvvm.idp4a.u.[us]``' variants, while sign-extension is used with
354+
'``llvm.nvvm.idp4a.s.[us]``' variants. Similarly, for ``%b``, zero-extension is
355+
used in the '``llvm.nvvm.idp4a.[us].u``' variants, while sign-extension is used
356+
with '``llvm.nvvm.idp4a.[us].s``' variants. The dot product of these 4-element
357+
vectors is added to ``%c`` to produce the return.
358+
359+
360+
290361
Other Intrinsics
291362
----------------
292363

llvm/include/llvm/IR/IntrinsicsNVVM.td

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1052,6 +1052,22 @@ let TargetPrefix = "nvvm" in {
10521052
DefaultAttrsIntrinsic<[llvm_double_ty], [llvm_double_ty, llvm_double_ty],
10531053
[IntrNoMem, IntrSpeculatable, Commutative]>;
10541054

1055+
//
1056+
// Dot Product
1057+
//
1058+
foreach a_type = ["s", "u"] in {
1059+
foreach b_type = ["s", "u"] in {
1060+
def int_nvvm_idp4a_ # a_type # _ # b_type :
1061+
DefaultAttrsIntrinsic<[llvm_i32_ty],
1062+
[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
1063+
[IntrNoMem, IntrSpeculatable]>;
1064+
def int_nvvm_idp2a_ # a_type # _ # b_type :
1065+
DefaultAttrsIntrinsic<[llvm_i32_ty],
1066+
[llvm_i32_ty, llvm_i32_ty, llvm_i1_ty, llvm_i32_ty],
1067+
[IntrNoMem, IntrSpeculatable, ImmArg<ArgIndex<2>>]>;
1068+
}
1069+
}
1070+
10551071
//
10561072
// Convert
10571073
//

llvm/lib/Target/NVPTX/NVPTXInstrInfo.td

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -159,6 +159,7 @@ def do_SQRTF32_RN : Predicate<"usePrecSqrtF32()">;
159159

160160
def hasHWROT32 : Predicate<"Subtarget->hasHWROT32()">;
161161
def noHWROT32 : Predicate<"!Subtarget->hasHWROT32()">;
162+
def hasDotInstructions : Predicate<"Subtarget->hasDotInstructions()">;
162163

163164
def True : Predicate<"true">;
164165
def False : Predicate<"false">;
@@ -3920,6 +3921,33 @@ let isTerminator = 1, isBranch = 1, isIndirectBranch = 1, isNotDuplicable = 1 in
39203921
}
39213922

39223923

3924+
foreach a_type = ["s", "u"] in {
3925+
foreach b_type = ["s", "u"] in {
3926+
3927+
def DOT4_ # a_type # b_type :
3928+
NVPTXInst<(outs Int32Regs:$dst),
3929+
(ins Int32Regs:$a, Int32Regs:$b, Int32Regs:$c),
3930+
"dp4a." # a_type # "32." # b_type # "32 \t$dst, $a, $b, $c;",
3931+
[(set Int32Regs:$dst,
3932+
(!cast<Intrinsic>("int_nvvm_idp4a_" # a_type # "_" # b_type)
3933+
(i32 Int32Regs:$a), (i32 Int32Regs:$b), (i32 Int32Regs:$c)))]>,
3934+
Requires<[hasDotInstructions]>;
3935+
3936+
foreach is_hi = [0, -1] in {
3937+
defvar lohi_suffix = !if(is_hi, "hi", "lo");
3938+
3939+
def DOT2_ # lohi_suffix # _ # a_type # b_type :
3940+
NVPTXInst<(outs Int32Regs:$dst),
3941+
(ins Int32Regs:$a, Int32Regs:$b, Int32Regs:$c),
3942+
"dp2a." # lohi_suffix # "." # a_type # "32." # b_type # "32 \t$dst, $a, $b, $c;",
3943+
[(set Int32Regs:$dst,
3944+
(!cast<Intrinsic>("int_nvvm_idp2a_" # a_type # "_" # b_type)
3945+
(i32 Int32Regs:$a), (i32 Int32Regs:$b), is_hi, (i32 Int32Regs:$c)))]>,
3946+
Requires<[hasDotInstructions]>;
3947+
}
3948+
}
3949+
}
3950+
39233951
include "NVPTXIntrinsics.td"
39243952

39253953
//-----------------------------------

llvm/lib/Target/NVPTX/NVPTXSubtarget.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -90,6 +90,9 @@ class NVPTXSubtarget : public NVPTXGenSubtargetInfo {
9090
bool hasMemoryOrdering() const { return SmVersion >= 70 && PTXVersion >= 60; }
9191
// Does SM & PTX support atomic relaxed MMIO operations ?
9292
bool hasRelaxedMMIO() const { return SmVersion >= 70 && PTXVersion >= 82; }
93+
bool hasDotInstructions() const {
94+
return SmVersion >= 61 && PTXVersion >= 50;
95+
}
9396
unsigned int getFullSmVersion() const { return FullSmVersion; }
9497
unsigned int getSmVersion() const { return getFullSmVersion() / 10; }
9598
// GPUs with "a" suffix include architecture-accelerated features that
Lines changed: 222 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,222 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
2+
; RUN: llc < %s -march=nvptx -mcpu=sm_61 | FileCheck %s
3+
; RUN: llc < %s -march=nvptx64 -mcpu=sm_61 | FileCheck %s
4+
5+
target triple = "nvptx-nvidia-cuda"
6+
7+
declare i32 @llvm.nvvm.idp4a.s.s(i32, i32, i32)
8+
declare i32 @llvm.nvvm.idp4a.s.u(i32, i32, i32)
9+
declare i32 @llvm.nvvm.idp4a.u.s(i32, i32, i32)
10+
declare i32 @llvm.nvvm.idp4a.u.u(i32, i32, i32)
11+
12+
define i32 @test_dp4a_u32_u32(i32 %a, i32 %b, i32 %c) {
13+
; CHECK-LABEL: test_dp4a_u32_u32(
14+
; CHECK: {
15+
; CHECK-NEXT: .reg .b32 %r<5>;
16+
; CHECK-EMPTY:
17+
; CHECK-NEXT: // %bb.0:
18+
; CHECK-NEXT: ld.param.u32 %r1, [test_dp4a_u32_u32_param_0];
19+
; CHECK-NEXT: ld.param.u32 %r2, [test_dp4a_u32_u32_param_1];
20+
; CHECK-NEXT: ld.param.u32 %r3, [test_dp4a_u32_u32_param_2];
21+
; CHECK-NEXT: dp4a.u32.u32 %r4, %r1, %r2, %r3;
22+
; CHECK-NEXT: st.param.b32 [func_retval0+0], %r4;
23+
; CHECK-NEXT: ret;
24+
%call = call i32 @llvm.nvvm.idp4a.u.u(i32 %a, i32 %b, i32 %c)
25+
ret i32 %call
26+
}
27+
28+
define i32 @test_dp4a_u32imm_u32imm(i32 %c) {
29+
; CHECK-LABEL: test_dp4a_u32imm_u32imm(
30+
; CHECK: {
31+
; CHECK-NEXT: .reg .b32 %r<4>;
32+
; CHECK-EMPTY:
33+
; CHECK-NEXT: // %bb.0:
34+
; CHECK-NEXT: ld.param.u32 %r1, [test_dp4a_u32imm_u32imm_param_0];
35+
; CHECK-NEXT: mov.b32 %r2, 0;
36+
; CHECK-NEXT: dp4a.u32.u32 %r3, %r2, %r2, %r1;
37+
; CHECK-NEXT: st.param.b32 [func_retval0+0], %r3;
38+
; CHECK-NEXT: ret;
39+
%call = call i32 @llvm.nvvm.idp4a.u.u(i32 0, i32 0, i32 %c)
40+
ret i32 %call
41+
}
42+
43+
define i32 @test_dp4a_u32_s32(i32 %a, i32 %b, i32 %c) {
44+
; CHECK-LABEL: test_dp4a_u32_s32(
45+
; CHECK: {
46+
; CHECK-NEXT: .reg .b32 %r<5>;
47+
; CHECK-EMPTY:
48+
; CHECK-NEXT: // %bb.0:
49+
; CHECK-NEXT: ld.param.u32 %r1, [test_dp4a_u32_s32_param_0];
50+
; CHECK-NEXT: ld.param.u32 %r2, [test_dp4a_u32_s32_param_1];
51+
; CHECK-NEXT: ld.param.u32 %r3, [test_dp4a_u32_s32_param_2];
52+
; CHECK-NEXT: dp4a.u32.s32 %r4, %r1, %r2, %r3;
53+
; CHECK-NEXT: st.param.b32 [func_retval0+0], %r4;
54+
; CHECK-NEXT: ret;
55+
%call = call i32 @llvm.nvvm.idp4a.u.s(i32 %a, i32 %b, i32 %c)
56+
ret i32 %call
57+
}
58+
59+
define i32 @test_dp4a_s32_u32(i32 %a, i32 %b, i32 %c) {
60+
; CHECK-LABEL: test_dp4a_s32_u32(
61+
; CHECK: {
62+
; CHECK-NEXT: .reg .b32 %r<5>;
63+
; CHECK-EMPTY:
64+
; CHECK-NEXT: // %bb.0:
65+
; CHECK-NEXT: ld.param.u32 %r1, [test_dp4a_s32_u32_param_0];
66+
; CHECK-NEXT: ld.param.u32 %r2, [test_dp4a_s32_u32_param_1];
67+
; CHECK-NEXT: ld.param.u32 %r3, [test_dp4a_s32_u32_param_2];
68+
; CHECK-NEXT: dp4a.s32.u32 %r4, %r1, %r2, %r3;
69+
; CHECK-NEXT: st.param.b32 [func_retval0+0], %r4;
70+
; CHECK-NEXT: ret;
71+
%call = call i32 @llvm.nvvm.idp4a.s.u(i32 %a, i32 %b, i32 %c)
72+
ret i32 %call
73+
}
74+
75+
define i32 @test_dp4a_s32_s32(i32 %a, i32 %b, i32 %c) {
76+
; CHECK-LABEL: test_dp4a_s32_s32(
77+
; CHECK: {
78+
; CHECK-NEXT: .reg .b32 %r<5>;
79+
; CHECK-EMPTY:
80+
; CHECK-NEXT: // %bb.0:
81+
; CHECK-NEXT: ld.param.u32 %r1, [test_dp4a_s32_s32_param_0];
82+
; CHECK-NEXT: ld.param.u32 %r2, [test_dp4a_s32_s32_param_1];
83+
; CHECK-NEXT: ld.param.u32 %r3, [test_dp4a_s32_s32_param_2];
84+
; CHECK-NEXT: dp4a.s32.s32 %r4, %r1, %r2, %r3;
85+
; CHECK-NEXT: st.param.b32 [func_retval0+0], %r4;
86+
; CHECK-NEXT: ret;
87+
%call = call i32 @llvm.nvvm.idp4a.s.s(i32 %a, i32 %b, i32 %c)
88+
ret i32 %call
89+
}
90+
91+
declare i32 @llvm.nvvm.idp2a.s.s(i32, i32, i1 immarg, i32)
92+
declare i32 @llvm.nvvm.idp2a.s.u(i32, i32, i1 immarg, i32)
93+
declare i32 @llvm.nvvm.idp2a.u.s(i32, i32, i1 immarg, i32)
94+
declare i32 @llvm.nvvm.idp2a.u.u(i32, i32, i1 immarg, i32)
95+
96+
define i32 @test_dp2a_lo_u32_u32(i32 %a, i32 %b, i32 %c) {
97+
; CHECK-LABEL: test_dp2a_lo_u32_u32(
98+
; CHECK: {
99+
; CHECK-NEXT: .reg .b32 %r<5>;
100+
; CHECK-EMPTY:
101+
; CHECK-NEXT: // %bb.0:
102+
; CHECK-NEXT: ld.param.u32 %r1, [test_dp2a_lo_u32_u32_param_0];
103+
; CHECK-NEXT: ld.param.u32 %r2, [test_dp2a_lo_u32_u32_param_1];
104+
; CHECK-NEXT: ld.param.u32 %r3, [test_dp2a_lo_u32_u32_param_2];
105+
; CHECK-NEXT: dp2a.lo.u32.u32 %r4, %r1, %r2, %r3;
106+
; CHECK-NEXT: st.param.b32 [func_retval0+0], %r4;
107+
; CHECK-NEXT: ret;
108+
%call = call i32 @llvm.nvvm.idp2a.u.u(i32 %a, i32 %b, i1 0, i32 %c)
109+
ret i32 %call
110+
}
111+
112+
define i32 @test_dp2a_lo_u32_s32(i32 %a, i32 %b, i32 %c) {
113+
; CHECK-LABEL: test_dp2a_lo_u32_s32(
114+
; CHECK: {
115+
; CHECK-NEXT: .reg .b32 %r<5>;
116+
; CHECK-EMPTY:
117+
; CHECK-NEXT: // %bb.0:
118+
; CHECK-NEXT: ld.param.u32 %r1, [test_dp2a_lo_u32_s32_param_0];
119+
; CHECK-NEXT: ld.param.u32 %r2, [test_dp2a_lo_u32_s32_param_1];
120+
; CHECK-NEXT: ld.param.u32 %r3, [test_dp2a_lo_u32_s32_param_2];
121+
; CHECK-NEXT: dp2a.lo.u32.s32 %r4, %r1, %r2, %r3;
122+
; CHECK-NEXT: st.param.b32 [func_retval0+0], %r4;
123+
; CHECK-NEXT: ret;
124+
%call = call i32 @llvm.nvvm.idp2a.u.s(i32 %a, i32 %b, i1 0, i32 %c)
125+
ret i32 %call
126+
}
127+
128+
define i32 @test_dp2a_lo_s32_u32(i32 %a, i32 %b, i32 %c) {
129+
; CHECK-LABEL: test_dp2a_lo_s32_u32(
130+
; CHECK: {
131+
; CHECK-NEXT: .reg .b32 %r<5>;
132+
; CHECK-EMPTY:
133+
; CHECK-NEXT: // %bb.0:
134+
; CHECK-NEXT: ld.param.u32 %r1, [test_dp2a_lo_s32_u32_param_0];
135+
; CHECK-NEXT: ld.param.u32 %r2, [test_dp2a_lo_s32_u32_param_1];
136+
; CHECK-NEXT: ld.param.u32 %r3, [test_dp2a_lo_s32_u32_param_2];
137+
; CHECK-NEXT: dp2a.lo.s32.u32 %r4, %r1, %r2, %r3;
138+
; CHECK-NEXT: st.param.b32 [func_retval0+0], %r4;
139+
; CHECK-NEXT: ret;
140+
%call = call i32 @llvm.nvvm.idp2a.s.u(i32 %a, i32 %b, i1 0, i32 %c)
141+
ret i32 %call
142+
}
143+
144+
define i32 @test_dp2a_lo_s32_s32(i32 %a, i32 %b, i32 %c) {
145+
; CHECK-LABEL: test_dp2a_lo_s32_s32(
146+
; CHECK: {
147+
; CHECK-NEXT: .reg .b32 %r<5>;
148+
; CHECK-EMPTY:
149+
; CHECK-NEXT: // %bb.0:
150+
; CHECK-NEXT: ld.param.u32 %r1, [test_dp2a_lo_s32_s32_param_0];
151+
; CHECK-NEXT: ld.param.u32 %r2, [test_dp2a_lo_s32_s32_param_1];
152+
; CHECK-NEXT: ld.param.u32 %r3, [test_dp2a_lo_s32_s32_param_2];
153+
; CHECK-NEXT: dp2a.lo.s32.s32 %r4, %r1, %r2, %r3;
154+
; CHECK-NEXT: st.param.b32 [func_retval0+0], %r4;
155+
; CHECK-NEXT: ret;
156+
%call = call i32 @llvm.nvvm.idp2a.s.s(i32 %a, i32 %b, i1 0, i32 %c)
157+
ret i32 %call
158+
}
159+
160+
define i32 @test_dp2a_hi_u32_u32(i32 %a, i32 %b, i32 %c) {
161+
; CHECK-LABEL: test_dp2a_hi_u32_u32(
162+
; CHECK: {
163+
; CHECK-NEXT: .reg .b32 %r<5>;
164+
; CHECK-EMPTY:
165+
; CHECK-NEXT: // %bb.0:
166+
; CHECK-NEXT: ld.param.u32 %r1, [test_dp2a_hi_u32_u32_param_0];
167+
; CHECK-NEXT: ld.param.u32 %r2, [test_dp2a_hi_u32_u32_param_1];
168+
; CHECK-NEXT: ld.param.u32 %r3, [test_dp2a_hi_u32_u32_param_2];
169+
; CHECK-NEXT: dp2a.hi.u32.u32 %r4, %r1, %r2, %r3;
170+
; CHECK-NEXT: st.param.b32 [func_retval0+0], %r4;
171+
; CHECK-NEXT: ret;
172+
%call = call i32 @llvm.nvvm.idp2a.u.u(i32 %a, i32 %b, i1 1, i32 %c)
173+
ret i32 %call
174+
}
175+
176+
define i32 @test_dp2a_hi_u32_s32(i32 %a, i32 %b, i32 %c) {
177+
; CHECK-LABEL: test_dp2a_hi_u32_s32(
178+
; CHECK: {
179+
; CHECK-NEXT: .reg .b32 %r<5>;
180+
; CHECK-EMPTY:
181+
; CHECK-NEXT: // %bb.0:
182+
; CHECK-NEXT: ld.param.u32 %r1, [test_dp2a_hi_u32_s32_param_0];
183+
; CHECK-NEXT: ld.param.u32 %r2, [test_dp2a_hi_u32_s32_param_1];
184+
; CHECK-NEXT: ld.param.u32 %r3, [test_dp2a_hi_u32_s32_param_2];
185+
; CHECK-NEXT: dp2a.hi.u32.s32 %r4, %r1, %r2, %r3;
186+
; CHECK-NEXT: st.param.b32 [func_retval0+0], %r4;
187+
; CHECK-NEXT: ret;
188+
%call = call i32 @llvm.nvvm.idp2a.u.s(i32 %a, i32 %b, i1 1, i32 %c)
189+
ret i32 %call
190+
}
191+
192+
define i32 @test_dp2a_hi_s32_u32(i32 %a, i32 %b, i32 %c) {
193+
; CHECK-LABEL: test_dp2a_hi_s32_u32(
194+
; CHECK: {
195+
; CHECK-NEXT: .reg .b32 %r<5>;
196+
; CHECK-EMPTY:
197+
; CHECK-NEXT: // %bb.0:
198+
; CHECK-NEXT: ld.param.u32 %r1, [test_dp2a_hi_s32_u32_param_0];
199+
; CHECK-NEXT: ld.param.u32 %r2, [test_dp2a_hi_s32_u32_param_1];
200+
; CHECK-NEXT: ld.param.u32 %r3, [test_dp2a_hi_s32_u32_param_2];
201+
; CHECK-NEXT: dp2a.hi.s32.u32 %r4, %r1, %r2, %r3;
202+
; CHECK-NEXT: st.param.b32 [func_retval0+0], %r4;
203+
; CHECK-NEXT: ret;
204+
%call = call i32 @llvm.nvvm.idp2a.s.u(i32 %a, i32 %b, i1 1, i32 %c)
205+
ret i32 %call
206+
}
207+
208+
define i32 @test_dp2a_hi_s32_s32(i32 %a, i32 %b, i32 %c) {
209+
; CHECK-LABEL: test_dp2a_hi_s32_s32(
210+
; CHECK: {
211+
; CHECK-NEXT: .reg .b32 %r<5>;
212+
; CHECK-EMPTY:
213+
; CHECK-NEXT: // %bb.0:
214+
; CHECK-NEXT: ld.param.u32 %r1, [test_dp2a_hi_s32_s32_param_0];
215+
; CHECK-NEXT: ld.param.u32 %r2, [test_dp2a_hi_s32_s32_param_1];
216+
; CHECK-NEXT: ld.param.u32 %r3, [test_dp2a_hi_s32_s32_param_2];
217+
; CHECK-NEXT: dp2a.hi.s32.s32 %r4, %r1, %r2, %r3;
218+
; CHECK-NEXT: st.param.b32 [func_retval0+0], %r4;
219+
; CHECK-NEXT: ret;
220+
%call = call i32 @llvm.nvvm.idp2a.s.s(i32 %a, i32 %b, i1 1, i32 %c)
221+
ret i32 %call
222+
}

0 commit comments

Comments
 (0)