Skip to content

Commit 671e2ba

Browse files
authored
[NVPTX] Improve lowering of v2i16 logical ops. (llvm#67365)
Bitwise logical ops can always be lowered as b32 operations, regardless of whether the other v2i16 ops (which need a newer GPU) are available. This also adds the previously missing lowering for the two-register-operand variants, plus additional tests for `and`.
1 parent 4ca00a5 commit 671e2ba

File tree

4 files changed

+571
-511
lines changed

4 files changed

+571
-511
lines changed

llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -642,10 +642,9 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
642642
setI16x2OperationAction(ISD::UREM, MVT::v2i16, Legal, Custom);
643643

644644
// Other arithmetic, shift and conversion ops are unsupported. Bitwise
// and/or/xor are not listed here: they are lowered as regular b32 ops.
645-
setOperationAction({ISD::AND, ISD::OR, ISD::XOR, ISD::SDIV, ISD::UDIV,
646-
ISD::SRA, ISD::SRL, ISD::MULHS, ISD::MULHU,
647-
ISD::FP_TO_SINT, ISD::FP_TO_UINT, ISD::SINT_TO_FP,
648-
ISD::UINT_TO_FP},
645+
setOperationAction({ISD::SDIV, ISD::UDIV, ISD::SRA, ISD::SRL, ISD::MULHS,
646+
ISD::MULHU, ISD::FP_TO_SINT, ISD::FP_TO_UINT,
647+
ISD::SINT_TO_FP, ISD::UINT_TO_FP},
649648
MVT::v2i16, Expand);
650649

651650
setOperationAction(ISD::ADDC, MVT::i32, Legal);

llvm/lib/Target/NVPTX/NVPTXInstrInfo.td

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1486,6 +1486,24 @@ defm OR : BITWISE<"or", or>;
14861486
defm AND : BITWISE<"and", and>;
14871487
defm XOR : BITWISE<"xor", xor>;
14881488

1489+
// Lower logical v2i16 ops as bitwise ops on b32.
1490+
def: Pat<(or (v2i16 Int32Regs:$a), (v2i16 Int32Regs:$b)),
1491+
(ORb32rr Int32Regs:$a, Int32Regs:$b)>;
1492+
def: Pat<(xor (v2i16 Int32Regs:$a), (v2i16 Int32Regs:$b)),
1493+
(XORb32rr Int32Regs:$a, Int32Regs:$b)>;
1494+
def: Pat<(and (v2i16 Int32Regs:$a), (v2i16 Int32Regs:$b)),
1495+
(ANDb32rr Int32Regs:$a, Int32Regs:$b)>;
1496+
1497+
// The constants get legalized into a bitcast from i32, so that's what we need
1498+
// to match here.
1499+
def: Pat<(or Int32Regs:$a, (v2i16 (bitconvert (i32 imm:$b)))),
1500+
(ORb32ri Int32Regs:$a, imm:$b)>;
1501+
def: Pat<(xor Int32Regs:$a, (v2i16 (bitconvert (i32 imm:$b)))),
1502+
(XORb32ri Int32Regs:$a, imm:$b)>;
1503+
def: Pat<(and Int32Regs:$a, (v2i16 (bitconvert (i32 imm:$b)))),
1504+
(ANDb32ri Int32Regs:$a, imm:$b)>;
1505+
1506+
14891507
def NOT1 : NVPTXInst<(outs Int1Regs:$dst), (ins Int1Regs:$src),
14901508
"not.pred \t$dst, $src;",
14911509
[(set Int1Regs:$dst, (not Int1Regs:$src))]>;

llvm/test/CodeGen/NVPTX/i16x2-instructions.ll

Lines changed: 145 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -235,6 +235,151 @@ define <2 x i16> @test_mul(<2 x i16> %a, <2 x i16> %b) #0 {
235235
ret <2 x i16> %r
236236
}
237237

238+
;; Logical ops are available on all GPUs as regular 32-bit logical ops
239+
; COMMON-LABEL: test_or(
240+
; COMMON-DAG: ld.param.u32 [[A:%r[0-9]+]], [test_or_param_0];
241+
; COMMON-DAG: ld.param.u32 [[B:%r[0-9]+]], [test_or_param_1];
242+
; COMMON-NEXT: or.b32 [[R:%r[0-9]+]], [[A]], [[B]];
243+
; COMMON-NEXT: st.param.b32 [func_retval0+0], [[R]];
244+
; COMMON-NEXT: ret;
245+
define <2 x i16> @test_or(<2 x i16> %a, <2 x i16> %b) #0 {
246+
%r = or <2 x i16> %a, %b
247+
ret <2 x i16> %r
248+
}
249+
250+
; Ops that operate on computed arguments go through a different lowering path
251+
; compared to the ones that operate on loaded data. So we test them separately.
252+
; Both operands are built from the scalar %a with insertelement, so the value
; is loaded as a 16-bit param and packed into b32 registers before the or.
; COMMON-LABEL: test_or_computed(
253+
; COMMON: ld.param.u16 [[A:%rs[0-9]+]], [test_or_computed_param_0];
254+
; COMMON-DAG: mov.u16 [[C0:%rs[0-9]+]], 0;
255+
; COMMON-DAG: mov.b32 [[R1:%r[0-9]+]], {[[A]], [[C0]]};
256+
; COMMON-DAG: mov.u16 [[C5:%rs[0-9]+]], 5;
257+
; COMMON-DAG: mov.b32 [[R2:%r[0-9]+]], {[[A]], [[C5]]};
258+
; COMMON: or.b32 [[R:%r[0-9]+]], [[R2]], [[R1]];
259+
; COMMON-NEXT: st.param.b32 [func_retval0+0], [[R]];
260+
define <2 x i16> @test_or_computed(i16 %a) {
261+
%ins.0 = insertelement <2 x i16> zeroinitializer, i16 %a, i32 0
262+
%ins.1 = insertelement <2 x i16> %ins.0, i16 5, i32 1
263+
%r = or <2 x i16> %ins.1, %ins.0
264+
ret <2 x i16> %r
265+
}
266+
267+
; Check that we can lower or with immediate arguments.
268+
; COMMON-LABEL: test_or_imm_0(
269+
; COMMON-DAG: ld.param.u32 [[A:%r[0-9]+]], [test_or_imm_0_param_0];
270+
; COMMON-NEXT: or.b32 [[R:%r[0-9]+]], [[A]], 131073;
271+
; COMMON-NEXT: st.param.b32 [func_retval0+0], [[R]];
272+
; COMMON-NEXT: ret;
273+
define <2 x i16> @test_or_imm_0(<2 x i16> %a) #0 {
274+
%r = or <2 x i16> <i16 1, i16 2>, %a
275+
ret <2 x i16> %r
276+
}
277+
278+
; Constant vector <1, 2> as the second operand of the or: expect a single
; or.b32 with the packed immediate 131073 (0x00020001).
; COMMON-LABEL: test_or_imm_1(
279+
; COMMON-DAG: ld.param.u32 [[A:%r[0-9]+]], [test_or_imm_1_param_0];
280+
; COMMON-NEXT: or.b32 [[R:%r[0-9]+]], [[A]], 131073;
281+
; COMMON-NEXT: st.param.b32 [func_retval0+0], [[R]];
282+
; COMMON-NEXT: ret;
283+
define <2 x i16> @test_or_imm_1(<2 x i16> %a) #0 {
284+
%r = or <2 x i16> %a, <i16 1, i16 2>
285+
ret <2 x i16> %r
286+
}
287+
288+
; COMMON-LABEL: test_xor(
289+
; COMMON-DAG: ld.param.u32 [[A:%r[0-9]+]], [test_xor_param_0];
290+
; COMMON-DAG: ld.param.u32 [[B:%r[0-9]+]], [test_xor_param_1];
291+
; COMMON-NEXT: xor.b32 [[R:%r[0-9]+]], [[A]], [[B]];
292+
; COMMON-NEXT: st.param.b32 [func_retval0+0], [[R]];
293+
; COMMON-NEXT: ret;
294+
define <2 x i16> @test_xor(<2 x i16> %a, <2 x i16> %b) #0 {
295+
%r = xor <2 x i16> %a, %b
296+
ret <2 x i16> %r
297+
}
298+
299+
; Both operands are built from the scalar %a with insertelement, so the value
; is loaded as a 16-bit param and packed into b32 registers before the xor.
; COMMON-LABEL: test_xor_computed(
300+
; COMMON: ld.param.u16 [[A:%rs[0-9]+]], [test_xor_computed_param_0];
301+
; COMMON-DAG: mov.u16 [[C0:%rs[0-9]+]], 0;
302+
; COMMON-DAG: mov.b32 [[R1:%r[0-9]+]], {[[A]], [[C0]]};
303+
; COMMON-DAG: mov.u16 [[C5:%rs[0-9]+]], 5;
304+
; COMMON-DAG: mov.b32 [[R2:%r[0-9]+]], {[[A]], [[C5]]};
305+
; COMMON: xor.b32 [[R:%r[0-9]+]], [[R2]], [[R1]];
306+
; COMMON-NEXT: st.param.b32 [func_retval0+0], [[R]];
307+
define <2 x i16> @test_xor_computed(i16 %a) {
308+
%ins.0 = insertelement <2 x i16> zeroinitializer, i16 %a, i32 0
309+
%ins.1 = insertelement <2 x i16> %ins.0, i16 5, i32 1
310+
%r = xor <2 x i16> %ins.1, %ins.0
311+
ret <2 x i16> %r
312+
}
313+
314+
; Check that we can lower xor with immediate arguments.
315+
; COMMON-LABEL: test_xor_imm_0(
316+
; COMMON-DAG: ld.param.u32 [[A:%r[0-9]+]], [test_xor_imm_0_param_0];
317+
; COMMON-NEXT: xor.b32 [[R:%r[0-9]+]], [[A]], 131073;
318+
; COMMON-NEXT: st.param.b32 [func_retval0+0], [[R]];
319+
; COMMON-NEXT: ret;
320+
define <2 x i16> @test_xor_imm_0(<2 x i16> %a) #0 {
321+
%r = xor <2 x i16> <i16 1, i16 2>, %a
322+
ret <2 x i16> %r
323+
}
324+
325+
; Constant vector <1, 2> as the second operand of the xor: expect a single
; xor.b32 with the packed immediate 131073 (0x00020001).
; COMMON-LABEL: test_xor_imm_1(
326+
; COMMON-DAG: ld.param.u32 [[A:%r[0-9]+]], [test_xor_imm_1_param_0];
327+
; COMMON-NEXT: xor.b32 [[R:%r[0-9]+]], [[A]], 131073;
328+
; COMMON-NEXT: st.param.b32 [func_retval0+0], [[R]];
329+
; COMMON-NEXT: ret;
330+
define <2 x i16> @test_xor_imm_1(<2 x i16> %a) #0 {
331+
%r = xor <2 x i16> %a, <i16 1, i16 2>
332+
ret <2 x i16> %r
333+
}
334+
335+
; COMMON-LABEL: test_and(
336+
; COMMON-DAG: ld.param.u32 [[A:%r[0-9]+]], [test_and_param_0];
337+
; COMMON-DAG: ld.param.u32 [[B:%r[0-9]+]], [test_and_param_1];
338+
; COMMON-NEXT: and.b32 [[R:%r[0-9]+]], [[A]], [[B]];
339+
; COMMON-NEXT: st.param.b32 [func_retval0+0], [[R]];
340+
; COMMON-NEXT: ret;
341+
define <2 x i16> @test_and(<2 x i16> %a, <2 x i16> %b) #0 {
342+
%r = and <2 x i16> %a, %b
343+
ret <2 x i16> %r
344+
}
345+
346+
; Ops that operate on computed arguments go through a different lowering path
347+
; compared to the ones that operate on loaded data. So we test them separately.
348+
; Both operands are built from the scalar %a with insertelement, so the value
; is loaded as a 16-bit param and packed into b32 registers before the and.
; COMMON-LABEL: test_and_computed(
349+
; COMMON: ld.param.u16 [[A:%rs[0-9]+]], [test_and_computed_param_0];
350+
; COMMON-DAG: mov.u16 [[C0:%rs[0-9]+]], 0;
351+
; COMMON-DAG: mov.b32 [[R1:%r[0-9]+]], {[[A]], [[C0]]};
352+
; COMMON-DAG: mov.u16 [[C5:%rs[0-9]+]], 5;
353+
; COMMON-DAG: mov.b32 [[R2:%r[0-9]+]], {[[A]], [[C5]]};
354+
; COMMON: and.b32 [[R:%r[0-9]+]], [[R2]], [[R1]];
355+
; COMMON-NEXT: st.param.b32 [func_retval0+0], [[R]];
356+
define <2 x i16> @test_and_computed(i16 %a) {
357+
%ins.0 = insertelement <2 x i16> zeroinitializer, i16 %a, i32 0
358+
%ins.1 = insertelement <2 x i16> %ins.0, i16 5, i32 1
359+
%r = and <2 x i16> %ins.1, %ins.0
360+
ret <2 x i16> %r
361+
}
362+
363+
; Check that we can lower and with immediate arguments.
364+
; COMMON-LABEL: test_and_imm_0(
365+
; COMMON-DAG: ld.param.u32 [[A:%r[0-9]+]], [test_and_imm_0_param_0];
366+
; COMMON-NEXT: and.b32 [[R:%r[0-9]+]], [[A]], 131073;
367+
; COMMON-NEXT: st.param.b32 [func_retval0+0], [[R]];
368+
; COMMON-NEXT: ret;
369+
define <2 x i16> @test_and_imm_0(<2 x i16> %a) #0 {
370+
%r = and <2 x i16> <i16 1, i16 2>, %a
371+
ret <2 x i16> %r
372+
}
373+
374+
; Constant vector <1, 2> as the second operand of the and: expect a single
; and.b32 with the packed immediate 131073 (0x00020001).
; COMMON-LABEL: test_and_imm_1(
375+
; COMMON-DAG: ld.param.u32 [[A:%r[0-9]+]], [test_and_imm_1_param_0];
376+
; COMMON-NEXT: and.b32 [[R:%r[0-9]+]], [[A]], 131073;
377+
; COMMON-NEXT: st.param.b32 [func_retval0+0], [[R]];
378+
; COMMON-NEXT: ret;
379+
define <2 x i16> @test_and_imm_1(<2 x i16> %a) #0 {
380+
%r = and <2 x i16> %a, <i16 1, i16 2>
381+
ret <2 x i16> %r
382+
}
238383

239384
; COMMON-LABEL: .func test_ldst_v2i16(
240385
; COMMON-DAG: ld.param.u64 [[A:%rd[0-9]+]], [test_ldst_v2i16_param_0];

0 commit comments

Comments
 (0)