Skip to content

Commit 0970633

Browse files
committed
[NVPTX] Set v2i16 SETCC to Expand
Note that the v2i16 in the title refers to the return type of SETCC. A SETCC returning v2i16 is not legal in PTX, but it was assumed to be legal because v2i16 is declared a legal type. We were already expanding v4i8 SETCC. In certain circumstances the DAGCombiner would try to fold an extension of a v2i1 SETCC (itself illegal, because v2i1 is an illegal type) into a "legal" v2i16 SETCC, for which we wouldn't have patterns.
1 parent 1e23a61 commit 0970633

File tree

2 files changed

+73
-1
lines changed

2 files changed

+73
-1
lines changed

llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -725,7 +725,7 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
725725
// Other arithmetic and logic ops are unsupported.
726726
setOperationAction({ISD::SDIV, ISD::UDIV, ISD::SRA, ISD::SRL, ISD::MULHS,
727727
ISD::MULHU, ISD::FP_TO_SINT, ISD::FP_TO_UINT,
728-
ISD::SINT_TO_FP, ISD::UINT_TO_FP},
728+
ISD::SINT_TO_FP, ISD::UINT_TO_FP, ISD::SETCC},
729729
MVT::v2i16, Expand);
730730

731731
setOperationAction(ISD::ADDC, MVT::i32, Legal);

llvm/test/CodeGen/NVPTX/sext-setcc.ll

Lines changed: 72 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,72 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
2+
; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_80 | FileCheck %s
3+
; RUN: %if ptxas-11.0 %{ llc < %s -march=nvptx64 -mcpu=sm_80 -mattr=+ptx70 | %ptxas-verify -arch=sm_80 %}
4+
5+
define <2 x i16> @sext_setcc_v2i1_to_v2i16(ptr %p) {
6+
; CHECK-LABEL: sext_setcc_v2i1_to_v2i16(
7+
; CHECK: {
8+
; CHECK-NEXT: .reg .pred %p<3>;
9+
; CHECK-NEXT: .reg .b16 %rs<5>;
10+
; CHECK-NEXT: .reg .b32 %r<3>;
11+
; CHECK-NEXT: .reg .b64 %rd<2>;
12+
; CHECK-EMPTY:
13+
; CHECK-NEXT: // %bb.0: // %entry
14+
; CHECK-NEXT: ld.param.u64 %rd1, [sext_setcc_v2i1_to_v2i16_param_0];
15+
; CHECK-NEXT: ld.u32 %r1, [%rd1];
16+
; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r1;
17+
; CHECK-NEXT: setp.eq.s16 %p1, %rs1, 0;
18+
; CHECK-NEXT: setp.eq.s16 %p2, %rs2, 0;
19+
; CHECK-NEXT: selp.s16 %rs3, -1, 0, %p2;
20+
; CHECK-NEXT: selp.s16 %rs4, -1, 0, %p1;
21+
; CHECK-NEXT: mov.b32 %r2, {%rs4, %rs3};
22+
; CHECK-NEXT: st.param.b32 [func_retval0+0], %r2;
23+
; CHECK-NEXT: ret;
24+
entry:
25+
%v = load <2 x i16>, ptr %p, align 4
26+
%cmp = icmp eq <2 x i16> %v, zeroinitializer
27+
%sext = sext <2 x i1> %cmp to <2 x i16>
28+
ret <2 x i16> %sext
29+
}
30+
31+
define <4 x i8> @sext_setcc_v4i1_to_v4i8(ptr %p) {
32+
; CHECK-LABEL: sext_setcc_v4i1_to_v4i8(
33+
; CHECK: {
34+
; CHECK-NEXT: .reg .pred %p<5>;
35+
; CHECK-NEXT: .reg .b16 %rs<9>;
36+
; CHECK-NEXT: .reg .b32 %r<14>;
37+
; CHECK-NEXT: .reg .b64 %rd<2>;
38+
; CHECK-EMPTY:
39+
; CHECK-NEXT: // %bb.0: // %entry
40+
; CHECK-NEXT: ld.param.u64 %rd1, [sext_setcc_v4i1_to_v4i8_param_0];
41+
; CHECK-NEXT: ld.u32 %r1, [%rd1];
42+
; CHECK-NEXT: bfe.u32 %r2, %r1, 24, 8;
43+
; CHECK-NEXT: cvt.u16.u32 %rs1, %r2;
44+
; CHECK-NEXT: and.b16 %rs2, %rs1, 255;
45+
; CHECK-NEXT: setp.eq.s16 %p1, %rs2, 0;
46+
; CHECK-NEXT: bfe.u32 %r3, %r1, 16, 8;
47+
; CHECK-NEXT: cvt.u16.u32 %rs3, %r3;
48+
; CHECK-NEXT: and.b16 %rs4, %rs3, 255;
49+
; CHECK-NEXT: setp.eq.s16 %p2, %rs4, 0;
50+
; CHECK-NEXT: bfe.u32 %r4, %r1, 8, 8;
51+
; CHECK-NEXT: cvt.u16.u32 %rs5, %r4;
52+
; CHECK-NEXT: and.b16 %rs6, %rs5, 255;
53+
; CHECK-NEXT: setp.eq.s16 %p3, %rs6, 0;
54+
; CHECK-NEXT: bfe.u32 %r5, %r1, 0, 8;
55+
; CHECK-NEXT: cvt.u16.u32 %rs7, %r5;
56+
; CHECK-NEXT: and.b16 %rs8, %rs7, 255;
57+
; CHECK-NEXT: setp.eq.s16 %p4, %rs8, 0;
58+
; CHECK-NEXT: selp.s32 %r6, -1, 0, %p4;
59+
; CHECK-NEXT: selp.s32 %r7, -1, 0, %p3;
60+
; CHECK-NEXT: bfi.b32 %r8, %r7, %r6, 8, 8;
61+
; CHECK-NEXT: selp.s32 %r9, -1, 0, %p2;
62+
; CHECK-NEXT: bfi.b32 %r10, %r9, %r8, 16, 8;
63+
; CHECK-NEXT: selp.s32 %r11, -1, 0, %p1;
64+
; CHECK-NEXT: bfi.b32 %r12, %r11, %r10, 24, 8;
65+
; CHECK-NEXT: st.param.b32 [func_retval0+0], %r12;
66+
; CHECK-NEXT: ret;
67+
entry:
68+
%v = load <4 x i8>, ptr %p, align 4
69+
%cmp = icmp eq <4 x i8> %v, zeroinitializer
70+
%sext = sext <4 x i1> %cmp to <4 x i8>
71+
ret <4 x i8> %sext
72+
}

0 commit comments

Comments
 (0)