[NVPTX] Improve lowering of v2i16 logical ops. (llvm#67365)

Artem-B · web-flow · commit 671e2ba45bf3 · 2023-09-25T14:29:48.000-07:00
Bitwise logical ops can always be done as b32, regardless of
availability of other v2i16 ops, that would need a new GPU.

Includes the missing lowering for 2-argument register operation variants
and additional tests for `and`.
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -642,10 +642,9 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
   setI16x2OperationAction(ISD::UREM, MVT::v2i16, Legal, Custom);
 
   // Other arithmetic and logic ops are unsupported.
-  setOperationAction({ISD::AND, ISD::OR, ISD::XOR, ISD::SDIV, ISD::UDIV,
-                      ISD::SRA, ISD::SRL, ISD::MULHS, ISD::MULHU,
-                      ISD::FP_TO_SINT, ISD::FP_TO_UINT, ISD::SINT_TO_FP,
-                      ISD::UINT_TO_FP},
+  setOperationAction({ISD::SDIV, ISD::UDIV, ISD::SRA, ISD::SRL, ISD::MULHS,
+                      ISD::MULHU, ISD::FP_TO_SINT, ISD::FP_TO_UINT,
+                      ISD::SINT_TO_FP, ISD::UINT_TO_FP},
                      MVT::v2i16, Expand);
 
   setOperationAction(ISD::ADDC, MVT::i32, Legal);
diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -1486,6 +1486,24 @@ defm OR  : BITWISE<"or", or>;
 defm AND : BITWISE<"and", and>;
 defm XOR : BITWISE<"xor", xor>;
 
+// Lower logical v2i16 ops as bitwise ops on b32.
+def: Pat<(or (v2i16 Int32Regs:$a), (v2i16 Int32Regs:$b)),
+         (ORb32rr Int32Regs:$a, Int32Regs:$b)>;
+def: Pat<(xor (v2i16 Int32Regs:$a), (v2i16 Int32Regs:$b)),
+         (XORb32rr Int32Regs:$a, Int32Regs:$b)>;
+def: Pat<(and (v2i16 Int32Regs:$a), (v2i16 Int32Regs:$b)),
+         (ANDb32rr Int32Regs:$a, Int32Regs:$b)>;
+
+// The constants get legalized into a bitcast from i32, so that's what we need
+// to match here.
+def: Pat<(or Int32Regs:$a, (v2i16 (bitconvert (i32 imm:$b)))),
+         (ORb32ri Int32Regs:$a, imm:$b)>;
+def: Pat<(xor Int32Regs:$a, (v2i16 (bitconvert (i32 imm:$b)))),
+         (XORb32ri Int32Regs:$a, imm:$b)>;
+def: Pat<(and Int32Regs:$a, (v2i16 (bitconvert (i32 imm:$b)))),
+         (ANDb32ri Int32Regs:$a, imm:$b)>;
+
+
 def NOT1  : NVPTXInst<(outs Int1Regs:$dst), (ins Int1Regs:$src),
                       "not.pred \t$dst, $src;",
                       [(set Int1Regs:$dst, (not Int1Regs:$src))]>;
diff --git a/llvm/test/CodeGen/NVPTX/i16x2-instructions.ll b/llvm/test/CodeGen/NVPTX/i16x2-instructions.ll
@@ -235,6 +235,151 @@ define <2 x i16> @test_mul(<2 x i16> %a, <2 x i16> %b) #0 {
   ret <2 x i16> %r
 }
 
+;; Logical ops are available on all GPUs as regular 32-bit logical ops
+; COMMON-LABEL: test_or(
+; COMMON-DAG:  ld.param.u32    [[A:%r[0-9]+]], [test_or_param_0];
+; COMMON-DAG:  ld.param.u32    [[B:%r[0-9]+]], [test_or_param_1];
+; COMMON-NEXT: or.b32          [[R:%r[0-9]+]], [[A]], [[B]];
+; COMMON-NEXT: st.param.b32    [func_retval0+0], [[R]];
+; COMMON-NEXT: ret;
+define <2 x i16> @test_or(<2 x i16> %a, <2 x i16> %b) #0 {
+  %r = or <2 x i16> %a, %b
+  ret <2 x i16> %r
+}
+
+; Ops that operate on computed arguments go though a different lowering path.
+; compared to the ones that operate on loaded data. So we test them separately.
+; COMMON-LABEL: test_or_computed(
+; COMMON:        ld.param.u16    [[A:%rs[0-9+]]], [test_or_computed_param_0];
+; COMMON-DAG:    mov.u16         [[C0:%rs[0-9]+]], 0;
+; COMMON-DAG:    mov.b32         [[R1:%r[0-9]+]], {[[A]], [[C0]]};
+; COMMON-DAG:    mov.u16         [[C5:%rs[0-9]+]], 5;
+; COMMON-DAG:    mov.b32         [[R2:%r[0-9]+]], {[[A]], [[C5]]};
+; COMMON:        or.b32          [[R:%r[0-9]+]], [[R2]], [[R1]];
+; COMMON-NEXT:   st.param.b32    [func_retval0+0], [[R]];
+define <2 x i16> @test_or_computed(i16 %a) {
+  %ins.0 = insertelement <2 x i16> zeroinitializer, i16 %a, i32 0
+  %ins.1 = insertelement <2 x i16> %ins.0, i16 5, i32 1
+  %r = or <2 x i16> %ins.1, %ins.0
+  ret <2 x i16> %r
+}
+
+; Check that we can lower or with immediate arguments.
+; COMMON-LABEL: test_or_imm_0(
+; COMMON-DAG:  ld.param.u32    [[A:%r[0-9]+]], [test_or_imm_0_param_0];
+; COMMON-NEXT: or.b32          [[R:%r[0-9]+]], [[A]], 131073;
+; COMMON-NEXT: st.param.b32    [func_retval0+0], [[R]];
+; COMMON-NEXT: ret;
+define <2 x i16> @test_or_imm_0(<2 x i16> %a) #0 {
+  %r = or <2 x i16> <i16 1, i16 2>, %a
+  ret <2 x i16> %r
+}
+
+; COMMON-LABEL: test_or_imm_1(
+; COMMON-DAG:  ld.param.u32    [[B:%r[0-9]+]], [test_or_imm_1_param_0];
+; COMMON-NEXT: or.b32          [[R:%r[0-9]+]], [[A]], 131073;
+; COMMON-NEXT: st.param.b32    [func_retval0+0], [[R]];
+; COMMON-NEXT: ret;
+define <2 x i16> @test_or_imm_1(<2 x i16> %a) #0 {
+  %r = or <2 x i16> %a, <i16 1, i16 2>
+  ret <2 x i16> %r
+}
+
+; COMMON-LABEL: test_xor(
+; COMMON-DAG:  ld.param.u32    [[A:%r[0-9]+]], [test_xor_param_0];
+; COMMON-DAG:  ld.param.u32    [[B:%r[0-9]+]], [test_xor_param_1];
+; COMMON-NEXT: xor.b32         [[R:%r[0-9]+]], [[A]], [[B]];
+; COMMON-NEXT: st.param.b32    [func_retval0+0], [[R]];
+; COMMON-NEXT: ret;
+define <2 x i16> @test_xor(<2 x i16> %a, <2 x i16> %b) #0 {
+  %r = xor <2 x i16> %a, %b
+  ret <2 x i16> %r
+}
+
+; COMMON-LABEL: test_xor_computed(
+; COMMON:        ld.param.u16    [[A:%rs[0-9+]]], [test_xor_computed_param_0];
+; COMMON-DAG:    mov.u16         [[C0:%rs[0-9]+]], 0;
+; COMMON-DAG:    mov.b32         [[R1:%r[0-9]+]], {[[A]], [[C0]]};
+; COMMON-DAG:    mov.u16         [[C5:%rs[0-9]+]], 5;
+; COMMON-DAG:    mov.b32         [[R2:%r[0-9]+]], {[[A]], [[C5]]};
+; COMMON:        xor.b32         [[R:%r[0-9]+]], [[R2]], [[R1]];
+; COMMON-NEXT:   st.param.b32    [func_retval0+0], [[R]];
+define <2 x i16> @test_xor_computed(i16 %a) {
+  %ins.0 = insertelement <2 x i16> zeroinitializer, i16 %a, i32 0
+  %ins.1 = insertelement <2 x i16> %ins.0, i16 5, i32 1
+  %r = xor <2 x i16> %ins.1, %ins.0
+  ret <2 x i16> %r
+}
+
+; Check that we can lower xor with immediate arguments.
+; COMMON-LABEL: test_xor_imm_0(
+; COMMON-DAG:  ld.param.u32    [[A:%r[0-9]+]], [test_xor_imm_0_param_0];
+; COMMON-NEXT: xor.b32         [[R:%r[0-9]+]], [[A]], 131073;
+; COMMON-NEXT: st.param.b32    [func_retval0+0], [[R]];
+; COMMON-NEXT: ret;
+define <2 x i16> @test_xor_imm_0(<2 x i16> %a) #0 {
+  %r = xor <2 x i16> <i16 1, i16 2>, %a
+  ret <2 x i16> %r
+}
+
+; COMMON-LABEL: test_xor_imm_1(
+; COMMON-DAG:  ld.param.u32    [[B:%r[0-9]+]], [test_xor_imm_1_param_0];
+; COMMON-NEXT: xor.b32         [[R:%r[0-9]+]], [[A]], 131073;
+; COMMON-NEXT: st.param.b32    [func_retval0+0], [[R]];
+; COMMON-NEXT: ret;
+define <2 x i16> @test_xor_imm_1(<2 x i16> %a) #0 {
+  %r = xor <2 x i16> %a, <i16 1, i16 2>
+  ret <2 x i16> %r
+}
+
+; COMMON-LABEL: test_and(
+; COMMON-DAG:  ld.param.u32    [[A:%r[0-9]+]], [test_and_param_0];
+; COMMON-DAG:  ld.param.u32    [[B:%r[0-9]+]], [test_and_param_1];
+; COMMON-NEXT: and.b32          [[R:%r[0-9]+]], [[A]], [[B]];
+; COMMON-NEXT: st.param.b32    [func_retval0+0], [[R]];
+; COMMON-NEXT: ret;
+define <2 x i16> @test_and(<2 x i16> %a, <2 x i16> %b) #0 {
+  %r = and <2 x i16> %a, %b
+  ret <2 x i16> %r
+}
+
+; Ops that operate on computed arguments go though a different lowering path.
+; compared to the ones that operate on loaded data. So we test them separately.
+; COMMON-LABEL: test_and_computed(
+; COMMON:        ld.param.u16    [[A:%rs[0-9+]]], [test_and_computed_param_0];
+; COMMON-DAG:    mov.u16         [[C0:%rs[0-9]+]], 0;
+; COMMON-DAG:    mov.b32         [[R1:%r[0-9]+]], {[[A]], [[C0]]};
+; COMMON-DAG:    mov.u16         [[C5:%rs[0-9]+]], 5;
+; COMMON-DAG:    mov.b32         [[R2:%r[0-9]+]], {[[A]], [[C5]]};
+; COMMON:        and.b32          [[R:%r[0-9]+]], [[R2]], [[R1]];
+; COMMON-NEXT:   st.param.b32    [func_retval0+0], [[R]];
+define <2 x i16> @test_and_computed(i16 %a) {
+  %ins.0 = insertelement <2 x i16> zeroinitializer, i16 %a, i32 0
+  %ins.1 = insertelement <2 x i16> %ins.0, i16 5, i32 1
+  %r = and <2 x i16> %ins.1, %ins.0
+  ret <2 x i16> %r
+}
+
+; Check that we can lower and with immediate arguments.
+; COMMON-LABEL: test_and_imm_0(
+; COMMON-DAG:  ld.param.u32    [[A:%r[0-9]+]], [test_and_imm_0_param_0];
+; COMMON-NEXT: and.b32          [[R:%r[0-9]+]], [[A]], 131073;
+; COMMON-NEXT: st.param.b32    [func_retval0+0], [[R]];
+; COMMON-NEXT: ret;
+define <2 x i16> @test_and_imm_0(<2 x i16> %a) #0 {
+  %r = and <2 x i16> <i16 1, i16 2>, %a
+  ret <2 x i16> %r
+}
+
+; COMMON-LABEL: test_and_imm_1(
+; COMMON-DAG:  ld.param.u32    [[B:%r[0-9]+]], [test_and_imm_1_param_0];
+; COMMON-NEXT: and.b32          [[R:%r[0-9]+]], [[A]], 131073;
+; COMMON-NEXT: st.param.b32    [func_retval0+0], [[R]];
+; COMMON-NEXT: ret;
+define <2 x i16> @test_and_imm_1(<2 x i16> %a) #0 {
+  %r = and <2 x i16> %a, <i16 1, i16 2>
+  ret <2 x i16> %r
+}
 
 ; COMMON-LABEL: .func test_ldst_v2i16(
 ; COMMON-DAG:    ld.param.u64    [[A:%rd[0-9]+]], [test_ldst_v2i16_param_0];
diff --git a/llvm/test/CodeGen/NVPTX/unfold-masked-merge-vector-variablemask.ll b/llvm/test/CodeGen/NVPTX/unfold-masked-merge-vector-variablemask.ll