Skip to content

[NVPTX] unbreak extract_elt lowering #102688

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Aug 12, 2024
Merged

Conversation

Artem-B
Copy link
Member

@Artem-B Artem-B commented Aug 9, 2024

LLVM has started using the `freeze` instruction, and that unintentionally broke the lowering of some vector operations in NVPTX.

LLVM has started using the `freeze` instruction, and that unintentionally broke
the lowering of some vector operations in NVPTX.
@Artem-B Artem-B requested a review from akuegel August 9, 2024 21:54
@llvmbot
Copy link
Member

llvmbot commented Aug 9, 2024

@llvm/pr-subscribers-backend-nvptx

Author: Artem Belevich (Artem-B)

Changes

LLVM has started using the `freeze` instruction, and that unintentionally broke the lowering of some vector operations in NVPTX.


Full diff: https://github.com/llvm/llvm-project/pull/102688.diff

2 Files Affected:

  • (modified) llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp (+2)
  • (modified) llvm/test/CodeGen/NVPTX/extractelement.ll (+127-53)
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index 059cfff1f7e69..43a3fbf4d1306 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -5920,6 +5920,8 @@ static SDValue PerformSETCCCombine(SDNode *N,
 static SDValue PerformEXTRACTCombine(SDNode *N,
                                      TargetLowering::DAGCombinerInfo &DCI) {
   SDValue Vector = N->getOperand(0);
+  if (Vector->getOpcode() == ISD::FREEZE)
+    Vector = Vector->getOperand(0);
   SDLoc DL(N);
   EVT VectorVT = Vector.getValueType();
   if (Vector->getOpcode() == ISD::LOAD && VectorVT.isSimple() &&
diff --git a/llvm/test/CodeGen/NVPTX/extractelement.ll b/llvm/test/CodeGen/NVPTX/extractelement.ll
index 92d00f881cc26..367c20749a9f3 100644
--- a/llvm/test/CodeGen/NVPTX/extractelement.ll
+++ b/llvm/test/CodeGen/NVPTX/extractelement.ll
@@ -1,12 +1,23 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
 ; RUN: llc < %s -march=nvptx64 -mcpu=sm_35 -verify-machineinstrs | FileCheck %s
 ; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_35 | %ptxas-verify %}
+target triple = "nvptx64-nvidia-cuda"
 
 
-; CHECK-LABEL: test_v2i8
-; CHECK-DAG:        ld.param.u16    [[A:%rs[0-9]+]], [test_v2i8_param_0];
-; CHECK-DAG:        cvt.s16.s8      [[E0:%rs[0-9]+]], [[A]];
-; CHECK-DAG:        shr.s16         [[E1:%rs[0-9]+]], [[A]], 8;
 define i16  @test_v2i8(i16 %a) {
+; CHECK-LABEL: test_v2i8(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<5>;
+; CHECK-NEXT:    .reg .b32 %r<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.u16 %rs1, [test_v2i8_param_0];
+; CHECK-NEXT:    cvt.s16.s8 %rs2, %rs1;
+; CHECK-NEXT:    shr.s16 %rs3, %rs1, 8;
+; CHECK-NEXT:    add.s16 %rs4, %rs2, %rs3;
+; CHECK-NEXT:    cvt.u32.u16 %r1, %rs4;
+; CHECK-NEXT:    st.param.b32 [func_retval0+0], %r1;
+; CHECK-NEXT:    ret;
   %v = bitcast i16 %a to <2 x i8>
   %r0 = extractelement <2 x i8> %v, i64 0
   %r1 = extractelement <2 x i8> %v, i64 1
@@ -16,17 +27,53 @@ define i16  @test_v2i8(i16 %a) {
   ret i16 %r01
 }
 
-; CHECK-LABEL: test_v4i8
-; CHECK:            ld.param.u32    [[R:%r[0-9]+]], [test_v4i8_param_0];
-; CHECK-DAG:        bfe.s32         [[R0:%r[0-9]+]], [[R]], 0, 8;
-; CHECK-DAG:        cvt.s8.s32      [[E0:%rs[0-9]+]], [[R0]];
-; CHECK-DAG:        bfe.s32         [[R1:%r[0-9]+]], [[R]], 8, 8;
-; CHECK-DAG:        cvt.s8.s32      [[E1:%rs[0-9]+]], [[R1]];
-; CHECK-DAG:        bfe.s32         [[R2:%r[0-9]+]], [[R]], 16, 8;
-; CHECK-DAG:        cvt.s8.s32      [[E2:%rs[0-9]+]], [[R2]];
-; CHECK-DAG:        bfe.s32         [[R3:%r[0-9]+]], [[R]], 24, 8;
-; CHECK-DAG:        cvt.s8.s32      [[E3:%rs[0-9]+]], [[R3]];
+define i1  @test_v2i8_load(ptr %a) {
+; CHECK-LABEL: test_v2i8_load(
+; CHECK:       {
+; CHECK-NEXT:    .reg .pred %p<2>;
+; CHECK-NEXT:    .reg .b16 %rs<7>;
+; CHECK-NEXT:    .reg .b32 %r<2>;
+; CHECK-NEXT:    .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.u64 %rd1, [test_v2i8_load_param_0];
+; CHECK-NEXT:    ld.v2.u8 {%rs1, %rs2}, [%rd1];
+; CHECK-NEXT:    or.b16 %rs5, %rs1, %rs2;
+; CHECK-NEXT:    and.b16 %rs6, %rs5, 255;
+; CHECK-NEXT:    setp.eq.s16 %p1, %rs6, 0;
+; CHECK-NEXT:    selp.u32 %r1, 1, 0, %p1;
+; CHECK-NEXT:    st.param.b32 [func_retval0+0], %r1;
+; CHECK-NEXT:    ret;
+  %v = load <2 x i8>, ptr %a, align 4
+  %r0 = extractelement <2 x i8> %v, i64 0
+  %r1 = extractelement <2 x i8> %v, i64 1
+  %icmp = icmp eq i8 %r0, 0
+  %icmp3 = icmp eq i8 %r1, 0
+  %select = select i1 %icmp, i1 %icmp3, i1 false
+  ret i1 %select
+}
 define i16  @test_v4i8(i32 %a) {
+; CHECK-LABEL: test_v4i8(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<8>;
+; CHECK-NEXT:    .reg .b32 %r<7>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.u32 %r1, [test_v4i8_param_0];
+; CHECK-NEXT:    bfe.s32 %r2, %r1, 0, 8;
+; CHECK-NEXT:    cvt.s8.s32 %rs1, %r2;
+; CHECK-NEXT:    bfe.s32 %r3, %r1, 8, 8;
+; CHECK-NEXT:    cvt.s8.s32 %rs2, %r3;
+; CHECK-NEXT:    bfe.s32 %r4, %r1, 16, 8;
+; CHECK-NEXT:    cvt.s8.s32 %rs3, %r4;
+; CHECK-NEXT:    bfe.s32 %r5, %r1, 24, 8;
+; CHECK-NEXT:    cvt.s8.s32 %rs4, %r5;
+; CHECK-NEXT:    add.s16 %rs5, %rs1, %rs2;
+; CHECK-NEXT:    add.s16 %rs6, %rs3, %rs4;
+; CHECK-NEXT:    add.s16 %rs7, %rs5, %rs6;
+; CHECK-NEXT:    cvt.u32.u16 %r6, %rs7;
+; CHECK-NEXT:    st.param.b32 [func_retval0+0], %r6;
+; CHECK-NEXT:    ret;
   %v = bitcast i32 %a to <4 x i8>
   %r0 = extractelement <4 x i8> %v, i64 0
   %r1 = extractelement <4 x i8> %v, i64 1
@@ -42,16 +89,22 @@ define i16  @test_v4i8(i32 %a) {
   ret i16 %r
 }
 
-; CHECK-LABEL: test_v4i8_s32
-; CHECK:            ld.param.u32    [[R:%r[0-9]+]], [test_v4i8_s32_param_0];
-; CHECK-DAG:        bfe.s32         [[R0:%r[0-9]+]], [[R]], 0, 8;
-; CHECK-DAG:        bfe.s32         [[R1:%r[0-9]+]], [[R]], 8, 8;
-; CHECK-DAG:        bfe.s32         [[R2:%r[0-9]+]], [[R]], 16, 8;
-; CHECK-DAG:        bfe.s32         [[R3:%r[0-9]+]], [[R]], 24, 8;
-; CHECK-DAG:        add.s32         [[R01:%r[0-9]+]], [[R0]], [[R1]]
-; CHECK-DAG:        add.s32         [[R23:%r[0-9]+]], [[R2]], [[R3]]
-; CHECK-DAG:        add.s32         [[R0123:%r[0-9]+]], [[R01]], [[R23]]
 define i32  @test_v4i8_s32(i32 %a) {
+; CHECK-LABEL: test_v4i8_s32(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b32 %r<9>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.u32 %r1, [test_v4i8_s32_param_0];
+; CHECK-NEXT:    bfe.s32 %r2, %r1, 0, 8;
+; CHECK-NEXT:    bfe.s32 %r3, %r1, 8, 8;
+; CHECK-NEXT:    bfe.s32 %r4, %r1, 16, 8;
+; CHECK-NEXT:    bfe.s32 %r5, %r1, 24, 8;
+; CHECK-NEXT:    add.s32 %r6, %r2, %r3;
+; CHECK-NEXT:    add.s32 %r7, %r4, %r5;
+; CHECK-NEXT:    add.s32 %r8, %r6, %r7;
+; CHECK-NEXT:    st.param.b32 [func_retval0+0], %r8;
+; CHECK-NEXT:    ret;
   %v = bitcast i32 %a to <4 x i8>
   %r0 = extractelement <4 x i8> %v, i64 0
   %r1 = extractelement <4 x i8> %v, i64 1
@@ -67,16 +120,22 @@ define i32  @test_v4i8_s32(i32 %a) {
   ret i32 %r
 }
 
-; CHECK-LABEL: test_v4i8_u32
-; CHECK:            ld.param.u32    [[R:%r[0-9]+]], [test_v4i8_u32_param_0];
-; CHECK-DAG:        bfe.u32         [[R0:%r[0-9]+]], [[R]], 0, 8;
-; CHECK-DAG:        bfe.u32         [[R1:%r[0-9]+]], [[R]], 8, 8;
-; CHECK-DAG:        bfe.u32         [[R2:%r[0-9]+]], [[R]], 16, 8;
-; CHECK-DAG:        bfe.u32         [[R3:%r[0-9]+]], [[R]], 24, 8;
-; CHECK-DAG:        add.s32         [[R01:%r[0-9]+]], [[R0]], [[R1]]
-; CHECK-DAG:        add.s32         [[R23:%r[0-9]+]], [[R2]], [[R3]]
-; CHECK-DAG:        add.s32         [[R0123:%r[0-9]+]], [[R01]], [[R23]]
 define i32  @test_v4i8_u32(i32 %a) {
+; CHECK-LABEL: test_v4i8_u32(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b32 %r<9>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.u32 %r1, [test_v4i8_u32_param_0];
+; CHECK-NEXT:    bfe.u32 %r2, %r1, 0, 8;
+; CHECK-NEXT:    bfe.u32 %r3, %r1, 8, 8;
+; CHECK-NEXT:    bfe.u32 %r4, %r1, 16, 8;
+; CHECK-NEXT:    bfe.u32 %r5, %r1, 24, 8;
+; CHECK-NEXT:    add.s32 %r6, %r2, %r3;
+; CHECK-NEXT:    add.s32 %r7, %r4, %r5;
+; CHECK-NEXT:    add.s32 %r8, %r6, %r7;
+; CHECK-NEXT:    st.param.b32 [func_retval0+0], %r8;
+; CHECK-NEXT:    ret;
   %v = bitcast i32 %a to <4 x i8>
   %r0 = extractelement <4 x i8> %v, i64 0
   %r1 = extractelement <4 x i8> %v, i64 1
@@ -94,28 +153,43 @@ define i32  @test_v4i8_u32(i32 %a) {
 
 
 
-; CHECK-LABEL: test_v8i8
-; CHECK:       ld.param.u64    [[R:%rd[0-9]+]], [test_v8i8_param_0];
-; CHECK-DAG:        cvt.u32.u64     [[R00:%r[0-9]+]], [[R]];
-; CHECK-DAG:        { .reg .b32 tmp; mov.b64 {tmp, [[R01:%r[0-9]+]]}, [[R]]; }
-; CHECK-DAG:        bfe.s32         [[R1:%r[0-9]+]], [[R00]], 0, 8;
-; CHECK-DAG:        cvt.s8.s32      [[E1:%rs[0-9]+]], [[R1]];
-; CHECK-DAG:        bfe.s32         [[R2:%r[0-9]+]], [[R00]], 8, 8;
-; CHECK-DAG:        cvt.s8.s32      [[E2:%rs[0-9]+]], [[R2]];
-; CHECK-DAG:        bfe.s32         [[R3:%r[0-9]+]], [[R00]], 16, 8;
-; CHECK-DAG:        cvt.s8.s32      [[E3:%rs[0-9]+]], [[R3]];
-; CHECK-DAG:        bfe.s32         [[R4:%r[0-9]+]], [[R00]], 24, 8;
-; CHECK-DAG:        cvt.s8.s32      [[E4:%rs[0-9]+]], [[R4]];
-; CHECK-DAG:        bfe.s32         [[R5:%r[0-9]+]], [[R01]], 0, 8;
-; CHECK-DAG:        cvt.s8.s32      [[E5:%rs[0-9]+]], [[R5]];
-; CHECK-DAG:        bfe.s32         [[R6:%r[0-9]+]], [[R01]], 8, 8;
-; CHECK-DAG:        cvt.s8.s32      [[E6:%rs[0-9]+]], [[R6]];
-; CHECK-DAG:        bfe.s32         [[R7:%r[0-9]+]], [[R01]], 16, 8;
-; CHECK-DAG:        cvt.s8.s32      [[E7:%rs[0-9]+]], [[R7]];
-; CHECK-DAG:        bfe.s32         [[R8:%r[0-9]+]], [[R01]], 24, 8;
-; CHECK-DAG:        cvt.s8.s32      [[E8:%rs[0-9]+]], [[R8]];
-
 define i16  @test_v8i8(i64 %a) {
+; CHECK-LABEL: test_v8i8(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<16>;
+; CHECK-NEXT:    .reg .b32 %r<14>;
+; CHECK-NEXT:    .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.u64 %rd1, [test_v8i8_param_0];
+; CHECK-NEXT:    cvt.u32.u64 %r1, %rd1;
+; CHECK-NEXT:    { .reg .b32 tmp; mov.b64 {tmp, %r2}, %rd1; }
+; CHECK-NEXT:    bfe.s32 %r5, %r1, 0, 8;
+; CHECK-NEXT:    cvt.s8.s32 %rs1, %r5;
+; CHECK-NEXT:    bfe.s32 %r6, %r1, 8, 8;
+; CHECK-NEXT:    cvt.s8.s32 %rs2, %r6;
+; CHECK-NEXT:    bfe.s32 %r7, %r1, 16, 8;
+; CHECK-NEXT:    cvt.s8.s32 %rs3, %r7;
+; CHECK-NEXT:    bfe.s32 %r8, %r1, 24, 8;
+; CHECK-NEXT:    cvt.s8.s32 %rs4, %r8;
+; CHECK-NEXT:    bfe.s32 %r9, %r2, 0, 8;
+; CHECK-NEXT:    cvt.s8.s32 %rs5, %r9;
+; CHECK-NEXT:    bfe.s32 %r10, %r2, 8, 8;
+; CHECK-NEXT:    cvt.s8.s32 %rs6, %r10;
+; CHECK-NEXT:    bfe.s32 %r11, %r2, 16, 8;
+; CHECK-NEXT:    cvt.s8.s32 %rs7, %r11;
+; CHECK-NEXT:    bfe.s32 %r12, %r2, 24, 8;
+; CHECK-NEXT:    cvt.s8.s32 %rs8, %r12;
+; CHECK-NEXT:    add.s16 %rs9, %rs1, %rs2;
+; CHECK-NEXT:    add.s16 %rs10, %rs3, %rs4;
+; CHECK-NEXT:    add.s16 %rs11, %rs5, %rs6;
+; CHECK-NEXT:    add.s16 %rs12, %rs7, %rs8;
+; CHECK-NEXT:    add.s16 %rs13, %rs9, %rs10;
+; CHECK-NEXT:    add.s16 %rs14, %rs11, %rs12;
+; CHECK-NEXT:    add.s16 %rs15, %rs13, %rs14;
+; CHECK-NEXT:    cvt.u32.u16 %r13, %rs15;
+; CHECK-NEXT:    st.param.b32 [func_retval0+0], %r13;
+; CHECK-NEXT:    ret;
   %v = bitcast i64 %a to <8 x i8>
   %r0 = extractelement <8 x i8> %v, i64 0
   %r1 = extractelement <8 x i8> %v, i64 1

@Artem-B Artem-B requested a review from AlexMaclean August 9, 2024 21:58
Copy link
Member

@AlexMaclean AlexMaclean left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

LGTM

@Artem-B Artem-B merged commit fe7d284 into llvm:main Aug 12, 2024
8 of 10 checks passed
@Artem-B Artem-B deleted the freeze-extract_elt branch August 12, 2024 17:31
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Projects
None yet
Development

Successfully merging this pull request may close these issues.

4 participants