Skip to content

[GlobalISel] Add support to moreElementsVector for G_SEXT, G_ZEXT and G_ANYEXT #85038

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Conversation

dc03-work
Copy link
Contributor

No description provided.

Created using spr 1.3.5
@llvmbot
Copy link
Member

llvmbot commented Mar 13, 2024

@llvm/pr-subscribers-backend-aarch64

@llvm/pr-subscribers-llvm-globalisel

Author: Dhruv Chawla (work) (dc03-work)

Changes

Patch is 47.62 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/85038.diff

5 Files Affected:

  • (modified) llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp (+20)
  • (modified) llvm/test/CodeGen/AArch64/fcmp.ll (+37-44)
  • (modified) llvm/test/CodeGen/AArch64/sext.ll (+6-8)
  • (modified) llvm/test/CodeGen/AArch64/vecreduce-add.ll (+708-101)
  • (modified) llvm/test/CodeGen/AArch64/zext.ll (+8-10)
diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
index bd3ff7265d51f9..a480c290907761 100644
--- a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
@@ -5496,6 +5496,26 @@ LegalizerHelper::moreElementsVector(MachineInstr &MI, unsigned TypeIdx,
     return Legalized;
   }
 
+  case TargetOpcode::G_SEXT:
+  case TargetOpcode::G_ZEXT:
+  case TargetOpcode::G_ANYEXT: {
+    LLT SrcTy = MRI.getType(MI.getOperand(1).getReg());
+    LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
+    if (TypeIdx == 0) {
+      DstTy = MoreTy;
+      SrcTy = MoreTy.changeElementType(SrcTy.getElementType());
+    } else if (TypeIdx == 1) {
+      SrcTy = MoreTy;
+      DstTy = MoreTy.changeElementType(DstTy.getElementType());
+    }
+
+    Observer.changingInstr(MI);
+    moreElementsVectorSrc(MI, SrcTy, 1);
+    moreElementsVectorDst(MI, DstTy, 0);
+    Observer.changedInstr(MI);
+    return Legalized;
+  }
+
   default:
     return UnableToLegalize;
   }
diff --git a/llvm/test/CodeGen/AArch64/fcmp.ll b/llvm/test/CodeGen/AArch64/fcmp.ll
index 2d0b5574cdd7ba..9916aeeab1cad1 100644
--- a/llvm/test/CodeGen/AArch64/fcmp.ll
+++ b/llvm/test/CodeGen/AArch64/fcmp.ll
@@ -1108,61 +1108,54 @@ define <7 x i32> @v7f16_i32(<7 x half> %a, <7 x half> %b, <7 x i32> %d, <7 x i32
 ;
 ; CHECK-GI-FP16-LABEL: v7f16_i32:
 ; CHECK-GI-FP16:       // %bb.0: // %entry
-; CHECK-GI-FP16-NEXT:    fcmgt v1.8h, v1.8h, v0.8h
-; CHECK-GI-FP16-NEXT:    mov w12, #31 // =0x1f
-; CHECK-GI-FP16-NEXT:    ldr s4, [sp]
-; CHECK-GI-FP16-NEXT:    fmov s2, w12
+; CHECK-GI-FP16-NEXT:    fcmgt v0.8h, v1.8h, v0.8h
+; CHECK-GI-FP16-NEXT:    mov w10, #31 // =0x1f
+; CHECK-GI-FP16-NEXT:    ldr s3, [sp]
+; CHECK-GI-FP16-NEXT:    fmov s1, w10
 ; CHECK-GI-FP16-NEXT:    fmov s6, w0
-; CHECK-GI-FP16-NEXT:    ldr s5, [sp, #8]
+; CHECK-GI-FP16-NEXT:    ldr s4, [sp, #8]
 ; CHECK-GI-FP16-NEXT:    ldr s7, [sp, #24]
 ; CHECK-GI-FP16-NEXT:    ldr s16, [sp, #32]
-; CHECK-GI-FP16-NEXT:    umov w9, v1.h[4]
-; CHECK-GI-FP16-NEXT:    umov w8, v1.h[0]
-; CHECK-GI-FP16-NEXT:    umov w11, v1.h[5]
-; CHECK-GI-FP16-NEXT:    umov w10, v1.h[1]
-; CHECK-GI-FP16-NEXT:    mov v2.s[1], w12
-; CHECK-GI-FP16-NEXT:    umov w13, v1.h[2]
+; CHECK-GI-FP16-NEXT:    umov w8, v0.h[4]
+; CHECK-GI-FP16-NEXT:    umov w9, v0.h[5]
+; CHECK-GI-FP16-NEXT:    mov v1.s[1], w10
 ; CHECK-GI-FP16-NEXT:    mov v6.s[1], w1
 ; CHECK-GI-FP16-NEXT:    mov v7.s[1], v16.s[0]
 ; CHECK-GI-FP16-NEXT:    ldr s16, [sp, #40]
-; CHECK-GI-FP16-NEXT:    fmov s3, w9
-; CHECK-GI-FP16-NEXT:    fmov s0, w8
-; CHECK-GI-FP16-NEXT:    umov w8, v1.h[6]
-; CHECK-GI-FP16-NEXT:    mov v2.s[2], w12
-; CHECK-GI-FP16-NEXT:    umov w9, v1.h[3]
+; CHECK-GI-FP16-NEXT:    fmov s2, w8
+; CHECK-GI-FP16-NEXT:    umov w8, v0.h[6]
+; CHECK-GI-FP16-NEXT:    mov v1.s[2], w10
+; CHECK-GI-FP16-NEXT:    ushll v0.4s, v0.4h, #0
 ; CHECK-GI-FP16-NEXT:    mov v6.s[2], w2
 ; CHECK-GI-FP16-NEXT:    mov v7.s[2], v16.s[0]
-; CHECK-GI-FP16-NEXT:    mov v3.s[1], w11
-; CHECK-GI-FP16-NEXT:    mov v0.s[1], w10
-; CHECK-GI-FP16-NEXT:    mov w10, #-1 // =0xffffffff
-; CHECK-GI-FP16-NEXT:    fmov s1, w10
-; CHECK-GI-FP16-NEXT:    neg v17.4s, v2.4s
+; CHECK-GI-FP16-NEXT:    mov v2.s[1], w9
+; CHECK-GI-FP16-NEXT:    mov w9, #-1 // =0xffffffff
+; CHECK-GI-FP16-NEXT:    fmov s5, w9
+; CHECK-GI-FP16-NEXT:    neg v17.4s, v1.4s
+; CHECK-GI-FP16-NEXT:    shl v0.4s, v0.4s, #31
 ; CHECK-GI-FP16-NEXT:    mov v6.s[3], w3
+; CHECK-GI-FP16-NEXT:    mov v2.s[2], w8
+; CHECK-GI-FP16-NEXT:    fmov w8, s3
+; CHECK-GI-FP16-NEXT:    fmov s3, w7
+; CHECK-GI-FP16-NEXT:    mov v5.s[1], w9
+; CHECK-GI-FP16-NEXT:    sshr v0.4s, v0.4s, #31
+; CHECK-GI-FP16-NEXT:    mov v3.s[1], w8
+; CHECK-GI-FP16-NEXT:    fmov w8, s4
+; CHECK-GI-FP16-NEXT:    ldr s4, [sp, #16]
+; CHECK-GI-FP16-NEXT:    ushl v1.4s, v2.4s, v1.4s
+; CHECK-GI-FP16-NEXT:    fmov s2, w4
+; CHECK-GI-FP16-NEXT:    mov v5.s[2], w9
+; CHECK-GI-FP16-NEXT:    mov v2.s[1], w5
 ; CHECK-GI-FP16-NEXT:    mov v3.s[2], w8
+; CHECK-GI-FP16-NEXT:    sshl v1.4s, v1.4s, v17.4s
 ; CHECK-GI-FP16-NEXT:    fmov w8, s4
-; CHECK-GI-FP16-NEXT:    fmov s4, w7
-; CHECK-GI-FP16-NEXT:    mov v0.s[2], w13
-; CHECK-GI-FP16-NEXT:    mov v1.s[1], w10
-; CHECK-GI-FP16-NEXT:    mov v4.s[1], w8
-; CHECK-GI-FP16-NEXT:    fmov w8, s5
-; CHECK-GI-FP16-NEXT:    ldr s5, [sp, #16]
-; CHECK-GI-FP16-NEXT:    ushl v2.4s, v3.4s, v2.4s
-; CHECK-GI-FP16-NEXT:    fmov s3, w4
-; CHECK-GI-FP16-NEXT:    mov v0.s[3], w9
-; CHECK-GI-FP16-NEXT:    mov v1.s[2], w10
-; CHECK-GI-FP16-NEXT:    mov v3.s[1], w5
-; CHECK-GI-FP16-NEXT:    mov v4.s[2], w8
-; CHECK-GI-FP16-NEXT:    sshl v2.4s, v2.4s, v17.4s
-; CHECK-GI-FP16-NEXT:    fmov w8, s5
-; CHECK-GI-FP16-NEXT:    shl v0.4s, v0.4s, #31
-; CHECK-GI-FP16-NEXT:    eor v1.16b, v2.16b, v1.16b
-; CHECK-GI-FP16-NEXT:    mov v3.s[2], w6
-; CHECK-GI-FP16-NEXT:    mov v4.s[3], w8
-; CHECK-GI-FP16-NEXT:    sshr v0.4s, v0.4s, #31
-; CHECK-GI-FP16-NEXT:    and v1.16b, v7.16b, v1.16b
-; CHECK-GI-FP16-NEXT:    and v2.16b, v3.16b, v2.16b
-; CHECK-GI-FP16-NEXT:    bsl v0.16b, v6.16b, v4.16b
-; CHECK-GI-FP16-NEXT:    orr v1.16b, v2.16b, v1.16b
+; CHECK-GI-FP16-NEXT:    eor v4.16b, v1.16b, v5.16b
+; CHECK-GI-FP16-NEXT:    mov v2.s[2], w6
+; CHECK-GI-FP16-NEXT:    mov v3.s[3], w8
+; CHECK-GI-FP16-NEXT:    and v1.16b, v2.16b, v1.16b
+; CHECK-GI-FP16-NEXT:    and v2.16b, v7.16b, v4.16b
+; CHECK-GI-FP16-NEXT:    bsl v0.16b, v6.16b, v3.16b
+; CHECK-GI-FP16-NEXT:    orr v1.16b, v1.16b, v2.16b
 ; CHECK-GI-FP16-NEXT:    mov s2, v0.s[1]
 ; CHECK-GI-FP16-NEXT:    mov s3, v0.s[2]
 ; CHECK-GI-FP16-NEXT:    mov s4, v0.s[3]
diff --git a/llvm/test/CodeGen/AArch64/sext.ll b/llvm/test/CodeGen/AArch64/sext.ll
index 61f04fbf0484f7..3e0d5dd875097f 100644
--- a/llvm/test/CodeGen/AArch64/sext.ll
+++ b/llvm/test/CodeGen/AArch64/sext.ll
@@ -280,13 +280,12 @@ define <3 x i64> @sext_v3i8_v3i64(<3 x i8> %a) {
 ;
 ; CHECK-GI-LABEL: sext_v3i8_v3i64:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    // kill: def $w0 killed $w0 def $x0
-; CHECK-GI-NEXT:    fmov d0, x0
-; CHECK-GI-NEXT:    // kill: def $w1 killed $w1 def $x1
+; CHECK-GI-NEXT:    fmov s0, w0
 ; CHECK-GI-NEXT:    // kill: def $w2 killed $w2 def $x2
 ; CHECK-GI-NEXT:    sxtb x8, w2
 ; CHECK-GI-NEXT:    fmov d2, x8
-; CHECK-GI-NEXT:    mov v0.d[1], x1
+; CHECK-GI-NEXT:    mov v0.s[1], w1
+; CHECK-GI-NEXT:    ushll v0.2d, v0.2s, #0
 ; CHECK-GI-NEXT:    shl v0.2d, v0.2d, #56
 ; CHECK-GI-NEXT:    sshr v0.2d, v0.2d, #56
 ; CHECK-GI-NEXT:    mov d1, v0.d[1]
@@ -444,13 +443,12 @@ define <3 x i64> @sext_v3i10_v3i64(<3 x i10> %a) {
 ;
 ; CHECK-GI-LABEL: sext_v3i10_v3i64:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    // kill: def $w0 killed $w0 def $x0
-; CHECK-GI-NEXT:    fmov d0, x0
-; CHECK-GI-NEXT:    // kill: def $w1 killed $w1 def $x1
+; CHECK-GI-NEXT:    fmov s0, w0
 ; CHECK-GI-NEXT:    // kill: def $w2 killed $w2 def $x2
 ; CHECK-GI-NEXT:    sbfx x8, x2, #0, #10
 ; CHECK-GI-NEXT:    fmov d2, x8
-; CHECK-GI-NEXT:    mov v0.d[1], x1
+; CHECK-GI-NEXT:    mov v0.s[1], w1
+; CHECK-GI-NEXT:    ushll v0.2d, v0.2s, #0
 ; CHECK-GI-NEXT:    shl v0.2d, v0.2d, #54
 ; CHECK-GI-NEXT:    sshr v0.2d, v0.2d, #54
 ; CHECK-GI-NEXT:    mov d1, v0.d[1]
diff --git a/llvm/test/CodeGen/AArch64/vecreduce-add.ll b/llvm/test/CodeGen/AArch64/vecreduce-add.ll
index 66b49466cc7361..94ab173e9183ac 100644
--- a/llvm/test/CodeGen/AArch64/vecreduce-add.ll
+++ b/llvm/test/CodeGen/AArch64/vecreduce-add.ll
@@ -4,11 +4,6 @@
 ; RUN: llc -mtriple=aarch64-none-linux-gnu -global-isel -global-isel-abort=2 %s -o - 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI,CHECK-GI-BASE
 ; RUN: llc -mtriple=aarch64-none-linux-gnu -global-isel -global-isel-abort=2 %s -o - -mattr=+dotprod 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI,CHECK-GI-DOT
 
-; CHECK-GI-BASE:        warning: Instruction selection used fallback path for test_udot_v24i8
-; CHECK-GI-BASE-NEXT:   warning: Instruction selection used fallback path for test_udot_v48i8
-; CHECK-GI-BASE-NEXT:   warning: Instruction selection used fallback path for test_sdot_v24i8
-; CHECK-GI-BASE-NEXT:   warning: Instruction selection used fallback path for test_sdot_v48i8
-
 define i32 @addv_v2i32(<2 x i32> %a) {
 ; CHECK-LABEL: addv_v2i32:
 ; CHECK:       // %bb.0: // %entry
@@ -2068,25 +2063,125 @@ define i32 @test_udot_v24i8(ptr %p1, ptr %p2) {
 ;
 ; CHECK-GI-BASE-LABEL: test_udot_v24i8:
 ; CHECK-GI-BASE:       // %bb.0: // %entry
-; CHECK-GI-BASE-NEXT:    ldr q0, [x0]
-; CHECK-GI-BASE-NEXT:    ldr q1, [x1]
-; CHECK-GI-BASE-NEXT:    ldr d4, [x0, #16]
-; CHECK-GI-BASE-NEXT:    ldr d5, [x1, #16]
-; CHECK-GI-BASE-NEXT:    ushll v2.8h, v0.8b, #0
-; CHECK-GI-BASE-NEXT:    ushll v3.8h, v1.8b, #0
-; CHECK-GI-BASE-NEXT:    ushll2 v0.8h, v0.16b, #0
-; CHECK-GI-BASE-NEXT:    ushll2 v1.8h, v1.16b, #0
-; CHECK-GI-BASE-NEXT:    umull v6.4s, v3.4h, v2.4h
-; CHECK-GI-BASE-NEXT:    umull2 v2.4s, v3.8h, v2.8h
-; CHECK-GI-BASE-NEXT:    ushll v3.8h, v4.8b, #0
-; CHECK-GI-BASE-NEXT:    ushll v4.8h, v5.8b, #0
-; CHECK-GI-BASE-NEXT:    umlal2 v2.4s, v4.8h, v3.8h
-; CHECK-GI-BASE-NEXT:    umlal v6.4s, v4.4h, v3.4h
-; CHECK-GI-BASE-NEXT:    umlal2 v2.4s, v1.8h, v0.8h
-; CHECK-GI-BASE-NEXT:    umlal v6.4s, v1.4h, v0.4h
-; CHECK-GI-BASE-NEXT:    add v0.4s, v6.4s, v2.4s
+; CHECK-GI-BASE-NEXT:    ldr b0, [x0]
+; CHECK-GI-BASE-NEXT:    ldr b19, [x0, #1]
+; CHECK-GI-BASE-NEXT:    ldr b1, [x0, #8]
+; CHECK-GI-BASE-NEXT:    ldr b18, [x0, #9]
+; CHECK-GI-BASE-NEXT:    ldr b2, [x0, #16]
+; CHECK-GI-BASE-NEXT:    ldr b16, [x0, #17]
+; CHECK-GI-BASE-NEXT:    mov v0.b[1], v19.b[0]
+; CHECK-GI-BASE-NEXT:    ldr b3, [x1]
+; CHECK-GI-BASE-NEXT:    ldr b17, [x1, #1]
+; CHECK-GI-BASE-NEXT:    mov v1.b[1], v18.b[0]
+; CHECK-GI-BASE-NEXT:    mov v2.b[1], v16.b[0]
+; CHECK-GI-BASE-NEXT:    ldr b4, [x1, #8]
+; CHECK-GI-BASE-NEXT:    ldr b7, [x1, #9]
+; CHECK-GI-BASE-NEXT:    mov v3.b[1], v17.b[0]
+; CHECK-GI-BASE-NEXT:    ldr b5, [x1, #16]
+; CHECK-GI-BASE-NEXT:    ldr b6, [x1, #17]
+; CHECK-GI-BASE-NEXT:    ldr b16, [x1, #2]
+; CHECK-GI-BASE-NEXT:    ldr b17, [x0, #18]
+; CHECK-GI-BASE-NEXT:    mov v4.b[1], v7.b[0]
+; CHECK-GI-BASE-NEXT:    ldr b7, [x0, #10]
+; CHECK-GI-BASE-NEXT:    ldr b18, [x1, #10]
+; CHECK-GI-BASE-NEXT:    mov v5.b[1], v6.b[0]
+; CHECK-GI-BASE-NEXT:    mov v2.b[2], v17.b[0]
+; CHECK-GI-BASE-NEXT:    ldr b6, [x0, #2]
+; CHECK-GI-BASE-NEXT:    ldr b19, [x1, #18]
+; CHECK-GI-BASE-NEXT:    mov v1.b[2], v7.b[0]
+; CHECK-GI-BASE-NEXT:    mov v3.b[2], v16.b[0]
+; CHECK-GI-BASE-NEXT:    mov v0.b[2], v6.b[0]
+; CHECK-GI-BASE-NEXT:    ldr b6, [x0, #3]
+; CHECK-GI-BASE-NEXT:    ldr b7, [x0, #11]
+; CHECK-GI-BASE-NEXT:    mov v4.b[2], v18.b[0]
+; CHECK-GI-BASE-NEXT:    ldr b16, [x1, #3]
+; CHECK-GI-BASE-NEXT:    ldr b17, [x0, #19]
+; CHECK-GI-BASE-NEXT:    mov v5.b[2], v19.b[0]
+; CHECK-GI-BASE-NEXT:    ldr b18, [x1, #11]
+; CHECK-GI-BASE-NEXT:    ldr b19, [x1, #19]
+; CHECK-GI-BASE-NEXT:    mov v1.b[3], v7.b[0]
+; CHECK-GI-BASE-NEXT:    mov v3.b[3], v16.b[0]
+; CHECK-GI-BASE-NEXT:    mov v2.b[3], v17.b[0]
+; CHECK-GI-BASE-NEXT:    mov v0.b[3], v6.b[0]
+; CHECK-GI-BASE-NEXT:    ldr b6, [x0, #4]
+; CHECK-GI-BASE-NEXT:    ldr b7, [x0, #12]
+; CHECK-GI-BASE-NEXT:    mov v4.b[3], v18.b[0]
+; CHECK-GI-BASE-NEXT:    ldr b16, [x1, #4]
+; CHECK-GI-BASE-NEXT:    ldr b17, [x0, #20]
+; CHECK-GI-BASE-NEXT:    mov v5.b[3], v19.b[0]
+; CHECK-GI-BASE-NEXT:    ldr b18, [x1, #12]
+; CHECK-GI-BASE-NEXT:    ldr b19, [x1, #20]
+; CHECK-GI-BASE-NEXT:    mov v1.b[4], v7.b[0]
+; CHECK-GI-BASE-NEXT:    mov v3.b[4], v16.b[0]
+; CHECK-GI-BASE-NEXT:    mov v2.b[4], v17.b[0]
+; CHECK-GI-BASE-NEXT:    mov v0.b[4], v6.b[0]
+; CHECK-GI-BASE-NEXT:    ldr b6, [x0, #5]
+; CHECK-GI-BASE-NEXT:    ldr b7, [x0, #13]
+; CHECK-GI-BASE-NEXT:    mov v4.b[4], v18.b[0]
+; CHECK-GI-BASE-NEXT:    ldr b16, [x1, #5]
+; CHECK-GI-BASE-NEXT:    ldr b17, [x0, #21]
+; CHECK-GI-BASE-NEXT:    mov v5.b[4], v19.b[0]
+; CHECK-GI-BASE-NEXT:    ldr b18, [x1, #13]
+; CHECK-GI-BASE-NEXT:    ldr b19, [x1, #21]
+; CHECK-GI-BASE-NEXT:    mov v1.b[5], v7.b[0]
+; CHECK-GI-BASE-NEXT:    mov v3.b[5], v16.b[0]
+; CHECK-GI-BASE-NEXT:    mov v2.b[5], v17.b[0]
+; CHECK-GI-BASE-NEXT:    mov v0.b[5], v6.b[0]
+; CHECK-GI-BASE-NEXT:    ldr b6, [x0, #6]
+; CHECK-GI-BASE-NEXT:    ldr b7, [x0, #14]
+; CHECK-GI-BASE-NEXT:    mov v4.b[5], v18.b[0]
+; CHECK-GI-BASE-NEXT:    ldr b16, [x1, #6]
+; CHECK-GI-BASE-NEXT:    ldr b17, [x0, #22]
+; CHECK-GI-BASE-NEXT:    mov v5.b[5], v19.b[0]
+; CHECK-GI-BASE-NEXT:    ldr b18, [x1, #14]
+; CHECK-GI-BASE-NEXT:    ldr b19, [x1, #22]
+; CHECK-GI-BASE-NEXT:    mov v1.b[6], v7.b[0]
+; CHECK-GI-BASE-NEXT:    mov v3.b[6], v16.b[0]
+; CHECK-GI-BASE-NEXT:    mov v2.b[6], v17.b[0]
+; CHECK-GI-BASE-NEXT:    mov v0.b[6], v6.b[0]
+; CHECK-GI-BASE-NEXT:    ldr b6, [x0, #7]
+; CHECK-GI-BASE-NEXT:    ldr b7, [x0, #15]
+; CHECK-GI-BASE-NEXT:    mov v4.b[6], v18.b[0]
+; CHECK-GI-BASE-NEXT:    ldr b16, [x1, #7]
+; CHECK-GI-BASE-NEXT:    ldr b17, [x0, #23]
+; CHECK-GI-BASE-NEXT:    mov v5.b[6], v19.b[0]
+; CHECK-GI-BASE-NEXT:    ldr b18, [x1, #15]
+; CHECK-GI-BASE-NEXT:    ldr b19, [x1, #23]
+; CHECK-GI-BASE-NEXT:    mov v1.b[7], v7.b[0]
+; CHECK-GI-BASE-NEXT:    mov v3.b[7], v16.b[0]
+; CHECK-GI-BASE-NEXT:    mov v2.b[7], v17.b[0]
+; CHECK-GI-BASE-NEXT:    mov v0.b[7], v6.b[0]
+; CHECK-GI-BASE-NEXT:    mov v4.b[7], v18.b[0]
+; CHECK-GI-BASE-NEXT:    mov v5.b[7], v19.b[0]
+; CHECK-GI-BASE-NEXT:    ushll v1.8h, v1.8b, #0
+; CHECK-GI-BASE-NEXT:    ushll v3.8h, v3.8b, #0
+; CHECK-GI-BASE-NEXT:    ushll v2.8h, v2.8b, #0
+; CHECK-GI-BASE-NEXT:    ushll v0.8h, v0.8b, #0
+; CHECK-GI-BASE-NEXT:    ushll v4.8h, v4.8b, #0
+; CHECK-GI-BASE-NEXT:    ushll v5.8h, v5.8b, #0
+; CHECK-GI-BASE-NEXT:    umull v6.4s, v3.4h, v0.4h
+; CHECK-GI-BASE-NEXT:    umull2 v0.4s, v3.8h, v0.8h
+; CHECK-GI-BASE-NEXT:    umull v3.4s, v4.4h, v1.4h
+; CHECK-GI-BASE-NEXT:    umull2 v1.4s, v4.8h, v1.8h
+; CHECK-GI-BASE-NEXT:    umull v4.4s, v5.4h, v2.4h
+; CHECK-GI-BASE-NEXT:    umull2 v2.4s, v5.8h, v2.8h
+; CHECK-GI-BASE-NEXT:    addv s5, v6.4s
 ; CHECK-GI-BASE-NEXT:    addv s0, v0.4s
-; CHECK-GI-BASE-NEXT:    fmov w0, s0
+; CHECK-GI-BASE-NEXT:    addv s1, v1.4s
+; CHECK-GI-BASE-NEXT:    addv s3, v3.4s
+; CHECK-GI-BASE-NEXT:    addv s4, v4.4s
+; CHECK-GI-BASE-NEXT:    addv s2, v2.4s
+; CHECK-GI-BASE-NEXT:    fmov w8, s5
+; CHECK-GI-BASE-NEXT:    fmov w9, s0
+; CHECK-GI-BASE-NEXT:    fmov w11, s1
+; CHECK-GI-BASE-NEXT:    fmov w10, s3
+; CHECK-GI-BASE-NEXT:    fmov w12, s4
+; CHECK-GI-BASE-NEXT:    fmov w13, s2
+; CHECK-GI-BASE-NEXT:    add w8, w8, w9
+; CHECK-GI-BASE-NEXT:    add w8, w8, w10
+; CHECK-GI-BASE-NEXT:    add w9, w11, w12
+; CHECK-GI-BASE-NEXT:    add w9, w9, w13
+; CHECK-GI-BASE-NEXT:    add w0, w8, w9
 ; CHECK-GI-BASE-NEXT:    ret
 ;
 ; CHECK-GI-DOT-LABEL: test_udot_v24i8:
@@ -2257,39 +2352,245 @@ define i32 @test_udot_v48i8(ptr %p1, ptr %p2) {
 ;
 ; CHECK-GI-BASE-LABEL: test_udot_v48i8:
 ; CHECK-GI-BASE:       // %bb.0: // %entry
-; CHECK-GI-BASE-NEXT:    ldp q0, q4, [x1]
-; CHECK-GI-BASE-NEXT:    ldr q2, [x0, #32]
-; CHECK-GI-BASE-NEXT:    ldp q1, q3, [x0]
-; CHECK-GI-BASE-NEXT:    ldr q7, [x1, #32]
-; CHECK-GI-BASE-NEXT:    ushll2 v16.8h, v2.16b, #0
-; CHECK-GI-BASE-NEXT:    ushll2 v6.8h, v0.16b, #0
+; CHECK-GI-BASE-NEXT:    ldr b0, [x0]
+; CHECK-GI-BASE-NEXT:    ldr b7, [x0, #1]
+; CHECK-GI-BASE-NEXT:    ldr b1, [x0, #8]
+; CHECK-GI-BASE-NEXT:    ldr b3, [x0, #9]
+; CHECK-GI-BASE-NEXT:    ldr b2, [x0, #16]
+; CHECK-GI-BASE-NEXT:    ldr b4, [x0, #17]
+; CHECK-GI-BASE-NEXT:    mov v0.b[1], v7.b[0]
+; CHECK-GI-BASE-NEXT:    ldr b5, [x0, #2]
+; CHECK-GI-BASE-NEXT:    ldr b19, [x0, #10]
+; CHECK-GI-BASE-NEXT:    mov v1.b[1], v3.b[0]
+; CHECK-GI-BASE-NEXT:    mov v2.b[1], v4.b[0]
+; CHECK-GI-BASE-NEXT:    ldr b4, [x0, #32]
+; CHECK-GI-BASE-NEXT:    ldr b16, [x0, #33]
+; CHECK-GI-BASE-NEXT:    ldr b20, [x0, #18]
+; CHECK-GI-BASE-NEXT:    ldr b6, [x0, #3]
+; CHECK-GI-BASE-NEXT:    ldr b3, [x0, #24]
+; CHECK-GI-BASE-NEXT:    ldr b7, [x0, #25]
+; CHECK-GI-BASE-NEXT:    ldr b24, [x0, #34]
+; CHECK-GI-BASE-NEXT:    mov v0.b[2], v5.b[0]
+; CHECK-GI-BASE-NEXT:    mov v4.b[1], v16.b[0]
+; CHECK-GI-BASE-NEXT:    ldr b5, [x0, #40]
+; CHECK-GI-BASE-NEXT:    mov v1.b[2], v19.b[0]
+; CHECK-GI-BASE-NEXT:    mov v2.b[2], v20.b[0]
+; CHECK-GI-BASE-NEXT:    mov v3.b[1], v7.b[0]
+; CHECK-GI-BASE-NEXT:    ldr b19, [x0, #41]
+; CHECK-GI-BASE-NEXT:    ldr b20, [x0, #11]
+; CHECK-GI-BASE-NEXT:    ldr b18, [x0, #4]
+; CHECK-GI-BASE-NEXT:    ldr b25, [x0, #42]
+; CHECK-GI-BASE-NEXT:    ldr b21, [x0, #12]
+; CHECK-GI-BASE-NEXT:    ldr b7, [x0, #5]
+; CHECK-GI-BASE-NEXT:    mov v5.b[1], v19.b[0]
+; CHECK-GI-BASE-NEXT:    mov v0.b[3], v6.b[0]
+; CHECK-GI-BASE-NEXT:    mov v4.b[2], v24.b[0]
+; CHECK-GI-BASE-NEXT:    ldr b6, [x0, #19]
+; CHECK-GI-BASE-NEXT:    mov v1.b[3], v20.b[0]
+; CHECK-GI-BASE-NEXT:    ldr b20, [x0, #26]
+; CHECK-GI-BASE-NEXT:    ldr b19, [x0, #13]
+; CHECK-GI-BASE-NEXT:    ldr b17, [x0, #6]
+; CHECK-GI-BASE-NEXT:    ldr b22, [x0, #14]
+; CHECK-GI-BASE-NEXT:    mov v2.b[3], v6.b[0]
+; CHECK-GI-BASE-NEXT:    ldr b6, [x0, #20]
+; CHECK-GI-BASE-NEXT:    mov v3.b[2], v20.b[0]
+; CHECK-GI-BASE-NEXT:    mov v5.b[2], v25.b[0]
+; CHECK-GI-BASE-NEXT:    mov v0.b[4], v18.b[0]
+; CHECK-GI-BASE-NEXT:    ldr b18, [x0, #35]
+; CHECK-GI-BASE-NEXT:    mov v1.b[4], v21.b[0]
+; CHECK-GI-BASE-NEXT:    ldr b25, [x0, #27]
+; CHECK-GI-BASE-NEXT:    ldr b20, [x0, #21]
+; CHECK-GI-BASE-NEXT:    mov v4.b[3], v18.b[0]
+; CHECK-GI-BASE-NEXT:    ldr b18, [x0, #44]
+; CHECK-GI-BASE-NEXT:    ldr b24, [x0, #22]
+; CHECK-GI-BASE-NEXT:    mov v2.b[4], v6.b[0]
+; CHECK-GI-BASE-NEXT:    ldr b6, [x0, #43]
+; CHECK-GI-BASE-NEXT:    mov v3.b[3], v25.b[0]
+; CHECK-GI-BASE-NEXT:    mov v0.b[5], v7.b[0]
+; CHECK-GI-BASE-NEXT:    ldr b25, [x0, #28]
+; CHECK-GI-BASE-NEXT:    ldr b16, [x0, #7]
+; CHECK-GI-BASE-NEXT:    mov v5.b[3], v6.b[0]
+; CHECK-GI-BASE-NEXT:    ldr b6, [x0, #36]
+; CHECK-GI-BASE-NEXT:    mov v1.b[5], v19.b[0]
+; CHECK-GI-BASE-NEXT:    ldr b19, [x0, #37]
+; CHECK-GI-BASE-NEXT:    ldr b23, [x0, #15]
+; CHECK-GI-BASE-NEXT:    ldr b26, [x0, #29]
+; CHECK-GI-BASE-NEXT:    mov v4.b[4], v6.b[0]
+; CHECK-GI-BASE-NEXT:    mov v2.b[5], v20.b[0]
+; CHECK-GI-BASE-NEXT:    mov v3.b[4], v25.b[0]
+; CHECK-GI-BASE-NEXT:    mov v0.b[6], v17.b[0]
+; CHECK-GI-BASE-NEXT:    ldr b17, [x0, #45]
+; CHECK-GI-BASE-NEXT:    ldr b20, [x0, #38]
+; CHECK-GI-BASE-NEXT:    mov v5.b[4], v18.b[0]
+; CHECK-GI-BASE-NEXT:    mov v1.b[6], v22.b[0]
+; CHECK-GI-BASE-NEXT:    ldr b6, [x1]
+; CHECK-GI-BASE-NEXT:    ldr b21, [x0, #23]
+; CHECK-GI-BASE-NEXT:    ldr b25, [x1, #17]
+; CHECK-GI-BASE-NEXT:    ldr b22, [x1, #2]
+; CHECK-GI-BASE-NEXT:    mov v4.b[5], v19.b[0]
+; CHECK-GI-BASE-NEXT:    mov v2.b[6], v24.b[0]
+; CHECK-GI-BASE-NEXT:    ldr b24, [x1, #1]
+; CHECK-GI-BASE-NEXT:    mov v3.b[5], v26.b[0]
+; CHECK-GI-BASE-NEXT:    mov v0.b[7], v16.b[0]
+; CHECK-GI-BASE-NEXT:    ldr b26, [x1, #9]
+; CHECK-GI-BASE-NEXT:    mov v5.b[5], v17.b[0]
+; CHECK-GI-BASE-NEXT:    ldr b17, [x1, #8]
+; CHECK-GI-BASE-NEXT:    mov v1.b[7], v23.b[0]
+; CHECK-GI-BASE-NEXT:    ldr b16, [x1, #16]
+; CHECK-GI-BASE-NEXT:    ldr b19, [x1, #24]
+; CHECK-GI-BASE-NEXT:    ldr b23, [x1, #25]
+; CHECK-GI-BASE-NEXT:    mov v4.b[6], v20.b[0]
+; CHECK-GI-BASE-NEXT:    ldr b20, [x1, #32]
+; CHECK-GI-BASE-NEXT:    mov v6.b[1], v24.b[0]
+; CHECK-GI-BASE-NEXT:    ldr b24, [x1, #33]
+; CHECK-GI-BASE-NEXT:    mov v2.b[7], v21.b[0]
+; CHECK-GI-BASE-NEXT:    mov v17.b[1], v26.b[0]
+; CHECK-GI-BASE-NEXT:    ldr b21, [x1, #40]
+; CHECK-GI-BASE-NEXT:    mov v16.b[1], v25.b[0]
+; CHECK-GI-BASE-NEXT:    ldr b25, [x1, #41]
+; CHECK-GI-BASE-NEXT:    mov v19.b[1], v23.b[0]
+; CHECK-GI-BASE-NEXT:    mov v20.b[1], v24.b[0]
+; CHECK-GI-BASE-NEXT:    ldr b23, [x1, #10]
+; CHECK-GI-BASE-NEXT:    ldr b24, [x1, #18]
+; CHECK-GI-BASE-NEXT:    mov v21.b[1], v25.b[0]
+; CHECK-GI-BASE-NEXT:    ldr b25, [x1, #26]
+; CHECK-GI-BASE-NEXT:    mov v6.b[2], v22.b[0]
+; CHECK-GI-BASE-NEXT:    ldr b22, [x1, #34]
+; CHECK-GI-BASE-NEXT:    mov v17.b[2], v23.b[0]
+; CHECK-GI-BASE-NEXT:    mov v16.b[2], v24.b[0]
+; CHECK-GI-BASE-NEXT:    ldr b24, [x1, #42]
+; CHECK-GI-BASE-NEXT:    ldr b23, [x1, #3]
+; CHECK-GI-BASE-NEXT:    mov v19.b[2], v25.b[0]
+; CHECK-GI-BASE-NEXT:    mov v20.b[2], v22.b[0]
+; CHECK-GI-BASE-NEXT:    ldr b25, [x1, #11]
+; CHECK-GI-BASE-NEXT:    ldr b22, [x1, #19]
+; CHECK-GI-BASE-NEXT:    mov v21.b[2], v24.b[0]
+; ...
[truncated]

@dc03-work
Copy link
Contributor Author

This is the parent PR for #85042, which fixes the codegen issue of the large number of loads generated for the vecreduce-add.ll test case.

@davemgreen davemgreen requested a review from chuongg3 March 13, 2024 09:00
Copy link
Collaborator

@davemgreen davemgreen left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I was looking at G_TRUNC recently, which is quite similar in terms of what is falling back. It was running into some of the same regressions that this sees in zext_v3i8_v3i64, among other issues.

@@ -5496,6 +5496,26 @@ LegalizerHelper::moreElementsVector(MachineInstr &MI, unsigned TypeIdx,
return Legalized;
}

case TargetOpcode::G_SEXT:
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There is a case above that handles G_TRUNC and a few others that have different type sizes. It could be extended to handle these operations too.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Awesome, that works - though I have to set AArch64LegalizerInfo to use index 0 instead of 1.

@chuongg3
Copy link
Contributor

chuongg3 commented Mar 13, 2024

Would it be possible to replace .maxNumElements() with .clampNumElements() in AArch64LegalizerInfo.cpp?
Currently, moreElements only gets called by .moreElementsToNextPow2() for the EXTEND instructions.
This might help with legalizing quite a few of the failing EXTEND instructions that have been showing up, such as:
%1:_(<4 x s16>) = G_ZEXT %0:_(<4 x s8>)

@dc03-work
Copy link
Contributor Author

Would it be possible to replace .maxNumElements() with .clampNumElements() in AArch64LegalizerInfo.cpp? Currently, moreElements only gets called by .moreElementsToNextPow2() for the EXTEND instructions. This might help with legalizing quite a few of the failing EXTEND instructions that have been showing up, such as: %1:_(<4 x s16>) = G_ZEXT %0:_(<4 x s8>)

Hmm, what would the lower bound be in this case?

@davemgreen
Copy link
Collaborator

Would it be possible to replace .maxNumElements() with .clampNumElements() in AArch64LegalizerInfo.cpp? Currently, moreElements only gets called by .moreElementsToNextPow2() for the EXTEND instructions. This might help with legalizing quite a few of the failing EXTEND instructions that have been showing up, such as: %1:_(<4 x s16>) = G_ZEXT %0:_(<4 x s8>)

Hmm, what would the lower bound be in this case?

The idea would be, where we can, to always produce legal types after legalization. Clamp all vector types to 64 bits or 128 bits, and after legalization only those types would remain, with the intermediate legalization artefacts combined away. So v8i8, v4i16, v2i32 (v1i64 isn't as important in this case).

@dc03-work
Copy link
Contributor Author

dc03-work commented Mar 14, 2024

Would it be possible to replace .maxNumElements() with .clampNumElements() in AArch64LegalizerInfo.cpp? Currently, moreElements only gets called by .moreElementsToNextPow2() for the EXTEND instructions. This might help with legalizing quite a few of the failing EXTEND instructions that have been showing up, such as: %1:_(<4 x s16>) = G_ZEXT %0:_(<4 x s8>)

Hmm, what would the lower bound be in this case?

The idea would be, where we can, to always produce legal types after legalization. Clamp all vector types to 64 bits or 128 bits, and after legalization only those types would remain, with the intermediate legalization artefacts combined away. So v8i8, v4i16, v2i32 (v1i64 isn't as important in this case).

I don't think legal vector arguments to *ext can be 128 bits wide, so this would basically mean clamping them to always be 64 bits wide. I feel this should be a separate patch.

@davemgreen
Copy link
Collaborator

I don't think legal vector arguments to *ext can be 128 bits wide, so this would basically mean clamping them to always be 64 bits wide. I feel this should be a separate patch.

Yeah, the legal operations are essentially always of that given size. I agree that it's good not to try and solve every problem in a single ticket, and this is a nice step forward. LGTM

@dc03-work dc03-work merged commit 843a978 into main Mar 18, 2024
@dc03-work dc03-work deleted the users/dc03-work/spr/globalisel-add-support-to-moreelementsvector-for-g_sext-g_zext-and-g_anyext branch March 18, 2024 02:16
@dc03-work dc03-work restored the users/dc03-work/spr/globalisel-add-support-to-moreelementsvector-for-g_sext-g_zext-and-g_anyext branch March 18, 2024 02:16
@dc03-work dc03-work deleted the users/dc03-work/spr/globalisel-add-support-to-moreelementsvector-for-g_sext-g_zext-and-g_anyext branch March 18, 2024 02:17
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Projects
None yet
Development

Successfully merging this pull request may close these issues.

4 participants