-
Notifications
You must be signed in to change notification settings - Fork 14.3k
[GlobalISel] Add support to moreElementsVector for G_SEXT, G_ZEXT and G_ANYEXT #85038
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[GlobalISel] Add support to moreElementsVector for G_SEXT, G_ZEXT and G_ANYEXT #85038
Conversation
Created using spr 1.3.5
@llvm/pr-subscribers-backend-aarch64 @llvm/pr-subscribers-llvm-globalisel. Author: Dhruv Chawla (work) (dc03-work). Changes: Patch is 47.62 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/85038.diff 5 Files Affected:
diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
index bd3ff7265d51f9..a480c290907761 100644
--- a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
@@ -5496,6 +5496,26 @@ LegalizerHelper::moreElementsVector(MachineInstr &MI, unsigned TypeIdx,
return Legalized;
}
+ case TargetOpcode::G_SEXT:
+ case TargetOpcode::G_ZEXT:
+ case TargetOpcode::G_ANYEXT: {
+ LLT SrcTy = MRI.getType(MI.getOperand(1).getReg());
+ LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
+ if (TypeIdx == 0) {
+ DstTy = MoreTy;
+ SrcTy = MoreTy.changeElementType(SrcTy.getElementType());
+ } else if (TypeIdx == 1) {
+ SrcTy = MoreTy;
+ DstTy = MoreTy.changeElementType(DstTy.getElementType());
+ }
+
+ Observer.changingInstr(MI);
+ moreElementsVectorSrc(MI, SrcTy, 1);
+ moreElementsVectorDst(MI, DstTy, 0);
+ Observer.changedInstr(MI);
+ return Legalized;
+ }
+
default:
return UnableToLegalize;
}
diff --git a/llvm/test/CodeGen/AArch64/fcmp.ll b/llvm/test/CodeGen/AArch64/fcmp.ll
index 2d0b5574cdd7ba..9916aeeab1cad1 100644
--- a/llvm/test/CodeGen/AArch64/fcmp.ll
+++ b/llvm/test/CodeGen/AArch64/fcmp.ll
@@ -1108,61 +1108,54 @@ define <7 x i32> @v7f16_i32(<7 x half> %a, <7 x half> %b, <7 x i32> %d, <7 x i32
;
; CHECK-GI-FP16-LABEL: v7f16_i32:
; CHECK-GI-FP16: // %bb.0: // %entry
-; CHECK-GI-FP16-NEXT: fcmgt v1.8h, v1.8h, v0.8h
-; CHECK-GI-FP16-NEXT: mov w12, #31 // =0x1f
-; CHECK-GI-FP16-NEXT: ldr s4, [sp]
-; CHECK-GI-FP16-NEXT: fmov s2, w12
+; CHECK-GI-FP16-NEXT: fcmgt v0.8h, v1.8h, v0.8h
+; CHECK-GI-FP16-NEXT: mov w10, #31 // =0x1f
+; CHECK-GI-FP16-NEXT: ldr s3, [sp]
+; CHECK-GI-FP16-NEXT: fmov s1, w10
; CHECK-GI-FP16-NEXT: fmov s6, w0
-; CHECK-GI-FP16-NEXT: ldr s5, [sp, #8]
+; CHECK-GI-FP16-NEXT: ldr s4, [sp, #8]
; CHECK-GI-FP16-NEXT: ldr s7, [sp, #24]
; CHECK-GI-FP16-NEXT: ldr s16, [sp, #32]
-; CHECK-GI-FP16-NEXT: umov w9, v1.h[4]
-; CHECK-GI-FP16-NEXT: umov w8, v1.h[0]
-; CHECK-GI-FP16-NEXT: umov w11, v1.h[5]
-; CHECK-GI-FP16-NEXT: umov w10, v1.h[1]
-; CHECK-GI-FP16-NEXT: mov v2.s[1], w12
-; CHECK-GI-FP16-NEXT: umov w13, v1.h[2]
+; CHECK-GI-FP16-NEXT: umov w8, v0.h[4]
+; CHECK-GI-FP16-NEXT: umov w9, v0.h[5]
+; CHECK-GI-FP16-NEXT: mov v1.s[1], w10
; CHECK-GI-FP16-NEXT: mov v6.s[1], w1
; CHECK-GI-FP16-NEXT: mov v7.s[1], v16.s[0]
; CHECK-GI-FP16-NEXT: ldr s16, [sp, #40]
-; CHECK-GI-FP16-NEXT: fmov s3, w9
-; CHECK-GI-FP16-NEXT: fmov s0, w8
-; CHECK-GI-FP16-NEXT: umov w8, v1.h[6]
-; CHECK-GI-FP16-NEXT: mov v2.s[2], w12
-; CHECK-GI-FP16-NEXT: umov w9, v1.h[3]
+; CHECK-GI-FP16-NEXT: fmov s2, w8
+; CHECK-GI-FP16-NEXT: umov w8, v0.h[6]
+; CHECK-GI-FP16-NEXT: mov v1.s[2], w10
+; CHECK-GI-FP16-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-GI-FP16-NEXT: mov v6.s[2], w2
; CHECK-GI-FP16-NEXT: mov v7.s[2], v16.s[0]
-; CHECK-GI-FP16-NEXT: mov v3.s[1], w11
-; CHECK-GI-FP16-NEXT: mov v0.s[1], w10
-; CHECK-GI-FP16-NEXT: mov w10, #-1 // =0xffffffff
-; CHECK-GI-FP16-NEXT: fmov s1, w10
-; CHECK-GI-FP16-NEXT: neg v17.4s, v2.4s
+; CHECK-GI-FP16-NEXT: mov v2.s[1], w9
+; CHECK-GI-FP16-NEXT: mov w9, #-1 // =0xffffffff
+; CHECK-GI-FP16-NEXT: fmov s5, w9
+; CHECK-GI-FP16-NEXT: neg v17.4s, v1.4s
+; CHECK-GI-FP16-NEXT: shl v0.4s, v0.4s, #31
; CHECK-GI-FP16-NEXT: mov v6.s[3], w3
+; CHECK-GI-FP16-NEXT: mov v2.s[2], w8
+; CHECK-GI-FP16-NEXT: fmov w8, s3
+; CHECK-GI-FP16-NEXT: fmov s3, w7
+; CHECK-GI-FP16-NEXT: mov v5.s[1], w9
+; CHECK-GI-FP16-NEXT: sshr v0.4s, v0.4s, #31
+; CHECK-GI-FP16-NEXT: mov v3.s[1], w8
+; CHECK-GI-FP16-NEXT: fmov w8, s4
+; CHECK-GI-FP16-NEXT: ldr s4, [sp, #16]
+; CHECK-GI-FP16-NEXT: ushl v1.4s, v2.4s, v1.4s
+; CHECK-GI-FP16-NEXT: fmov s2, w4
+; CHECK-GI-FP16-NEXT: mov v5.s[2], w9
+; CHECK-GI-FP16-NEXT: mov v2.s[1], w5
; CHECK-GI-FP16-NEXT: mov v3.s[2], w8
+; CHECK-GI-FP16-NEXT: sshl v1.4s, v1.4s, v17.4s
; CHECK-GI-FP16-NEXT: fmov w8, s4
-; CHECK-GI-FP16-NEXT: fmov s4, w7
-; CHECK-GI-FP16-NEXT: mov v0.s[2], w13
-; CHECK-GI-FP16-NEXT: mov v1.s[1], w10
-; CHECK-GI-FP16-NEXT: mov v4.s[1], w8
-; CHECK-GI-FP16-NEXT: fmov w8, s5
-; CHECK-GI-FP16-NEXT: ldr s5, [sp, #16]
-; CHECK-GI-FP16-NEXT: ushl v2.4s, v3.4s, v2.4s
-; CHECK-GI-FP16-NEXT: fmov s3, w4
-; CHECK-GI-FP16-NEXT: mov v0.s[3], w9
-; CHECK-GI-FP16-NEXT: mov v1.s[2], w10
-; CHECK-GI-FP16-NEXT: mov v3.s[1], w5
-; CHECK-GI-FP16-NEXT: mov v4.s[2], w8
-; CHECK-GI-FP16-NEXT: sshl v2.4s, v2.4s, v17.4s
-; CHECK-GI-FP16-NEXT: fmov w8, s5
-; CHECK-GI-FP16-NEXT: shl v0.4s, v0.4s, #31
-; CHECK-GI-FP16-NEXT: eor v1.16b, v2.16b, v1.16b
-; CHECK-GI-FP16-NEXT: mov v3.s[2], w6
-; CHECK-GI-FP16-NEXT: mov v4.s[3], w8
-; CHECK-GI-FP16-NEXT: sshr v0.4s, v0.4s, #31
-; CHECK-GI-FP16-NEXT: and v1.16b, v7.16b, v1.16b
-; CHECK-GI-FP16-NEXT: and v2.16b, v3.16b, v2.16b
-; CHECK-GI-FP16-NEXT: bsl v0.16b, v6.16b, v4.16b
-; CHECK-GI-FP16-NEXT: orr v1.16b, v2.16b, v1.16b
+; CHECK-GI-FP16-NEXT: eor v4.16b, v1.16b, v5.16b
+; CHECK-GI-FP16-NEXT: mov v2.s[2], w6
+; CHECK-GI-FP16-NEXT: mov v3.s[3], w8
+; CHECK-GI-FP16-NEXT: and v1.16b, v2.16b, v1.16b
+; CHECK-GI-FP16-NEXT: and v2.16b, v7.16b, v4.16b
+; CHECK-GI-FP16-NEXT: bsl v0.16b, v6.16b, v3.16b
+; CHECK-GI-FP16-NEXT: orr v1.16b, v1.16b, v2.16b
; CHECK-GI-FP16-NEXT: mov s2, v0.s[1]
; CHECK-GI-FP16-NEXT: mov s3, v0.s[2]
; CHECK-GI-FP16-NEXT: mov s4, v0.s[3]
diff --git a/llvm/test/CodeGen/AArch64/sext.ll b/llvm/test/CodeGen/AArch64/sext.ll
index 61f04fbf0484f7..3e0d5dd875097f 100644
--- a/llvm/test/CodeGen/AArch64/sext.ll
+++ b/llvm/test/CodeGen/AArch64/sext.ll
@@ -280,13 +280,12 @@ define <3 x i64> @sext_v3i8_v3i64(<3 x i8> %a) {
;
; CHECK-GI-LABEL: sext_v3i8_v3i64:
; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: // kill: def $w0 killed $w0 def $x0
-; CHECK-GI-NEXT: fmov d0, x0
-; CHECK-GI-NEXT: // kill: def $w1 killed $w1 def $x1
+; CHECK-GI-NEXT: fmov s0, w0
; CHECK-GI-NEXT: // kill: def $w2 killed $w2 def $x2
; CHECK-GI-NEXT: sxtb x8, w2
; CHECK-GI-NEXT: fmov d2, x8
-; CHECK-GI-NEXT: mov v0.d[1], x1
+; CHECK-GI-NEXT: mov v0.s[1], w1
+; CHECK-GI-NEXT: ushll v0.2d, v0.2s, #0
; CHECK-GI-NEXT: shl v0.2d, v0.2d, #56
; CHECK-GI-NEXT: sshr v0.2d, v0.2d, #56
; CHECK-GI-NEXT: mov d1, v0.d[1]
@@ -444,13 +443,12 @@ define <3 x i64> @sext_v3i10_v3i64(<3 x i10> %a) {
;
; CHECK-GI-LABEL: sext_v3i10_v3i64:
; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: // kill: def $w0 killed $w0 def $x0
-; CHECK-GI-NEXT: fmov d0, x0
-; CHECK-GI-NEXT: // kill: def $w1 killed $w1 def $x1
+; CHECK-GI-NEXT: fmov s0, w0
; CHECK-GI-NEXT: // kill: def $w2 killed $w2 def $x2
; CHECK-GI-NEXT: sbfx x8, x2, #0, #10
; CHECK-GI-NEXT: fmov d2, x8
-; CHECK-GI-NEXT: mov v0.d[1], x1
+; CHECK-GI-NEXT: mov v0.s[1], w1
+; CHECK-GI-NEXT: ushll v0.2d, v0.2s, #0
; CHECK-GI-NEXT: shl v0.2d, v0.2d, #54
; CHECK-GI-NEXT: sshr v0.2d, v0.2d, #54
; CHECK-GI-NEXT: mov d1, v0.d[1]
diff --git a/llvm/test/CodeGen/AArch64/vecreduce-add.ll b/llvm/test/CodeGen/AArch64/vecreduce-add.ll
index 66b49466cc7361..94ab173e9183ac 100644
--- a/llvm/test/CodeGen/AArch64/vecreduce-add.ll
+++ b/llvm/test/CodeGen/AArch64/vecreduce-add.ll
@@ -4,11 +4,6 @@
; RUN: llc -mtriple=aarch64-none-linux-gnu -global-isel -global-isel-abort=2 %s -o - 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI,CHECK-GI-BASE
; RUN: llc -mtriple=aarch64-none-linux-gnu -global-isel -global-isel-abort=2 %s -o - -mattr=+dotprod 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI,CHECK-GI-DOT
-; CHECK-GI-BASE: warning: Instruction selection used fallback path for test_udot_v24i8
-; CHECK-GI-BASE-NEXT: warning: Instruction selection used fallback path for test_udot_v48i8
-; CHECK-GI-BASE-NEXT: warning: Instruction selection used fallback path for test_sdot_v24i8
-; CHECK-GI-BASE-NEXT: warning: Instruction selection used fallback path for test_sdot_v48i8
-
define i32 @addv_v2i32(<2 x i32> %a) {
; CHECK-LABEL: addv_v2i32:
; CHECK: // %bb.0: // %entry
@@ -2068,25 +2063,125 @@ define i32 @test_udot_v24i8(ptr %p1, ptr %p2) {
;
; CHECK-GI-BASE-LABEL: test_udot_v24i8:
; CHECK-GI-BASE: // %bb.0: // %entry
-; CHECK-GI-BASE-NEXT: ldr q0, [x0]
-; CHECK-GI-BASE-NEXT: ldr q1, [x1]
-; CHECK-GI-BASE-NEXT: ldr d4, [x0, #16]
-; CHECK-GI-BASE-NEXT: ldr d5, [x1, #16]
-; CHECK-GI-BASE-NEXT: ushll v2.8h, v0.8b, #0
-; CHECK-GI-BASE-NEXT: ushll v3.8h, v1.8b, #0
-; CHECK-GI-BASE-NEXT: ushll2 v0.8h, v0.16b, #0
-; CHECK-GI-BASE-NEXT: ushll2 v1.8h, v1.16b, #0
-; CHECK-GI-BASE-NEXT: umull v6.4s, v3.4h, v2.4h
-; CHECK-GI-BASE-NEXT: umull2 v2.4s, v3.8h, v2.8h
-; CHECK-GI-BASE-NEXT: ushll v3.8h, v4.8b, #0
-; CHECK-GI-BASE-NEXT: ushll v4.8h, v5.8b, #0
-; CHECK-GI-BASE-NEXT: umlal2 v2.4s, v4.8h, v3.8h
-; CHECK-GI-BASE-NEXT: umlal v6.4s, v4.4h, v3.4h
-; CHECK-GI-BASE-NEXT: umlal2 v2.4s, v1.8h, v0.8h
-; CHECK-GI-BASE-NEXT: umlal v6.4s, v1.4h, v0.4h
-; CHECK-GI-BASE-NEXT: add v0.4s, v6.4s, v2.4s
+; CHECK-GI-BASE-NEXT: ldr b0, [x0]
+; CHECK-GI-BASE-NEXT: ldr b19, [x0, #1]
+; CHECK-GI-BASE-NEXT: ldr b1, [x0, #8]
+; CHECK-GI-BASE-NEXT: ldr b18, [x0, #9]
+; CHECK-GI-BASE-NEXT: ldr b2, [x0, #16]
+; CHECK-GI-BASE-NEXT: ldr b16, [x0, #17]
+; CHECK-GI-BASE-NEXT: mov v0.b[1], v19.b[0]
+; CHECK-GI-BASE-NEXT: ldr b3, [x1]
+; CHECK-GI-BASE-NEXT: ldr b17, [x1, #1]
+; CHECK-GI-BASE-NEXT: mov v1.b[1], v18.b[0]
+; CHECK-GI-BASE-NEXT: mov v2.b[1], v16.b[0]
+; CHECK-GI-BASE-NEXT: ldr b4, [x1, #8]
+; CHECK-GI-BASE-NEXT: ldr b7, [x1, #9]
+; CHECK-GI-BASE-NEXT: mov v3.b[1], v17.b[0]
+; CHECK-GI-BASE-NEXT: ldr b5, [x1, #16]
+; CHECK-GI-BASE-NEXT: ldr b6, [x1, #17]
+; CHECK-GI-BASE-NEXT: ldr b16, [x1, #2]
+; CHECK-GI-BASE-NEXT: ldr b17, [x0, #18]
+; CHECK-GI-BASE-NEXT: mov v4.b[1], v7.b[0]
+; CHECK-GI-BASE-NEXT: ldr b7, [x0, #10]
+; CHECK-GI-BASE-NEXT: ldr b18, [x1, #10]
+; CHECK-GI-BASE-NEXT: mov v5.b[1], v6.b[0]
+; CHECK-GI-BASE-NEXT: mov v2.b[2], v17.b[0]
+; CHECK-GI-BASE-NEXT: ldr b6, [x0, #2]
+; CHECK-GI-BASE-NEXT: ldr b19, [x1, #18]
+; CHECK-GI-BASE-NEXT: mov v1.b[2], v7.b[0]
+; CHECK-GI-BASE-NEXT: mov v3.b[2], v16.b[0]
+; CHECK-GI-BASE-NEXT: mov v0.b[2], v6.b[0]
+; CHECK-GI-BASE-NEXT: ldr b6, [x0, #3]
+; CHECK-GI-BASE-NEXT: ldr b7, [x0, #11]
+; CHECK-GI-BASE-NEXT: mov v4.b[2], v18.b[0]
+; CHECK-GI-BASE-NEXT: ldr b16, [x1, #3]
+; CHECK-GI-BASE-NEXT: ldr b17, [x0, #19]
+; CHECK-GI-BASE-NEXT: mov v5.b[2], v19.b[0]
+; CHECK-GI-BASE-NEXT: ldr b18, [x1, #11]
+; CHECK-GI-BASE-NEXT: ldr b19, [x1, #19]
+; CHECK-GI-BASE-NEXT: mov v1.b[3], v7.b[0]
+; CHECK-GI-BASE-NEXT: mov v3.b[3], v16.b[0]
+; CHECK-GI-BASE-NEXT: mov v2.b[3], v17.b[0]
+; CHECK-GI-BASE-NEXT: mov v0.b[3], v6.b[0]
+; CHECK-GI-BASE-NEXT: ldr b6, [x0, #4]
+; CHECK-GI-BASE-NEXT: ldr b7, [x0, #12]
+; CHECK-GI-BASE-NEXT: mov v4.b[3], v18.b[0]
+; CHECK-GI-BASE-NEXT: ldr b16, [x1, #4]
+; CHECK-GI-BASE-NEXT: ldr b17, [x0, #20]
+; CHECK-GI-BASE-NEXT: mov v5.b[3], v19.b[0]
+; CHECK-GI-BASE-NEXT: ldr b18, [x1, #12]
+; CHECK-GI-BASE-NEXT: ldr b19, [x1, #20]
+; CHECK-GI-BASE-NEXT: mov v1.b[4], v7.b[0]
+; CHECK-GI-BASE-NEXT: mov v3.b[4], v16.b[0]
+; CHECK-GI-BASE-NEXT: mov v2.b[4], v17.b[0]
+; CHECK-GI-BASE-NEXT: mov v0.b[4], v6.b[0]
+; CHECK-GI-BASE-NEXT: ldr b6, [x0, #5]
+; CHECK-GI-BASE-NEXT: ldr b7, [x0, #13]
+; CHECK-GI-BASE-NEXT: mov v4.b[4], v18.b[0]
+; CHECK-GI-BASE-NEXT: ldr b16, [x1, #5]
+; CHECK-GI-BASE-NEXT: ldr b17, [x0, #21]
+; CHECK-GI-BASE-NEXT: mov v5.b[4], v19.b[0]
+; CHECK-GI-BASE-NEXT: ldr b18, [x1, #13]
+; CHECK-GI-BASE-NEXT: ldr b19, [x1, #21]
+; CHECK-GI-BASE-NEXT: mov v1.b[5], v7.b[0]
+; CHECK-GI-BASE-NEXT: mov v3.b[5], v16.b[0]
+; CHECK-GI-BASE-NEXT: mov v2.b[5], v17.b[0]
+; CHECK-GI-BASE-NEXT: mov v0.b[5], v6.b[0]
+; CHECK-GI-BASE-NEXT: ldr b6, [x0, #6]
+; CHECK-GI-BASE-NEXT: ldr b7, [x0, #14]
+; CHECK-GI-BASE-NEXT: mov v4.b[5], v18.b[0]
+; CHECK-GI-BASE-NEXT: ldr b16, [x1, #6]
+; CHECK-GI-BASE-NEXT: ldr b17, [x0, #22]
+; CHECK-GI-BASE-NEXT: mov v5.b[5], v19.b[0]
+; CHECK-GI-BASE-NEXT: ldr b18, [x1, #14]
+; CHECK-GI-BASE-NEXT: ldr b19, [x1, #22]
+; CHECK-GI-BASE-NEXT: mov v1.b[6], v7.b[0]
+; CHECK-GI-BASE-NEXT: mov v3.b[6], v16.b[0]
+; CHECK-GI-BASE-NEXT: mov v2.b[6], v17.b[0]
+; CHECK-GI-BASE-NEXT: mov v0.b[6], v6.b[0]
+; CHECK-GI-BASE-NEXT: ldr b6, [x0, #7]
+; CHECK-GI-BASE-NEXT: ldr b7, [x0, #15]
+; CHECK-GI-BASE-NEXT: mov v4.b[6], v18.b[0]
+; CHECK-GI-BASE-NEXT: ldr b16, [x1, #7]
+; CHECK-GI-BASE-NEXT: ldr b17, [x0, #23]
+; CHECK-GI-BASE-NEXT: mov v5.b[6], v19.b[0]
+; CHECK-GI-BASE-NEXT: ldr b18, [x1, #15]
+; CHECK-GI-BASE-NEXT: ldr b19, [x1, #23]
+; CHECK-GI-BASE-NEXT: mov v1.b[7], v7.b[0]
+; CHECK-GI-BASE-NEXT: mov v3.b[7], v16.b[0]
+; CHECK-GI-BASE-NEXT: mov v2.b[7], v17.b[0]
+; CHECK-GI-BASE-NEXT: mov v0.b[7], v6.b[0]
+; CHECK-GI-BASE-NEXT: mov v4.b[7], v18.b[0]
+; CHECK-GI-BASE-NEXT: mov v5.b[7], v19.b[0]
+; CHECK-GI-BASE-NEXT: ushll v1.8h, v1.8b, #0
+; CHECK-GI-BASE-NEXT: ushll v3.8h, v3.8b, #0
+; CHECK-GI-BASE-NEXT: ushll v2.8h, v2.8b, #0
+; CHECK-GI-BASE-NEXT: ushll v0.8h, v0.8b, #0
+; CHECK-GI-BASE-NEXT: ushll v4.8h, v4.8b, #0
+; CHECK-GI-BASE-NEXT: ushll v5.8h, v5.8b, #0
+; CHECK-GI-BASE-NEXT: umull v6.4s, v3.4h, v0.4h
+; CHECK-GI-BASE-NEXT: umull2 v0.4s, v3.8h, v0.8h
+; CHECK-GI-BASE-NEXT: umull v3.4s, v4.4h, v1.4h
+; CHECK-GI-BASE-NEXT: umull2 v1.4s, v4.8h, v1.8h
+; CHECK-GI-BASE-NEXT: umull v4.4s, v5.4h, v2.4h
+; CHECK-GI-BASE-NEXT: umull2 v2.4s, v5.8h, v2.8h
+; CHECK-GI-BASE-NEXT: addv s5, v6.4s
; CHECK-GI-BASE-NEXT: addv s0, v0.4s
-; CHECK-GI-BASE-NEXT: fmov w0, s0
+; CHECK-GI-BASE-NEXT: addv s1, v1.4s
+; CHECK-GI-BASE-NEXT: addv s3, v3.4s
+; CHECK-GI-BASE-NEXT: addv s4, v4.4s
+; CHECK-GI-BASE-NEXT: addv s2, v2.4s
+; CHECK-GI-BASE-NEXT: fmov w8, s5
+; CHECK-GI-BASE-NEXT: fmov w9, s0
+; CHECK-GI-BASE-NEXT: fmov w11, s1
+; CHECK-GI-BASE-NEXT: fmov w10, s3
+; CHECK-GI-BASE-NEXT: fmov w12, s4
+; CHECK-GI-BASE-NEXT: fmov w13, s2
+; CHECK-GI-BASE-NEXT: add w8, w8, w9
+; CHECK-GI-BASE-NEXT: add w8, w8, w10
+; CHECK-GI-BASE-NEXT: add w9, w11, w12
+; CHECK-GI-BASE-NEXT: add w9, w9, w13
+; CHECK-GI-BASE-NEXT: add w0, w8, w9
; CHECK-GI-BASE-NEXT: ret
;
; CHECK-GI-DOT-LABEL: test_udot_v24i8:
@@ -2257,39 +2352,245 @@ define i32 @test_udot_v48i8(ptr %p1, ptr %p2) {
;
; CHECK-GI-BASE-LABEL: test_udot_v48i8:
; CHECK-GI-BASE: // %bb.0: // %entry
-; CHECK-GI-BASE-NEXT: ldp q0, q4, [x1]
-; CHECK-GI-BASE-NEXT: ldr q2, [x0, #32]
-; CHECK-GI-BASE-NEXT: ldp q1, q3, [x0]
-; CHECK-GI-BASE-NEXT: ldr q7, [x1, #32]
-; CHECK-GI-BASE-NEXT: ushll2 v16.8h, v2.16b, #0
-; CHECK-GI-BASE-NEXT: ushll2 v6.8h, v0.16b, #0
+; CHECK-GI-BASE-NEXT: ldr b0, [x0]
+; CHECK-GI-BASE-NEXT: ldr b7, [x0, #1]
+; CHECK-GI-BASE-NEXT: ldr b1, [x0, #8]
+; CHECK-GI-BASE-NEXT: ldr b3, [x0, #9]
+; CHECK-GI-BASE-NEXT: ldr b2, [x0, #16]
+; CHECK-GI-BASE-NEXT: ldr b4, [x0, #17]
+; CHECK-GI-BASE-NEXT: mov v0.b[1], v7.b[0]
+; CHECK-GI-BASE-NEXT: ldr b5, [x0, #2]
+; CHECK-GI-BASE-NEXT: ldr b19, [x0, #10]
+; CHECK-GI-BASE-NEXT: mov v1.b[1], v3.b[0]
+; CHECK-GI-BASE-NEXT: mov v2.b[1], v4.b[0]
+; CHECK-GI-BASE-NEXT: ldr b4, [x0, #32]
+; CHECK-GI-BASE-NEXT: ldr b16, [x0, #33]
+; CHECK-GI-BASE-NEXT: ldr b20, [x0, #18]
+; CHECK-GI-BASE-NEXT: ldr b6, [x0, #3]
+; CHECK-GI-BASE-NEXT: ldr b3, [x0, #24]
+; CHECK-GI-BASE-NEXT: ldr b7, [x0, #25]
+; CHECK-GI-BASE-NEXT: ldr b24, [x0, #34]
+; CHECK-GI-BASE-NEXT: mov v0.b[2], v5.b[0]
+; CHECK-GI-BASE-NEXT: mov v4.b[1], v16.b[0]
+; CHECK-GI-BASE-NEXT: ldr b5, [x0, #40]
+; CHECK-GI-BASE-NEXT: mov v1.b[2], v19.b[0]
+; CHECK-GI-BASE-NEXT: mov v2.b[2], v20.b[0]
+; CHECK-GI-BASE-NEXT: mov v3.b[1], v7.b[0]
+; CHECK-GI-BASE-NEXT: ldr b19, [x0, #41]
+; CHECK-GI-BASE-NEXT: ldr b20, [x0, #11]
+; CHECK-GI-BASE-NEXT: ldr b18, [x0, #4]
+; CHECK-GI-BASE-NEXT: ldr b25, [x0, #42]
+; CHECK-GI-BASE-NEXT: ldr b21, [x0, #12]
+; CHECK-GI-BASE-NEXT: ldr b7, [x0, #5]
+; CHECK-GI-BASE-NEXT: mov v5.b[1], v19.b[0]
+; CHECK-GI-BASE-NEXT: mov v0.b[3], v6.b[0]
+; CHECK-GI-BASE-NEXT: mov v4.b[2], v24.b[0]
+; CHECK-GI-BASE-NEXT: ldr b6, [x0, #19]
+; CHECK-GI-BASE-NEXT: mov v1.b[3], v20.b[0]
+; CHECK-GI-BASE-NEXT: ldr b20, [x0, #26]
+; CHECK-GI-BASE-NEXT: ldr b19, [x0, #13]
+; CHECK-GI-BASE-NEXT: ldr b17, [x0, #6]
+; CHECK-GI-BASE-NEXT: ldr b22, [x0, #14]
+; CHECK-GI-BASE-NEXT: mov v2.b[3], v6.b[0]
+; CHECK-GI-BASE-NEXT: ldr b6, [x0, #20]
+; CHECK-GI-BASE-NEXT: mov v3.b[2], v20.b[0]
+; CHECK-GI-BASE-NEXT: mov v5.b[2], v25.b[0]
+; CHECK-GI-BASE-NEXT: mov v0.b[4], v18.b[0]
+; CHECK-GI-BASE-NEXT: ldr b18, [x0, #35]
+; CHECK-GI-BASE-NEXT: mov v1.b[4], v21.b[0]
+; CHECK-GI-BASE-NEXT: ldr b25, [x0, #27]
+; CHECK-GI-BASE-NEXT: ldr b20, [x0, #21]
+; CHECK-GI-BASE-NEXT: mov v4.b[3], v18.b[0]
+; CHECK-GI-BASE-NEXT: ldr b18, [x0, #44]
+; CHECK-GI-BASE-NEXT: ldr b24, [x0, #22]
+; CHECK-GI-BASE-NEXT: mov v2.b[4], v6.b[0]
+; CHECK-GI-BASE-NEXT: ldr b6, [x0, #43]
+; CHECK-GI-BASE-NEXT: mov v3.b[3], v25.b[0]
+; CHECK-GI-BASE-NEXT: mov v0.b[5], v7.b[0]
+; CHECK-GI-BASE-NEXT: ldr b25, [x0, #28]
+; CHECK-GI-BASE-NEXT: ldr b16, [x0, #7]
+; CHECK-GI-BASE-NEXT: mov v5.b[3], v6.b[0]
+; CHECK-GI-BASE-NEXT: ldr b6, [x0, #36]
+; CHECK-GI-BASE-NEXT: mov v1.b[5], v19.b[0]
+; CHECK-GI-BASE-NEXT: ldr b19, [x0, #37]
+; CHECK-GI-BASE-NEXT: ldr b23, [x0, #15]
+; CHECK-GI-BASE-NEXT: ldr b26, [x0, #29]
+; CHECK-GI-BASE-NEXT: mov v4.b[4], v6.b[0]
+; CHECK-GI-BASE-NEXT: mov v2.b[5], v20.b[0]
+; CHECK-GI-BASE-NEXT: mov v3.b[4], v25.b[0]
+; CHECK-GI-BASE-NEXT: mov v0.b[6], v17.b[0]
+; CHECK-GI-BASE-NEXT: ldr b17, [x0, #45]
+; CHECK-GI-BASE-NEXT: ldr b20, [x0, #38]
+; CHECK-GI-BASE-NEXT: mov v5.b[4], v18.b[0]
+; CHECK-GI-BASE-NEXT: mov v1.b[6], v22.b[0]
+; CHECK-GI-BASE-NEXT: ldr b6, [x1]
+; CHECK-GI-BASE-NEXT: ldr b21, [x0, #23]
+; CHECK-GI-BASE-NEXT: ldr b25, [x1, #17]
+; CHECK-GI-BASE-NEXT: ldr b22, [x1, #2]
+; CHECK-GI-BASE-NEXT: mov v4.b[5], v19.b[0]
+; CHECK-GI-BASE-NEXT: mov v2.b[6], v24.b[0]
+; CHECK-GI-BASE-NEXT: ldr b24, [x1, #1]
+; CHECK-GI-BASE-NEXT: mov v3.b[5], v26.b[0]
+; CHECK-GI-BASE-NEXT: mov v0.b[7], v16.b[0]
+; CHECK-GI-BASE-NEXT: ldr b26, [x1, #9]
+; CHECK-GI-BASE-NEXT: mov v5.b[5], v17.b[0]
+; CHECK-GI-BASE-NEXT: ldr b17, [x1, #8]
+; CHECK-GI-BASE-NEXT: mov v1.b[7], v23.b[0]
+; CHECK-GI-BASE-NEXT: ldr b16, [x1, #16]
+; CHECK-GI-BASE-NEXT: ldr b19, [x1, #24]
+; CHECK-GI-BASE-NEXT: ldr b23, [x1, #25]
+; CHECK-GI-BASE-NEXT: mov v4.b[6], v20.b[0]
+; CHECK-GI-BASE-NEXT: ldr b20, [x1, #32]
+; CHECK-GI-BASE-NEXT: mov v6.b[1], v24.b[0]
+; CHECK-GI-BASE-NEXT: ldr b24, [x1, #33]
+; CHECK-GI-BASE-NEXT: mov v2.b[7], v21.b[0]
+; CHECK-GI-BASE-NEXT: mov v17.b[1], v26.b[0]
+; CHECK-GI-BASE-NEXT: ldr b21, [x1, #40]
+; CHECK-GI-BASE-NEXT: mov v16.b[1], v25.b[0]
+; CHECK-GI-BASE-NEXT: ldr b25, [x1, #41]
+; CHECK-GI-BASE-NEXT: mov v19.b[1], v23.b[0]
+; CHECK-GI-BASE-NEXT: mov v20.b[1], v24.b[0]
+; CHECK-GI-BASE-NEXT: ldr b23, [x1, #10]
+; CHECK-GI-BASE-NEXT: ldr b24, [x1, #18]
+; CHECK-GI-BASE-NEXT: mov v21.b[1], v25.b[0]
+; CHECK-GI-BASE-NEXT: ldr b25, [x1, #26]
+; CHECK-GI-BASE-NEXT: mov v6.b[2], v22.b[0]
+; CHECK-GI-BASE-NEXT: ldr b22, [x1, #34]
+; CHECK-GI-BASE-NEXT: mov v17.b[2], v23.b[0]
+; CHECK-GI-BASE-NEXT: mov v16.b[2], v24.b[0]
+; CHECK-GI-BASE-NEXT: ldr b24, [x1, #42]
+; CHECK-GI-BASE-NEXT: ldr b23, [x1, #3]
+; CHECK-GI-BASE-NEXT: mov v19.b[2], v25.b[0]
+; CHECK-GI-BASE-NEXT: mov v20.b[2], v22.b[0]
+; CHECK-GI-BASE-NEXT: ldr b25, [x1, #11]
+; CHECK-GI-BASE-NEXT: ldr b22, [x1, #19]
+; CHECK-GI-BASE-NEXT: mov v21.b[2], v24.b[0]
+; ...
[truncated]
|
This is the parent PR for #85042, which fixes the codegen issues regarding the large numbers of loads generated for the |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I was looking at G_TRUNC recently which is quite similar in terms of what is falling back. It was running into some of the same regressions that this sees in zext_v3i8_v3i64, among other issues.
@@ -5496,6 +5496,26 @@ LegalizerHelper::moreElementsVector(MachineInstr &MI, unsigned TypeIdx, | |||
return Legalized; | |||
} | |||
|
|||
case TargetOpcode::G_SEXT: |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
There is a case above that handles G_TRUNC and a few others that have different type sizes. It could be extended to handle these operations too.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Awesome, that works - though I have to set AArch64LegalizerInfo to use index 0 instead of 1.
Would it be possible to replace |
Created using spr 1.3.5
Hmm, what would the lower bound be in this case? |
The idea would be to (where we can) generally always produce legal types after legalization. Clamp all vector types to 64-bit or 128-bit, and after legalization only those types would remain, with the intermediate legalization artefacts combined away. So that means v8i8, v4i16 and v2i32 (v1i64 isn't as important in this case). |
I don't think legal vector arguments to *ext can be 128 bits wide, so this would basically mean clamping them to always be 64 bits wide. I feel this should be a separate patch. |
Yeah, the legal operations are essentially always of that given size. I agree that it's good not to try and solve every problem in a single ticket, and this is a nice step forward. LGTM |
No description provided.