-
Notifications
You must be signed in to change notification settings - Fork 14.3k
[AArch64][SVE] Use INS when moving elements from bottom 128b of SVE type #114034
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
Moving elements from a scalable vector to a fixed-length vector should use INS when we know that the extracted element is in the bottom 128-bits of the scalable vector. This avoids inserting unnecessary UMOV/FMOV instructions.
@llvm/pr-subscribers-backend-aarch64 Author: None (SpencerAbson) ChangesMoving elements from a scalable vector to a fixed-length vector should use INS (vector, element) when we know that the extracted element is in the bottom 128-bits of the scalable vector. This avoids inserting unnecessary UMOV/FMOV instructions. Patch is 28.13 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/114034.diff 4 Files Affected:
diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
index 2b69903b133fe3..f678ce1058bafd 100644
--- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -3367,6 +3367,47 @@ let Predicates = [HasSVEorSME] in {
(UMOVvi64 (v2i64 (EXTRACT_SUBREG ZPR:$vec, zsub)), VectorIndexD:$index)>;
} // End HasNEON
+ // Use INS (element) when moving an element from the bottom 128-bits of an SVE type to a NEON vector.
+ multiclass Neon_ins_sve_elt_pattern<ValueType NeonTy, ValueType NeonQTy, ValueType SVETy, ValueType ScalTy,
+ Operand IdxTy, Operand NarrowIdxTy, Instruction INS> {
+ // Insert into 128-bit NEON type from lowest 128-bits of SVE type
+ def : Pat<(NeonQTy (vector_insert V128:$src,
+ (ScalTy (vector_extract SVETy:$Rn, IdxTy:$idx_extract)),
+ (IdxTy:$idx_insert))),
+ (INS V128:$src, IdxTy:$idx_insert,
+ (NeonQTy (EXTRACT_SUBREG SVETy:$Rn, zsub)), IdxTy:$idx_extract)>;
+
+ // Insert into 64-bit NEON type from lowest 128-bits of SVE type
+ def : Pat<(NeonTy (vector_insert V64:$src,
+ (ScalTy (vector_extract SVETy:$Rn, IdxTy:$idx_extract)),
+ (NarrowIdxTy:$idx_insert))),
+ (EXTRACT_SUBREG
+ (INS
+ (INSERT_SUBREG (NeonQTy (IMPLICIT_DEF)), V64:$src, dsub), NarrowIdxTy:$idx_insert,
+ (NeonQTy (EXTRACT_SUBREG SVETy:$Rn, zsub)), IdxTy:$idx_extract),
+ dsub)>;
+ }
+
+ // Inserting into <1 x double/i64> will just create a new vector from the scalar value.
+ multiclass Neon_ins_64b_sve_elt_pattern<ValueType NeonTy, ValueType NeonQTy, ValueType SVETy,
+ ValueType ScalTy> {
+ // Insert into 128-bit NEON type from lowest 128-bits of SVE type
+ def : Pat<(NeonQTy (vector_insert V128:$src,
+ (ScalTy (vector_extract SVETy:$Rn, VectorIndexD:$idx_extract)),
+ (VectorIndexD:$idx_insert))),
+ (INSvi64lane
+ V128:$src, VectorIndexD:$idx_insert, (NeonQTy (EXTRACT_SUBREG SVETy:$Rn, zsub)),
+ VectorIndexD:$idx_extract)>;
+
+ // Insert into 64-bit NEON type from lowest 128-bits of SVE type
+ def : Pat<(NeonTy (scalar_to_vector
+ (ScalTy (vector_extract SVETy:$Rn, VectorIndexD:$idx_extract)))),
+ (EXTRACT_SUBREG
+ (INSvi64lane (IMPLICIT_DEF), 0, (NeonQTy (EXTRACT_SUBREG SVETy:$Rn, zsub)),
+ VectorIndexD:$idx_extract),
+ dsub)>;
+ }
+
let Predicates = [HasNEON] in {
def : Pat<(sext_inreg (vector_extract nxv16i8:$vec, VectorIndexB:$index), i8),
(SMOVvi8to32 (v16i8 (EXTRACT_SUBREG ZPR:$vec, zsub)), VectorIndexB:$index)>;
@@ -3380,6 +3421,15 @@ let Predicates = [HasSVEorSME] in {
def : Pat<(sext (i32 (vector_extract nxv4i32:$vec, VectorIndexS:$index))),
(SMOVvi32to64 (v4i32 (EXTRACT_SUBREG ZPR:$vec, zsub)), VectorIndexS:$index)>;
+
+ defm : Neon_ins_sve_elt_pattern<v8i8, v16i8, nxv16i8, i32, VectorIndexB, VectorIndexH, INSvi8lane>;
+ defm : Neon_ins_sve_elt_pattern<v4f16, v8f16, nxv8f16, f16, VectorIndexH, VectorIndexS, INSvi16lane>;
+ defm : Neon_ins_sve_elt_pattern<v4bf16, v8bf16, nxv8bf16, bf16, VectorIndexH, VectorIndexS, INSvi16lane>;
+ defm : Neon_ins_sve_elt_pattern<v4i16, v8i16, nxv8i16, i32, VectorIndexH, VectorIndexS, INSvi16lane>;
+ defm : Neon_ins_sve_elt_pattern<v2f32, v4f32, nxv4f32, f32, VectorIndexS, VectorIndexD, INSvi32lane>;
+ defm : Neon_ins_sve_elt_pattern<v2i32, v4i32, nxv4i32, i32, VectorIndexS, VectorIndexD, INSvi32lane>;
+ defm : Neon_ins_64b_sve_elt_pattern<v1f64, v2f64, nxv2f64, f64>;
+ defm : Neon_ins_64b_sve_elt_pattern<v1i64, v2i64, nxv2i64, i64>;
} // End HasNEON
// Extract first element from vector.
diff --git a/llvm/test/CodeGen/AArch64/neon-insert-sve-elt.ll b/llvm/test/CodeGen/AArch64/neon-insert-sve-elt.ll
new file mode 100644
index 00000000000000..0f4eec4fdfda1b
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/neon-insert-sve-elt.ll
@@ -0,0 +1,469 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve,+neon < %s | FileCheck %s
+
+; Inserting an element from the bottom 128-bits of an SVE type into a NEON vector should use INS (element) to
+; avoid pointless FMOV trips.
+
+; --------- extraction from nxv16i8
+
+define <8 x i8> @test_lane0_nxv16i8(<8 x i8> %a, <vscale x 16 x i8> %b) {
+; CHECK-LABEL: test_lane0_nxv16i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT: mov v0.b[0], v1.b[0]
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT: ret
+ %c = extractelement <vscale x 16 x i8> %b, i32 0
+ %d = insertelement <8 x i8> %a, i8 %c, i32 0
+ ret <8 x i8> %d
+}
+
+define <8 x i8> @test_lane15_nxv16i8(<8 x i8> %a, <vscale x 16 x i8> %b) {
+; CHECK-LABEL: test_lane15_nxv16i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT: mov v0.b[7], v1.b[15]
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT: ret
+ %c = extractelement <vscale x 16 x i8> %b, i32 15
+ %d = insertelement <8 x i8> %a, i8 %c, i32 7
+ ret <8 x i8> %d
+}
+
+define <16 x i8> @test_q_lane0_nxv16i8(<16 x i8> %a, <vscale x 16 x i8> %b) {
+; CHECK-LABEL: test_q_lane0_nxv16i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov v0.b[0], v1.b[0]
+; CHECK-NEXT: ret
+ %c = extractelement <vscale x 16 x i8> %b, i32 0
+ %d = insertelement <16 x i8> %a, i8 %c, i32 0
+ ret <16 x i8> %d
+}
+
+define <16 x i8> @test_q_lane15_nxv16i8(<16 x i8> %a, <vscale x 16 x i8> %b) {
+; CHECK-LABEL: test_q_lane15_nxv16i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov v0.b[15], v1.b[15]
+; CHECK-NEXT: ret
+ %c = extractelement <vscale x 16 x i8> %b, i32 15
+ %d = insertelement <16 x i8> %a, i8 %c, i32 15
+ ret <16 x i8> %d
+}
+
+; (negative test) Extracted element is not within Vn
+define <16 x i8> @test_q_lane16_nxv16i8(<16 x i8> %a, <vscale x 16 x i8> %b) {
+; CHECK-LABEL: test_q_lane16_nxv16i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z1.b, z1.b[16]
+; CHECK-NEXT: fmov w8, s1
+; CHECK-NEXT: mov v0.b[15], w8
+; CHECK-NEXT: ret
+ %c = extractelement <vscale x 16 x i8> %b, i32 16
+ %d = insertelement <16 x i8> %a, i8 %c, i32 15
+ ret <16 x i8> %d
+}
+
+; --------- extraction from nxv8f16
+
+define <4 x half> @test_lane0_nxv8f16(<4 x half> %a, <vscale x 8 x half> %b) {
+; CHECK-LABEL: test_lane0_nxv8f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT: mov v0.h[0], v1.h[0]
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT: ret
+ %c = extractelement <vscale x 8 x half> %b, i32 0
+ %d = insertelement <4 x half> %a, half %c, i32 0
+ ret <4 x half> %d
+}
+
+define <4 x half> @test_lane7_nxv8f16(<4 x half> %a, <vscale x 8 x half> %b) {
+; CHECK-LABEL: test_lane7_nxv8f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT: mov v0.h[3], v1.h[7]
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT: ret
+ %c = extractelement <vscale x 8 x half> %b, i32 7
+ %d = insertelement <4 x half> %a, half %c, i32 3
+ ret <4 x half> %d
+}
+
+define <8 x half> @test_q_lane0_nxv8f16(<8 x half> %a, <vscale x 8 x half> %b) {
+; CHECK-LABEL: test_q_lane0_nxv8f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov v0.h[0], v1.h[0]
+; CHECK-NEXT: ret
+ %c = extractelement <vscale x 8 x half> %b, i32 0
+ %d = insertelement <8 x half> %a, half %c, i32 0
+ ret <8 x half> %d
+}
+
+define <8 x half> @test_q_lane7_nxv8f16(<8 x half> %a, <vscale x 8 x half> %b) {
+; CHECK-LABEL: test_q_lane7_nxv8f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov v0.h[7], v1.h[7]
+; CHECK-NEXT: ret
+ %c = extractelement <vscale x 8 x half> %b, i32 7
+ %d = insertelement <8 x half> %a, half %c, i32 7
+ ret <8 x half> %d
+}
+
+; (negative test) Extracted element is not within Vn
+define <8 x half> @test_q_lane8_nxv8f16(<8 x half> %a, <vscale x 8 x half> %b) {
+; CHECK-LABEL: test_q_lane8_nxv8f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z1.h, z1.h[8]
+; CHECK-NEXT: mov v0.h[7], v1.h[0]
+; CHECK-NEXT: ret
+ %c = extractelement <vscale x 8 x half> %b, i32 8
+ %d = insertelement <8 x half> %a, half %c, i32 7
+ ret <8 x half> %d
+}
+
+; --------- extraction from nxv8bf16
+
+define <4 x bfloat> @test_lane0_nxv8bf16(<4 x bfloat> %a, <vscale x 8 x bfloat> %b) {
+; CHECK-LABEL: test_lane0_nxv8bf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT: mov v0.h[0], v1.h[0]
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT: ret
+ %c = extractelement <vscale x 8 x bfloat> %b, i32 0
+ %d = insertelement <4 x bfloat> %a, bfloat %c, i32 0
+ ret <4 x bfloat> %d
+}
+
+define <4 x bfloat> @test_lane7_nxv8bf16(<4 x bfloat> %a, <vscale x 8 x bfloat> %b) {
+; CHECK-LABEL: test_lane7_nxv8bf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT: mov v0.h[3], v1.h[7]
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT: ret
+ %c = extractelement <vscale x 8 x bfloat> %b, i32 7
+ %d = insertelement <4 x bfloat> %a, bfloat %c, i32 3
+ ret <4 x bfloat> %d
+}
+
+define <8 x bfloat> @test_q_lane0_nxv8bf16(<8 x bfloat> %a, <vscale x 8 x bfloat> %b) {
+; CHECK-LABEL: test_q_lane0_nxv8bf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov v0.h[0], v1.h[0]
+; CHECK-NEXT: ret
+ %c = extractelement <vscale x 8 x bfloat> %b, i32 0
+ %d = insertelement <8 x bfloat> %a, bfloat %c, i32 0
+ ret <8 x bfloat> %d
+}
+
+define <8 x bfloat> @test_q_lane7_nxv8bf16(<8 x bfloat> %a, <vscale x 8 x bfloat> %b) {
+; CHECK-LABEL: test_q_lane7_nxv8bf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov v0.h[7], v1.h[7]
+; CHECK-NEXT: ret
+ %c = extractelement <vscale x 8 x bfloat> %b, i32 7
+ %d = insertelement <8 x bfloat> %a, bfloat %c, i32 7
+ ret <8 x bfloat> %d
+}
+
+; (negative test) Extracted element is not within Vn
+define <8 x bfloat> @test_q_lane8_nxv8bf16(<8 x bfloat> %a, <vscale x 8 x bfloat> %b) {
+; CHECK-LABEL: test_q_lane8_nxv8bf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z1.h, z1.h[8]
+; CHECK-NEXT: mov v0.h[7], v1.h[0]
+; CHECK-NEXT: ret
+ %c = extractelement <vscale x 8 x bfloat> %b, i32 8
+ %d = insertelement <8 x bfloat> %a, bfloat %c, i32 7
+ ret <8 x bfloat> %d
+}
+
+; --------- extraction from nxv8i16
+
+define <4 x i16> @test_lane0_nxv8i16(<4 x i16> %a, <vscale x 8 x i16> %b) {
+; CHECK-LABEL: test_lane0_nxv8i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT: mov v0.h[0], v1.h[0]
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT: ret
+ %c = extractelement <vscale x 8 x i16> %b, i32 0
+ %d = insertelement <4 x i16> %a, i16 %c, i32 0
+ ret <4 x i16> %d
+}
+
+define <4 x i16> @test_lane7_nxv8i16(<4 x i16> %a, <vscale x 8 x i16> %b) {
+; CHECK-LABEL: test_lane7_nxv8i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT: mov v0.h[3], v1.h[7]
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT: ret
+ %c = extractelement <vscale x 8 x i16> %b, i32 7
+ %d = insertelement <4 x i16> %a, i16 %c, i32 3
+ ret <4 x i16> %d
+}
+
+define <8 x i16> @test_q_lane0_nxv8i16(<8 x i16> %a, <vscale x 8 x i16> %b) {
+; CHECK-LABEL: test_q_lane0_nxv8i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov v0.h[0], v1.h[0]
+; CHECK-NEXT: ret
+ %c = extractelement <vscale x 8 x i16> %b, i32 0
+ %d = insertelement <8 x i16> %a, i16 %c, i32 0
+ ret <8 x i16> %d
+}
+
+define <8 x i16> @test_q_lane7_nxv8i16(<8 x i16> %a, <vscale x 8 x i16> %b) {
+; CHECK-LABEL: test_q_lane7_nxv8i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov v0.h[7], v1.h[7]
+; CHECK-NEXT: ret
+ %c = extractelement <vscale x 8 x i16> %b, i32 7
+ %d = insertelement <8 x i16> %a, i16 %c, i32 7
+ ret <8 x i16> %d
+}
+
+; (negative test) Extracted element is not within Vn
+define <8 x i16> @test_q_lane8_nxv8i16(<8 x i16> %a, <vscale x 8 x i16> %b) {
+; CHECK-LABEL: test_q_lane8_nxv8i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z1.h, z1.h[8]
+; CHECK-NEXT: fmov w8, s1
+; CHECK-NEXT: mov v0.h[7], w8
+; CHECK-NEXT: ret
+ %c = extractelement <vscale x 8 x i16> %b, i32 8
+ %d = insertelement <8 x i16> %a, i16 %c, i32 7
+ ret <8 x i16> %d
+}
+
+; --------- extraction from nxv4f32
+
+define <2 x float> @test_lane0_nxv4f32(<2 x float> %a, <vscale x 4 x float> %b) {
+; CHECK-LABEL: test_lane0_nxv4f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT: mov v0.s[0], v1.s[0]
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT: ret
+ %c = extractelement <vscale x 4 x float> %b, i32 0
+ %d = insertelement <2 x float> %a, float %c, i32 0
+ ret <2 x float> %d
+}
+
+define <2 x float> @test_lane3_nxv4f32(<2 x float> %a, <vscale x 4 x float> %b) {
+; CHECK-LABEL: test_lane3_nxv4f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT: mov v0.s[1], v1.s[3]
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT: ret
+ %c = extractelement <vscale x 4 x float> %b, i32 3
+ %d = insertelement <2 x float> %a, float %c, i32 1
+ ret <2 x float> %d
+}
+
+define <4 x float> @test_q_lane0_nxv4f32(<4 x float> %a, <vscale x 4 x float> %b) {
+; CHECK-LABEL: test_q_lane0_nxv4f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov v0.s[0], v1.s[0]
+; CHECK-NEXT: ret
+ %c = extractelement <vscale x 4 x float> %b, i32 0
+ %d = insertelement <4 x float> %a, float %c, i32 0
+ ret <4 x float> %d
+}
+
+define <4 x float> @test_q_lane3_nxv4f32(<4 x float> %a, <vscale x 4 x float> %b) {
+; CHECK-LABEL: test_q_lane3_nxv4f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov v0.s[3], v1.s[3]
+; CHECK-NEXT: ret
+ %c = extractelement <vscale x 4 x float> %b, i32 3
+ %d = insertelement <4 x float> %a, float %c, i32 3
+ ret <4 x float> %d
+}
+
+; (negative test) Extracted element is not within Vn
+define <4 x float> @test_q_lane4_nxv4f32(<4 x float> %a, <vscale x 4 x float> %b) {
+; CHECK-LABEL: test_q_lane4_nxv4f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z1.s, z1.s[4]
+; CHECK-NEXT: mov v0.s[3], v1.s[0]
+; CHECK-NEXT: ret
+ %c = extractelement <vscale x 4 x float> %b, i32 4
+ %d = insertelement <4 x float> %a, float %c, i32 3
+ ret <4 x float> %d
+}
+
+; --------- extraction from nxv4i32
+
+define <2 x i32> @test_lane0_nxv4i32(<2 x i32> %a, <vscale x 4 x i32> %b) {
+; CHECK-LABEL: test_lane0_nxv4i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT: mov v0.s[0], v1.s[0]
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT: ret
+ %c = extractelement <vscale x 4 x i32> %b, i32 0
+ %d = insertelement <2 x i32> %a, i32 %c, i32 0
+ ret <2 x i32> %d
+}
+
+define <2 x i32> @test_lane3_nxv4i32(<2 x i32> %a, <vscale x 4 x i32> %b) {
+; CHECK-LABEL: test_lane3_nxv4i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT: mov v0.s[1], v1.s[3]
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT: ret
+ %c = extractelement <vscale x 4 x i32> %b, i32 3
+ %d = insertelement <2 x i32> %a, i32 %c, i32 1
+ ret <2 x i32> %d
+}
+
+define <4 x i32> @test_q_lane0_nxv4i32(<4 x i32> %a, <vscale x 4 x i32> %b) {
+; CHECK-LABEL: test_q_lane0_nxv4i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov v0.s[0], v1.s[0]
+; CHECK-NEXT: ret
+ %c = extractelement <vscale x 4 x i32> %b, i32 0
+ %d = insertelement <4 x i32> %a, i32 %c, i32 0
+ ret <4 x i32> %d
+}
+
+define <4 x i32> @test_q_lane3_nxv4i32(<4 x i32> %a, <vscale x 4 x i32> %b) {
+; CHECK-LABEL: test_q_lane3_nxv4i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov v0.s[3], v1.s[3]
+; CHECK-NEXT: ret
+ %c = extractelement <vscale x 4 x i32> %b, i32 3
+ %d = insertelement <4 x i32> %a, i32 %c, i32 3
+ ret <4 x i32> %d
+}
+
+; (negative test) Extracted element is not within Vn
+define <4 x i32> @test_q_lane4_nxv4i32(<4 x i32> %a, <vscale x 4 x i32> %b) {
+; CHECK-LABEL: test_q_lane4_nxv4i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z1.s, z1.s[4]
+; CHECK-NEXT: fmov w8, s1
+; CHECK-NEXT: mov v0.s[3], w8
+; CHECK-NEXT: ret
+ %c = extractelement <vscale x 4 x i32> %b, i32 4
+ %d = insertelement <4 x i32> %a, i32 %c, i32 3
+ ret <4 x i32> %d
+}
+
+; --------- extraction from nxv2f64
+
+define <1 x double> @test_lane0_nxv2f64(<1 x double> %a, <vscale x 2 x double> %b) {
+; CHECK-LABEL: test_lane0_nxv2f64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov v0.d[0], v1.d[0]
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT: ret
+ %c = extractelement <vscale x 2 x double> %b, i32 0
+ %d = insertelement <1 x double> %a, double %c, i32 0
+ ret <1 x double> %d
+}
+
+define <1 x double> @test_lane1_nxv2f64(<1 x double> %a, <vscale x 2 x double> %b) {
+; CHECK-LABEL: test_lane1_nxv2f64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov v0.d[0], v1.d[1]
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT: ret
+ %c = extractelement <vscale x 2 x double> %b, i32 1
+ %d = insertelement <1 x double> %a, double %c, i32 0
+ ret <1 x double> %d
+}
+
+define <2 x double> @test_q_lane0_nxv2f64(<2 x double> %a, <vscale x 2 x double> %b) {
+; CHECK-LABEL: test_q_lane0_nxv2f64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov v0.d[0], v1.d[0]
+; CHECK-NEXT: ret
+ %c = extractelement <vscale x 2 x double> %b, i32 0
+ %d = insertelement <2 x double> %a, double %c, i32 0
+ ret <2 x double> %d
+}
+
+define <2 x double> @test_q_lane1_nxv2f64(<2 x double> %a, <vscale x 2 x double> %b) {
+; CHECK-LABEL: test_q_lane1_nxv2f64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov v0.d[1], v1.d[1]
+; CHECK-NEXT: ret
+ %c = extractelement <vscale x 2 x double> %b, i32 1
+ %d = insertelement <2 x double> %a, double %c, i32 1
+ ret <2 x double> %d
+}
+
+; (negative test) Extracted element is not within Vn
+define <2 x double> @test_q_lane2_nxv2f64(<2 x double> %a, <vscale x 2 x double> %b) {
+; CHECK-LABEL: test_q_lane2_nxv2f64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z1.d, z1.d[2]
+; CHECK-NEXT: mov v0.d[1], v1.d[0]
+; CHECK-NEXT: ret
+ %c = extractelement <vscale x 2 x double> %b, i32 2
+ %d = insertelement <2 x double> %a, double %c, i32 1
+ ret <2 x double> %d
+}
+
+; --------- extraction from nxv2i64
+
+define <1 x i64> @test_lane0_nxv2i64(<1 x i64> %a, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: test_lane0_nxv2i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov v0.d[0], v1.d[0]
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT: ret
+ %c = extractelement <vscale x 2 x i64> %b, i32 0
+ %d = insertelement <1 x i64> %a, i64 %c, i32 0
+ ret <1 x i64> %d
+}
+
+define <1 x i64> @test_lane1_nxv2i64(<1 x i64> %a, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: test_lane1_nxv2i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov v0.d[0], v1.d[1]
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT: ret
+ %c = extractelement <vscale x 2 x i64> %b, i32 1
+ %d = insertelement <1 x i64> %a, i64 %c, i32 0
+ ret <1 x i64> %d
+}
+
+define <2 x i64> @test_q_lane0_nxv2i64(<2 x i64> %a, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: test_q_lane0_nxv2i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov v0.d[0], v1.d[0]
+; CHECK-NEXT: ret
+ %c = extractelement <vscale x 2 x i64> %b, i32 0
+ %d = insertelement <2 x i64> %a, i64 %c, i32 0
+ ret <2 x i64> %d
+}
+
+define <2 x i64> @test_q_lane1_nxv2i64(<2 x i64> %a, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: test_q_lane1_nxv2i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov v0.d[1], v1.d[1]
+; CHECK-NEXT: ret
+ %c = extractelement <vscale x 2 ...
[truncated]
|
Is it possible to extend the existing |
…ype (llvm#114034) Moving elements from a scalable vector to a fixed-length vector should use [INS (vector, element)](https://developer.arm.com/documentation/100069/0606/SIMD-Vector-Instructions/INS--vector--element-) when we know that the extracted element is in the bottom 128-bits of the scalable vector. This avoids inserting unnecessary UMOV/FMOV instructions.
…ype (llvm#114034) Moving elements from a scalable vector to a fixed-length vector should use [INS (vector, element)](https://developer.arm.com/documentation/100069/0606/SIMD-Vector-Instructions/INS--vector--element-) when we know that the extracted element is in the bottom 128-bits of the scalable vector. This avoids inserting unnecessary UMOV/FMOV instructions.
Moving elements from a scalable vector to a fixed-length vector should use INS (vector, element) when we know that the extracted element is in the bottom 128-bits of the scalable vector. This avoids inserting unnecessary UMOV/FMOV instructions.