Skip to content

Commit c2bd5c2

Browse files
authored
[AArch64] Avoid GPR trip when moving truncated i32 vector elements (#114541)
This patch implements a DAG combine whereby
```
a: v2i64 = ...
b: i64 = extract_vector_elt a, Constant:i64<n>
c: i32 = truncate b
```
becomes
```
a: v2i64 = ...
b: v4i32 = AArch64ISD::NVCAST a
c: i32 = extract_vector_elt b, Constant:i64<2n>
```
The primary goal of this work is to enable the use of [INS (element)](https://developer.arm.com/documentation/ddi0602/2024-09/SIMD-FP-Instructions/INS--element---Insert-vector-element-from-another-vector-element-?lang=en) when moving a truncated i32 element between vectors. This combine canonicalises the structure of the DAG for all legal instances of the pattern above (by removing the explicit `trunc` operator in this specific case), allowing us to take advantage of existing ISEL patterns for this behavior.
1 parent 919aead commit c2bd5c2

16 files changed

+250
-146
lines changed

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

Lines changed: 35 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -20945,17 +20945,47 @@ static SDValue performBuildVectorCombine(SDNode *N,
2094520945
return SDValue();
2094620946
}
2094720947

20948-
static SDValue performTruncateCombine(SDNode *N,
20949-
SelectionDAG &DAG) {
20948+
static SDValue performTruncateCombine(SDNode *N, SelectionDAG &DAG,
20949+
TargetLowering::DAGCombinerInfo &DCI) {
20950+
SDLoc DL(N);
2095020951
EVT VT = N->getValueType(0);
2095120952
SDValue N0 = N->getOperand(0);
2095220953
if (VT.isFixedLengthVector() && VT.is64BitVector() && N0.hasOneUse() &&
2095320954
N0.getOpcode() == AArch64ISD::DUP) {
2095420955
SDValue Op = N0.getOperand(0);
2095520956
if (VT.getScalarType() == MVT::i32 &&
2095620957
N0.getOperand(0).getValueType().getScalarType() == MVT::i64)
20957-
Op = DAG.getNode(ISD::TRUNCATE, SDLoc(N), MVT::i32, Op);
20958-
return DAG.getNode(N0.getOpcode(), SDLoc(N), VT, Op);
20958+
Op = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Op);
20959+
return DAG.getNode(N0.getOpcode(), DL, VT, Op);
20960+
}
20961+
20962+
// Performing the following combine produces a preferable form for ISEL.
20963+
// i32 (trunc (extract Vi64, idx)) -> i32 (extract (nvcast Vi32), idx*2)
20964+
if (DCI.isAfterLegalizeDAG() && N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
20965+
N0.hasOneUse()) {
20966+
SDValue Op = N0.getOperand(0);
20967+
SDValue ExtractIndexNode = N0.getOperand(1);
20968+
if (!isa<ConstantSDNode>(ExtractIndexNode))
20969+
return SDValue();
20970+
20971+
// For a legal DAG, EXTRACT_VECTOR_ELT can only have produced an i32 or i64.
20972+
// So we can only expect: i32 (trunc (i64 (extract Vi64, idx))).
20973+
assert((VT == MVT::i32 && N0.getValueType() == MVT::i64) &&
20974+
"Unexpected legalisation result!");
20975+
20976+
EVT SrcVectorType = Op.getValueType();
20977+
// We also assume that SrcVectorType cannot be a V64 (see
20978+
// LowerEXTRACT_VECTOR_ELT).
20979+
assert((SrcVectorType == MVT::v2i64 || SrcVectorType == MVT::nxv2i64) &&
20980+
"Unexpected legalisation result!");
20981+
20982+
unsigned ExtractIndex =
20983+
cast<ConstantSDNode>(ExtractIndexNode)->getZExtValue();
20984+
MVT CastVT = SrcVectorType.isScalableVector() ? MVT::nxv4i32 : MVT::v4i32;
20985+
20986+
Op = DAG.getNode(AArch64ISD::NVCAST, DL, CastVT, Op);
20987+
return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Op,
20988+
DAG.getVectorIdxConstant(ExtractIndex * 2, DL));
2095920989
}
2096020990

2096120991
return SDValue();
@@ -26258,7 +26288,7 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
2625826288
case ISD::BUILD_VECTOR:
2625926289
return performBuildVectorCombine(N, DCI, DAG);
2626026290
case ISD::TRUNCATE:
26261-
return performTruncateCombine(N, DAG);
26291+
return performTruncateCombine(N, DAG, DCI);
2626226292
case AArch64ISD::ANDS:
2626326293
return performFlagSettingCombine(N, DCI, ISD::AND);
2626426294
case AArch64ISD::ADC:

llvm/lib/Target/AArch64/AArch64InstrInfo.td

Lines changed: 20 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -6977,6 +6977,12 @@ def : Pat<(v4f32 (AArch64duplane32 (v4f32 V128:$Rn), VectorIndexS:$imm)),
69776977
def : Pat<(v2f64 (AArch64duplane64 (v2f64 V128:$Rn), VectorIndexD:$imm)),
69786978
(DUPv2i64lane V128:$Rn, VectorIndexD:$imm)>;
69796979

6980+
// Also covers DUP (truncate i64 to i32)
6981+
def : Pat<(v2i32 (AArch64dup (i32 (extractelt (v4i32 V128:$Rn), imm:$idx)))),
6982+
(DUPv2i32lane V128:$Rn, imm:$idx)>;
6983+
def : Pat<(v4i32 (AArch64dup (i32 (extractelt (v4i32 V128:$Rn), imm:$idx)))),
6984+
(DUPv4i32lane V128:$Rn, imm:$idx)>;
6985+
69806986
// If there's an (AArch64dup (vector_extract ...) ...), we can use a duplane
69816987
// instruction even if the types don't match: we just have to remap the lane
69826988
// carefully. N.b. this trick only applies to truncations.
@@ -6990,44 +6996,20 @@ def VecIndex_x8 : SDNodeXForm<imm, [{
69906996
return CurDAG->getTargetConstant(8 * N->getZExtValue(), SDLoc(N), MVT::i64);
69916997
}]>;
69926998

6993-
multiclass DUPWithTruncPats<ValueType ResVT, ValueType Src64VT,
6994-
ValueType Src128VT, ValueType ScalVT,
6995-
Instruction DUP, SDNodeXForm IdxXFORM> {
6996-
def : Pat<(ResVT (AArch64dup (ScalVT (vector_extract (Src128VT V128:$Rn),
6997-
imm:$idx)))),
6998-
(DUP V128:$Rn, (IdxXFORM imm:$idx))>;
6999-
7000-
def : Pat<(ResVT (AArch64dup (ScalVT (vector_extract (Src64VT V64:$Rn),
7001-
imm:$idx)))),
7002-
(DUP (SUBREG_TO_REG (i64 0), V64:$Rn, dsub), (IdxXFORM imm:$idx))>;
7003-
}
7004-
7005-
defm : DUPWithTruncPats<v8i8, v4i16, v8i16, i32, DUPv8i8lane, VecIndex_x2>;
7006-
defm : DUPWithTruncPats<v8i8, v2i32, v4i32, i32, DUPv8i8lane, VecIndex_x4>;
7007-
defm : DUPWithTruncPats<v4i16, v2i32, v4i32, i32, DUPv4i16lane, VecIndex_x2>;
7008-
7009-
defm : DUPWithTruncPats<v16i8, v4i16, v8i16, i32, DUPv16i8lane, VecIndex_x2>;
7010-
defm : DUPWithTruncPats<v16i8, v2i32, v4i32, i32, DUPv16i8lane, VecIndex_x4>;
7011-
defm : DUPWithTruncPats<v8i16, v2i32, v4i32, i32, DUPv8i16lane, VecIndex_x2>;
7012-
7013-
multiclass DUPWithTrunci64Pats<ValueType ResVT, Instruction DUP,
7014-
SDNodeXForm IdxXFORM> {
7015-
def : Pat<(ResVT (AArch64dup (i32 (trunc (extractelt (v2i64 V128:$Rn),
7016-
imm:$idx))))),
7017-
(DUP V128:$Rn, (IdxXFORM imm:$idx))>;
7018-
7019-
def : Pat<(ResVT (AArch64dup (i32 (trunc (extractelt (v1i64 V64:$Rn),
7020-
imm:$idx))))),
7021-
(DUP (SUBREG_TO_REG (i64 0), V64:$Rn, dsub), (IdxXFORM imm:$idx))>;
7022-
}
7023-
7024-
defm : DUPWithTrunci64Pats<v8i8, DUPv8i8lane, VecIndex_x8>;
7025-
defm : DUPWithTrunci64Pats<v4i16, DUPv4i16lane, VecIndex_x4>;
7026-
defm : DUPWithTrunci64Pats<v2i32, DUPv2i32lane, VecIndex_x2>;
7027-
7028-
defm : DUPWithTrunci64Pats<v16i8, DUPv16i8lane, VecIndex_x8>;
7029-
defm : DUPWithTrunci64Pats<v8i16, DUPv8i16lane, VecIndex_x4>;
7030-
defm : DUPWithTrunci64Pats<v4i32, DUPv4i32lane, VecIndex_x2>;
6999+
class DUPWithTruncPat<ValueType ResVT, ValueType SrcVT, ValueType ScalVT,
7000+
Instruction DUP, SDNodeXForm IdxXFORM>
7001+
: Pat<(ResVT (AArch64dup (ScalVT (vector_extract (SrcVT V128:$Rn), imm:$idx)))),
7002+
(DUP V128:$Rn, (IdxXFORM imm:$idx))>;
7003+
7004+
// DUP (truncate i16 to i8)
7005+
def : DUPWithTruncPat<v8i8, v8i16, i32, DUPv8i8lane, VecIndex_x2>;
7006+
def : DUPWithTruncPat<v16i8, v8i16, i32, DUPv16i8lane, VecIndex_x2>;
7007+
// DUP (truncate i32/i64 to i8)
7008+
def : DUPWithTruncPat<v8i8, v4i32, i32, DUPv8i8lane, VecIndex_x4>;
7009+
def : DUPWithTruncPat<v16i8, v4i32, i32, DUPv16i8lane, VecIndex_x4>;
7010+
// DUP (truncate i32/i64 to i16)
7011+
def : DUPWithTruncPat<v4i16, v4i32, i32, DUPv4i16lane, VecIndex_x2>;
7012+
def : DUPWithTruncPat<v8i16, v4i32, i32, DUPv8i16lane, VecIndex_x2>;
70317013

70327014
// SMOV and UMOV definitions, with some extra patterns for convenience
70337015
defm SMOV : SMov;

llvm/test/CodeGen/AArch64/aarch64-neon-vector-insert-uaddlv.ll

Lines changed: 12 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -384,9 +384,9 @@ define void @insert_vec_v4i16_uaddlv_from_v4i32(ptr %0) {
384384
; CHECK-NEXT: movi.2d v1, #0000000000000000
385385
; CHECK-NEXT: uaddlv.4s d0, v0
386386
; CHECK-NEXT: mov.h v1[0], v0[0]
387-
; CHECK-NEXT: ushll.4s v0, v1, #0
388-
; CHECK-NEXT: ucvtf.4s v0, v0
389-
; CHECK-NEXT: str q0, [x0]
387+
; CHECK-NEXT: ushll.4s v1, v1, #0
388+
; CHECK-NEXT: ucvtf.4s v1, v1
389+
; CHECK-NEXT: str q1, [x0]
390390
; CHECK-NEXT: ret
391391

392392
entry:
@@ -403,13 +403,13 @@ define void @insert_vec_v16i16_uaddlv_from_v4i32(ptr %0) {
403403
; CHECK: ; %bb.0: ; %entry
404404
; CHECK-NEXT: movi.2d v0, #0000000000000000
405405
; CHECK-NEXT: movi.2d v1, #0000000000000000
406+
; CHECK-NEXT: movi.2d v2, #0000000000000000
406407
; CHECK-NEXT: uaddlv.4s d0, v0
408+
; CHECK-NEXT: stp q2, q2, [x0, #32]
407409
; CHECK-NEXT: mov.h v1[0], v0[0]
408-
; CHECK-NEXT: movi.2d v0, #0000000000000000
409410
; CHECK-NEXT: ushll.4s v1, v1, #0
410-
; CHECK-NEXT: stp q0, q0, [x0, #32]
411411
; CHECK-NEXT: ucvtf.4s v1, v1
412-
; CHECK-NEXT: stp q1, q0, [x0]
412+
; CHECK-NEXT: stp q1, q2, [x0]
413413
; CHECK-NEXT: ret
414414

415415
entry:
@@ -430,9 +430,9 @@ define void @insert_vec_v8i8_uaddlv_from_v4i32(ptr %0) {
430430
; CHECK-NEXT: uaddlv.4s d0, v0
431431
; CHECK-NEXT: mov.h v1[0], v0[0]
432432
; CHECK-NEXT: bic.4h v1, #255, lsl #8
433-
; CHECK-NEXT: ushll.4s v0, v1, #0
434-
; CHECK-NEXT: ucvtf.4s v0, v0
435-
; CHECK-NEXT: str q0, [x0]
433+
; CHECK-NEXT: ushll.4s v1, v1, #0
434+
; CHECK-NEXT: ucvtf.4s v1, v1
435+
; CHECK-NEXT: str q1, [x0]
436436
; CHECK-NEXT: ret
437437

438438
entry:
@@ -449,14 +449,14 @@ define void @insert_vec_v16i8_uaddlv_from_v4i32(ptr %0) {
449449
; CHECK: ; %bb.0: ; %entry
450450
; CHECK-NEXT: movi.2d v0, #0000000000000000
451451
; CHECK-NEXT: movi.2d v1, #0000000000000000
452+
; CHECK-NEXT: movi.2d v2, #0000000000000000
452453
; CHECK-NEXT: uaddlv.4s d0, v0
454+
; CHECK-NEXT: stp q2, q2, [x0, #32]
453455
; CHECK-NEXT: mov.h v1[0], v0[0]
454-
; CHECK-NEXT: movi.2d v0, #0000000000000000
455456
; CHECK-NEXT: bic.4h v1, #255, lsl #8
456-
; CHECK-NEXT: stp q0, q0, [x0, #32]
457457
; CHECK-NEXT: ushll.4s v1, v1, #0
458458
; CHECK-NEXT: ucvtf.4s v1, v1
459-
; CHECK-NEXT: stp q1, q0, [x0]
459+
; CHECK-NEXT: stp q1, q2, [x0]
460460
; CHECK-NEXT: ret
461461

462462
entry:
Lines changed: 136 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,136 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
2+
; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve,+neon < %s | FileCheck %s
3+
4+
; Inserting a truncated (i64 to i32) element from the bottom 128-bits of any vector type into a NEON vector should use INS (element) of the
5+
; truncated size to avoid pointless GPR trips.
6+
7+
8+
define <2 x i32> @test_s_trunc_d_lane0(<2 x i32> %a, <1 x i64> %b) {
9+
; CHECK-LABEL: test_s_trunc_d_lane0:
10+
; CHECK: // %bb.0:
11+
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
12+
; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
13+
; CHECK-NEXT: mov v0.s[0], v1.s[0]
14+
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
15+
; CHECK-NEXT: ret
16+
%c = extractelement <1 x i64> %b, i32 0
17+
%d = trunc i64 %c to i32
18+
%e = insertelement <2 x i32> %a, i32 %d, i64 0
19+
ret <2 x i32> %e
20+
}
21+
22+
define <2 x i32> @test_s_trunc_d_qlane1(<2 x i32> %a, <2 x i64> %b) {
23+
; CHECK-LABEL: test_s_trunc_d_qlane1:
24+
; CHECK: // %bb.0:
25+
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
26+
; CHECK-NEXT: mov v0.s[0], v1.s[2]
27+
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
28+
; CHECK-NEXT: ret
29+
%c = extractelement <2 x i64> %b, i32 1
30+
%d = trunc i64 %c to i32
31+
%e = insertelement <2 x i32> %a, i32 %d, i64 0
32+
ret <2 x i32> %e
33+
}
34+
35+
define <4 x i32> @test_qs_trunc_d_lane0(<4 x i32> %a, <1 x i64> %b) {
36+
; CHECK-LABEL: test_qs_trunc_d_lane0:
37+
; CHECK: // %bb.0:
38+
; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
39+
; CHECK-NEXT: mov v0.s[0], v1.s[0]
40+
; CHECK-NEXT: ret
41+
%c = extractelement <1 x i64> %b, i32 0
42+
%d = trunc i64 %c to i32
43+
%e = insertelement <4 x i32> %a, i32 %d, i64 0
44+
ret <4 x i32> %e
45+
}
46+
47+
define <4 x i32> @test_qs_trunc_d_qlane1(<4 x i32> %a, <2 x i64> %b) {
48+
; CHECK-LABEL: test_qs_trunc_d_qlane1:
49+
; CHECK: // %bb.0:
50+
; CHECK-NEXT: mov v0.s[3], v1.s[2]
51+
; CHECK-NEXT: ret
52+
%c = extractelement <2 x i64> %b, i32 1
53+
%d = trunc i64 %c to i32
54+
%e = insertelement <4 x i32> %a, i32 %d, i64 3
55+
ret <4 x i32> %e
56+
}
57+
58+
; ---- From the bottom 128b of an SVE vector
59+
60+
define <2 x i32> @test_s_trunc_dsve_lane0(<2 x i32> %a, <vscale x 2 x i64> %b) {
61+
; CHECK-LABEL: test_s_trunc_dsve_lane0:
62+
; CHECK: // %bb.0:
63+
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
64+
; CHECK-NEXT: mov v0.s[0], v1.s[0]
65+
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
66+
; CHECK-NEXT: ret
67+
%c = extractelement <vscale x 2 x i64> %b, i32 0
68+
%d = trunc i64 %c to i32
69+
%e = insertelement <2 x i32> %a, i32 %d, i64 0
70+
ret <2 x i32> %e
71+
}
72+
73+
define <2 x i32> @test_s_trunc_dsve_lane1(<2 x i32> %a, <vscale x 2 x i64> %b) {
74+
; CHECK-LABEL: test_s_trunc_dsve_lane1:
75+
; CHECK: // %bb.0:
76+
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
77+
; CHECK-NEXT: mov v0.s[1], v1.s[2]
78+
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
79+
; CHECK-NEXT: ret
80+
%c = extractelement <vscale x 2 x i64> %b, i32 1
81+
%d = trunc i64 %c to i32
82+
%e = insertelement <2 x i32> %a, i32 %d, i64 1
83+
ret <2 x i32> %e
84+
}
85+
86+
; (negative test) Extracted element is not within V-register.
87+
define <2 x i32> @test_s_trunc_dsve_lane2(<2 x i32> %a, <vscale x 2 x i64> %b) {
88+
; CHECK-LABEL: test_s_trunc_dsve_lane2:
89+
; CHECK: // %bb.0:
90+
; CHECK-NEXT: mov z1.s, z1.s[4]
91+
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
92+
; CHECK-NEXT: fmov w8, s1
93+
; CHECK-NEXT: mov v0.s[1], w8
94+
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
95+
; CHECK-NEXT: ret
96+
%c = extractelement <vscale x 2 x i64> %b, i32 2
97+
%d = trunc i64 %c to i32
98+
%e = insertelement <2 x i32> %a, i32 %d, i64 1
99+
ret <2 x i32> %e
100+
}
101+
102+
define <4 x i32> @test_qs_trunc_dsve_lane0(<4 x i32> %a, <vscale x 2 x i64> %b) {
103+
; CHECK-LABEL: test_qs_trunc_dsve_lane0:
104+
; CHECK: // %bb.0:
105+
; CHECK-NEXT: mov v0.s[0], v1.s[0]
106+
; CHECK-NEXT: ret
107+
%c = extractelement <vscale x 2 x i64> %b, i32 0
108+
%d = trunc i64 %c to i32
109+
%e = insertelement <4 x i32> %a, i32 %d, i64 0
110+
ret <4 x i32> %e
111+
}
112+
113+
define <4 x i32> @test_qs_trunc_dsve_lane1(<4 x i32> %a, <vscale x 2 x i64> %b) {
114+
; CHECK-LABEL: test_qs_trunc_dsve_lane1:
115+
; CHECK: // %bb.0:
116+
; CHECK-NEXT: mov v0.s[3], v1.s[2]
117+
; CHECK-NEXT: ret
118+
%c = extractelement <vscale x 2 x i64> %b, i32 1
119+
%d = trunc i64 %c to i32
120+
%e = insertelement <4 x i32> %a, i32 %d, i64 3
121+
ret <4 x i32> %e
122+
}
123+
124+
; (negative test) Extracted element is not within V-register.
125+
define <4 x i32> @test_qs_trunc_dsve_lane2(<4 x i32> %a, <vscale x 2 x i64> %b) {
126+
; CHECK-LABEL: test_qs_trunc_dsve_lane2:
127+
; CHECK: // %bb.0:
128+
; CHECK-NEXT: mov z1.s, z1.s[4]
129+
; CHECK-NEXT: fmov w8, s1
130+
; CHECK-NEXT: mov v0.s[3], w8
131+
; CHECK-NEXT: ret
132+
%c = extractelement <vscale x 2 x i64> %b, i32 2
133+
%d = trunc i64 %c to i32
134+
%e = insertelement <4 x i32> %a, i32 %d, i64 3
135+
ret <4 x i32> %e
136+
}

llvm/test/CodeGen/AArch64/sve-doublereduct.ll

Lines changed: 3 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -91,8 +91,7 @@ define i32 @add_i32(<vscale x 8 x i32> %a, <vscale x 4 x i32> %b) {
9191
; CHECK-NEXT: ptrue p0.s
9292
; CHECK-NEXT: add z0.s, z0.s, z2.s
9393
; CHECK-NEXT: uaddv d0, p0, z0.s
94-
; CHECK-NEXT: fmov x0, d0
95-
; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0
94+
; CHECK-NEXT: fmov w0, s0
9695
; CHECK-NEXT: ret
9796
%r1 = call i32 @llvm.vector.reduce.add.i32.nxv8i32(<vscale x 8 x i32> %a)
9897
%r2 = call i32 @llvm.vector.reduce.add.i32.nxv4i32(<vscale x 4 x i32> %b)
@@ -112,8 +111,7 @@ define i16 @add_ext_i16(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b) {
112111
; CHECK-NEXT: add z1.h, z1.h, z3.h
113112
; CHECK-NEXT: add z0.h, z0.h, z1.h
114113
; CHECK-NEXT: uaddv d0, p0, z0.h
115-
; CHECK-NEXT: fmov x0, d0
116-
; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0
114+
; CHECK-NEXT: fmov w0, s0
117115
; CHECK-NEXT: ret
118116
%ae = zext <vscale x 16 x i8> %a to <vscale x 16 x i16>
119117
%be = zext <vscale x 16 x i8> %b to <vscale x 16 x i16>
@@ -139,8 +137,7 @@ define i16 @add_ext_v32i16(<vscale x 32 x i8> %a, <vscale x 16 x i8> %b) {
139137
; CHECK-NEXT: add z1.h, z2.h, z5.h
140138
; CHECK-NEXT: add z0.h, z0.h, z1.h
141139
; CHECK-NEXT: uaddv d0, p0, z0.h
142-
; CHECK-NEXT: fmov x0, d0
143-
; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0
140+
; CHECK-NEXT: fmov w0, s0
144141
; CHECK-NEXT: ret
145142
%ae = zext <vscale x 32 x i8> %a to <vscale x 32 x i16>
146143
%be = zext <vscale x 16 x i8> %b to <vscale x 16 x i16>

llvm/test/CodeGen/AArch64/sve-extract-element.ll

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -644,8 +644,8 @@ define i1 @test_lane4_2xi1(<vscale x 2 x i1> %a) #0 {
644644
; CHECK-LABEL: test_lane4_2xi1:
645645
; CHECK: // %bb.0:
646646
; CHECK-NEXT: mov z0.d, p0/z, #1 // =0x1
647-
; CHECK-NEXT: mov z0.d, z0.d[4]
648-
; CHECK-NEXT: fmov x8, d0
647+
; CHECK-NEXT: mov z0.s, z0.s[8]
648+
; CHECK-NEXT: fmov w8, s0
649649
; CHECK-NEXT: and w0, w8, #0x1
650650
; CHECK-NEXT: ret
651651
%b = extractelement <vscale x 2 x i1> %a, i32 4

llvm/test/CodeGen/AArch64/sve-extract-fixed-vector.ll

Lines changed: 2 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -238,11 +238,8 @@ define <2 x i1> @extract_v2i1_nxv2i1(<vscale x 2 x i1> %inmask) {
238238
; CHECK-LABEL: extract_v2i1_nxv2i1:
239239
; CHECK: // %bb.0:
240240
; CHECK-NEXT: mov z0.d, p0/z, #1 // =0x1
241-
; CHECK-NEXT: fmov x0, d0
242-
; CHECK-NEXT: mov x8, v0.d[1]
243-
; CHECK-NEXT: fmov s0, w0
244-
; CHECK-NEXT: mov v0.s[1], w8
245-
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
241+
; CHECK-NEXT: mov v0.s[1], v0.s[2]
242+
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
246243
; CHECK-NEXT: ret
247244
%mask = call <2 x i1> @llvm.vector.extract.v2i1.nxv2i1(<vscale x 2 x i1> %inmask, i64 0)
248245
ret <2 x i1> %mask

0 commit comments

Comments
 (0)