Commit 6180806
[AArch64][SVE] Mark fixed-type FP extending/truncating loads/stores as custom
This allows the generic DAG combine to fold fp_extend/fp_trunc into loads/stores, which we can then lower into an integer extending load/truncating store plus an FP_EXTEND/FP_ROUND.

The nuance here is that fixed-type FP_EXTEND/FP_ROUND require unpacked types, hence lowering them introduces an unpack/zip. By allowing these nodes to be combined with loads/stores, we make it much easier to have this unpack/zip combined into the load/store by our custom lowering.

Differential Revision: https://reviews.llvm.org/D114580
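As background for why an integer extending load can stand in for an FP one, here is a standalone sketch (plain C++, no LLVM dependencies; halfBitsToFloat is a hypothetical helper covering normal numbers only). The fcvt that consumes the widened lanes reads only their low 16 bits, so it does not matter whether the load sign- or zero-extends the f16 bit pattern:

#include <cstdint>
#include <cstdio>
#include <cstring>

// Widen an IEEE half bit pattern to float (normal numbers only).
static float halfBitsToFloat(uint16_t h) {
  uint32_t sign = (h >> 15) & 1, exp = (h >> 10) & 0x1f, mant = h & 0x3ff;
  uint32_t bits = (sign << 31) | ((exp - 15 + 127) << 23) | (mant << 13);
  float f;
  std::memcpy(&f, &bits, sizeof f);
  return f;
}

int main() {
  uint16_t mem = 0xC500; // -5.0 as an IEEE half in memory
  // "ld1sh": sign-extend the 16-bit memory value into a 32-bit lane.
  int32_t lane = (int16_t)mem;
  // "fcvt z.s, p/m, z.h" converts using only the low 16 bits of the lane,
  // so the extension bits are irrelevant.
  float viaExtLoad = halfBitsToFloat((uint16_t)(lane & 0xffff));
  float direct = halfBitsToFloat(mem);
  std::printf("%f %f\n", viaExtLoad, direct); // both print -5.000000
}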
1 parent: cffe3a0

3 files changed: +79 −85 lines

llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp

Lines changed: 1 addition & 1 deletion
@@ -15007,7 +15007,7 @@ SDValue DAGCombiner::visitFP_EXTEND(SDNode *N) {
 
   // fold (fpext (load x)) -> (fpext (fptrunc (extload x)))
   if (ISD::isNormalLoad(N0.getNode()) && N0.hasOneUse() &&
-      TLI.isLoadExtLegal(ISD::EXTLOAD, VT, N0.getValueType())) {
+      TLI.isLoadExtLegalOrCustom(ISD::EXTLOAD, VT, N0.getValueType())) {
     LoadSDNode *LN0 = cast<LoadSDNode>(N0);
     SDValue ExtLoad = DAG.getExtLoad(ISD::EXTLOAD, SDLoc(N), VT,
                                      LN0->getChain(),
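The behavioural difference being relied on here: isLoadExtLegal only fires for combinations marked Legal, while the OrCustom variant also accepts Custom, letting the combine create the extending load and leave the target's custom lowering to finish the job. Paraphrased (and simplified) from llvm/include/llvm/CodeGen/TargetLowering.h; treat the exact signatures as approximate:

bool isLoadExtLegal(unsigned ExtType, EVT ValVT, EVT MemVT) const {
  return getLoadExtAction(ExtType, ValVT, MemVT) == Legal;
}

bool isLoadExtLegalOrCustom(unsigned ExtType, EVT ValVT, EVT MemVT) const {
  LegalizeAction Action = getLoadExtAction(ExtType, ValVT, MemVT);
  return Action == Legal || Action == Custom;
}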

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

Lines changed: 52 additions & 10 deletions
@@ -1545,7 +1545,7 @@ void AArch64TargetLowering::addTypeForFixedLengthSVE(MVT VT) {
     setCondCodeAction(ISD::SETUNE, VT, Expand);
   }
 
-  // Mark integer truncating stores as having custom lowering
+  // Mark integer truncating stores/extending loads as having custom lowering
   if (VT.isInteger()) {
     MVT InnerVT = VT.changeVectorElementType(MVT::i8);
     while (InnerVT != VT) {
@@ -1557,6 +1557,18 @@ void AArch64TargetLowering::addTypeForFixedLengthSVE(MVT VT) {
     }
   }
 
+  // Mark floating-point truncating stores/extending loads as having custom
+  // lowering
+  if (VT.isFloatingPoint()) {
+    MVT InnerVT = VT.changeVectorElementType(MVT::f16);
+    while (InnerVT != VT) {
+      setTruncStoreAction(VT, InnerVT, Custom);
+      setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Custom);
+      InnerVT = InnerVT.changeVectorElementType(
+          MVT::getFloatingPointVT(2 * InnerVT.getScalarSizeInBits()));
+    }
+  }
+
   // Lower fixed length vector operations to scalable equivalents.
   setOperationAction(ISD::ABS, VT, Custom);
   setOperationAction(ISD::ADD, VT, Custom);
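To make the effect of the new loop concrete, a standalone sketch (plain C++, no LLVM types) of the element-width walk it performs; for a container with f64 elements it visits f16 and f32, marking each pairing Custom:

#include <cstdio>

int main() {
  const int containerBits = 64; // element width of VT, e.g. a v8f64 container
  // Mirrors the loop above: InnerVT starts at f16 and doubles until it
  // reaches VT's element type; each narrower FP type gets Custom
  // truncstore/extload actions against the container type.
  for (int innerBits = 16; innerBits != containerBits; innerBits *= 2)
    std::printf("truncstore f%d->f%d and extload f%d->f%d marked Custom\n",
                containerBits, innerBits, innerBits, containerBits);
  return 0;
}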
@@ -18658,12 +18670,29 @@ SDValue AArch64TargetLowering::LowerFixedLengthVectorLoadToSVE(
   SDLoc DL(Op);
   EVT VT = Op.getValueType();
   EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
+  EVT LoadVT = ContainerVT;
+  EVT MemVT = Load->getMemoryVT();
+
+  auto Pg = getPredicateForFixedLengthVector(DAG, DL, VT);
+
+  if (VT.isFloatingPoint() && Load->getExtensionType() == ISD::EXTLOAD) {
+    LoadVT = ContainerVT.changeTypeToInteger();
+    MemVT = MemVT.changeTypeToInteger();
+  }
 
   auto NewLoad = DAG.getMaskedLoad(
-      ContainerVT, DL, Load->getChain(), Load->getBasePtr(), Load->getOffset(),
-      getPredicateForFixedLengthVector(DAG, DL, VT), DAG.getUNDEF(ContainerVT),
-      Load->getMemoryVT(), Load->getMemOperand(), Load->getAddressingMode(),
-      Load->getExtensionType());
+      LoadVT, DL, Load->getChain(), Load->getBasePtr(), Load->getOffset(), Pg,
+      DAG.getUNDEF(LoadVT), MemVT, Load->getMemOperand(),
+      Load->getAddressingMode(), Load->getExtensionType());
+
+  if (VT.isFloatingPoint() && Load->getExtensionType() == ISD::EXTLOAD) {
+    EVT ExtendVT = ContainerVT.changeVectorElementType(
+        Load->getMemoryVT().getVectorElementType());
+
+    NewLoad = getSVESafeBitCast(ExtendVT, NewLoad, DAG);
+    NewLoad = DAG.getNode(AArch64ISD::FP_EXTEND_MERGE_PASSTHRU, DL, ContainerVT,
+                          Pg, NewLoad, DAG.getUNDEF(ContainerVT));
+  }
 
   auto Result = convertFromScalableVector(DAG, VT, NewLoad);
   SDValue MergedValues[2] = {Result, Load->getChain()};
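A hand-traced walk of the new load path above (a sketch, not generated output), taking fpext of a loaded <8 x half> to <8 x float> with 256-bit SVE, where ContainerVT is nxv4f32:

// After the DAG combine above, the load reaches here as an EXTLOAD with
// value type v8f32 and memory type v8f16. The new code then produces:
//   LoadVT   = nxv4i32;        // ContainerVT.changeTypeToInteger()
//   MemVT    = v8i16;          // integer view of the v8f16 memory type
//   NewLoad  = masked extending load v8i16 -> nxv4i32   // ld1sh { z0.s }
//   ExtendVT = nxv4f16;        // one f16 in the low half of each 32-bit lane
//   NewLoad  = getSVESafeBitCast(ExtendVT, NewLoad)     // reinterpret lanes
//   NewLoad  = FP_EXTEND_MERGE_PASSTHRU(Pg, NewLoad)    // fcvt z0.s, z0.h
// No uunpklo is needed: the extending load already leaves each half in its
// own 32-bit lane.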
@@ -18738,13 +18767,26 @@ SDValue AArch64TargetLowering::LowerFixedLengthVectorStoreToSVE(
   SDLoc DL(Op);
   EVT VT = Store->getValue().getValueType();
   EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
+  EVT MemVT = Store->getMemoryVT();
 
+  auto Pg = getPredicateForFixedLengthVector(DAG, DL, VT);
   auto NewValue = convertToScalableVector(DAG, ContainerVT, Store->getValue());
-  return DAG.getMaskedStore(
-      Store->getChain(), DL, NewValue, Store->getBasePtr(), Store->getOffset(),
-      getPredicateForFixedLengthVector(DAG, DL, VT), Store->getMemoryVT(),
-      Store->getMemOperand(), Store->getAddressingMode(),
-      Store->isTruncatingStore());
+
+  if (VT.isFloatingPoint() && Store->isTruncatingStore()) {
+    EVT TruncVT = ContainerVT.changeVectorElementType(
+        Store->getMemoryVT().getVectorElementType());
+    MemVT = MemVT.changeTypeToInteger();
+    NewValue = DAG.getNode(AArch64ISD::FP_ROUND_MERGE_PASSTHRU, DL, TruncVT, Pg,
+                           NewValue, DAG.getTargetConstant(0, DL, MVT::i64),
+                           DAG.getUNDEF(TruncVT));
+    NewValue =
+        getSVESafeBitCast(ContainerVT.changeTypeToInteger(), NewValue, DAG);
+  }
+
+  return DAG.getMaskedStore(Store->getChain(), DL, NewValue,
+                            Store->getBasePtr(), Store->getOffset(), Pg, MemVT,
+                            Store->getMemOperand(), Store->getAddressingMode(),
+                            Store->isTruncatingStore());
 }
 
 SDValue AArch64TargetLowering::LowerFixedLengthVectorMStoreToSVE(
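The store path is the mirror image; hand-traced (again a sketch) for a store of fptrunc <8 x float> to <8 x half> with 256-bit SVE, ContainerVT = nxv4f32:

// The truncating store arrives with value type v8f32 and memory type v8f16:
//   TruncVT  = nxv4f16;        // f16 result kept in each 32-bit lane
//   MemVT    = v8i16;          // integer view of the v8f16 memory type
//   NewValue = FP_ROUND_MERGE_PASSTHRU(Pg, NewValue,
//                                      /*trunc imm*/0)  // fcvt z0.h, z0.s
//   NewValue = getSVESafeBitCast(nxv4i32, NewValue)     // reinterpret lanes
//   masked truncating store nxv4i32 -> v8i16            // st1h { z0.s }
// The uzp1 that previously packed the halves together disappears: the
// truncating store writes the low 16 bits of each 32-bit lane directly.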

llvm/test/CodeGen/AArch64/sve-fixed-length-fp-extend-trunc.ll

Lines changed: 26 additions & 74 deletions
@@ -48,9 +48,8 @@ define <4 x float> @fcvt_v4f16_v4f32(<4 x half> %op1) #0 {
 define void @fcvt_v8f16_v8f32(<8 x half>* %a, <8 x float>* %b) #0 {
 ; CHECK-LABEL: fcvt_v8f16_v8f32:
 ; CHECK: // %bb.0:
-; CHECK-NEXT:    ldr q0, [x0]
 ; CHECK-NEXT:    ptrue p0.s, vl8
-; CHECK-NEXT:    uunpklo z0.s, z0.h
+; CHECK-NEXT:    ld1sh { z0.s }, p0/z, [x0]
 ; CHECK-NEXT:    fcvt z0.s, p0/m, z0.h
 ; CHECK-NEXT:    st1w { z0.s }, p0, [x1]
 ; CHECK-NEXT:    ret
@@ -76,16 +75,15 @@ define void @fcvt_v16f16_v16f32(<16 x half>* %a, <16 x float>* %b) #0 {
 ; VBITS_EQ_256-NEXT:    st1w { z1.s }, p0, [x1]
 ; VBITS_EQ_256-NEXT:    st1w { z0.s }, p0, [x1, x8, lsl #2]
 ; VBITS_EQ_256-NEXT:    ret
-
+;
 ; VBITS_GE_512-LABEL: fcvt_v16f16_v16f32:
 ; VBITS_GE_512: // %bb.0:
-; VBITS_GE_512-NEXT:    ptrue p0.h, vl16
-; VBITS_GE_512-NEXT:    ld1h { z0.h }, p0/z, [x0]
 ; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
-; VBITS_GE_512-NEXT:    uunpklo z0.s, z0.h
+; VBITS_GE_512-NEXT:    ld1sh { z0.s }, p0/z, [x0]
 ; VBITS_GE_512-NEXT:    fcvt z0.s, p0/m, z0.h
 ; VBITS_GE_512-NEXT:    st1w { z0.s }, p0, [x1]
 ; VBITS_GE_512-NEXT:    ret
+
   %op1 = load <16 x half>, <16 x half>* %a
   %res = fpext <16 x half> %op1 to <16 x float>
   store <16 x float> %res, <16 x float>* %b
@@ -95,10 +93,8 @@ define void @fcvt_v16f16_v16f32(<16 x half>* %a, <16 x float>* %b) #0 {
 define void @fcvt_v32f16_v32f32(<32 x half>* %a, <32 x float>* %b) #0 {
 ; VBITS_GE_1024-LABEL: fcvt_v32f16_v32f32:
 ; VBITS_GE_1024: // %bb.0:
-; VBITS_GE_1024-NEXT:    ptrue p0.h, vl32
-; VBITS_GE_1024-NEXT:    ld1h { z0.h }, p0/z, [x0]
 ; VBITS_GE_1024-NEXT:    ptrue p0.s, vl32
-; VBITS_GE_1024-NEXT:    uunpklo z0.s, z0.h
+; VBITS_GE_1024-NEXT:    ld1sh { z0.s }, p0/z, [x0]
 ; VBITS_GE_1024-NEXT:    fcvt z0.s, p0/m, z0.h
 ; VBITS_GE_1024-NEXT:    st1w { z0.s }, p0, [x1]
 ; VBITS_GE_1024-NEXT:    ret
@@ -111,10 +107,8 @@ define void @fcvt_v32f16_v32f32(<32 x half>* %a, <32 x float>* %b) #0 {
 define void @fcvt_v64f16_v64f32(<64 x half>* %a, <64 x float>* %b) #0 {
 ; VBITS_GE_2048-LABEL: fcvt_v64f16_v64f32:
 ; VBITS_GE_2048: // %bb.0:
-; VBITS_GE_2048-NEXT:    ptrue p0.h, vl64
-; VBITS_GE_2048-NEXT:    ld1h { z0.h }, p0/z, [x0]
 ; VBITS_GE_2048-NEXT:    ptrue p0.s, vl64
-; VBITS_GE_2048-NEXT:    uunpklo z0.s, z0.h
+; VBITS_GE_2048-NEXT:    ld1sh { z0.s }, p0/z, [x0]
 ; VBITS_GE_2048-NEXT:    fcvt z0.s, p0/m, z0.h
 ; VBITS_GE_2048-NEXT:    st1w { z0.s }, p0, [x1]
 ; VBITS_GE_2048-NEXT:    ret
@@ -156,10 +150,8 @@ define <2 x double> @fcvt_v2f16_v2f64(<2 x half> %op1) #0 {
 define void @fcvt_v4f16_v4f64(<4 x half>* %a, <4 x double>* %b) #0 {
 ; CHECK-LABEL: fcvt_v4f16_v4f64:
 ; CHECK: // %bb.0:
-; CHECK-NEXT:    ldr d0, [x0]
 ; CHECK-NEXT:    ptrue p0.d, vl4
-; CHECK-NEXT:    uunpklo z0.s, z0.h
-; CHECK-NEXT:    uunpklo z0.d, z0.s
+; CHECK-NEXT:    ld1sh { z0.d }, p0/z, [x0]
 ; CHECK-NEXT:    fcvt z0.d, p0/m, z0.h
 ; CHECK-NEXT:    st1d { z0.d }, p0, [x1]
 ; CHECK-NEXT:    ret
@@ -170,7 +162,6 @@ define void @fcvt_v4f16_v4f64(<4 x half>* %a, <4 x double>* %b) #0 {
 }
 
 define void @fcvt_v8f16_v8f64(<8 x half>* %a, <8 x double>* %b) #0 {
-; Ensure sensible type legalisation.
 ; VBITS_EQ_256-LABEL: fcvt_v8f16_v8f64:
 ; VBITS_EQ_256: // %bb.0:
 ; VBITS_EQ_256-NEXT:    ldr q0, [x0]
@@ -186,16 +177,15 @@ define void @fcvt_v8f16_v8f64(<8 x half>* %a, <8 x double>* %b) #0 {
 ; VBITS_EQ_256-NEXT:    fcvt z1.d, p0/m, z1.h
 ; VBITS_EQ_256-NEXT:    st1d { z1.d }, p0, [x1, x8, lsl #3]
 ; VBITS_EQ_256-NEXT:    ret
-
+;
 ; VBITS_GE_512-LABEL: fcvt_v8f16_v8f64:
 ; VBITS_GE_512: // %bb.0:
-; VBITS_GE_512-NEXT:    ldr q0, [x0]
 ; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
-; VBITS_GE_512-NEXT:    uunpklo z0.s, z0.h
-; VBITS_GE_512-NEXT:    uunpklo z0.d, z0.s
+; VBITS_GE_512-NEXT:    ld1sh { z0.d }, p0/z, [x0]
 ; VBITS_GE_512-NEXT:    fcvt z0.d, p0/m, z0.h
 ; VBITS_GE_512-NEXT:    st1d { z0.d }, p0, [x1]
 ; VBITS_GE_512-NEXT:    ret
+
   %op1 = load <8 x half>, <8 x half>* %a
   %res = fpext <8 x half> %op1 to <8 x double>
   store <8 x double> %res, <8 x double>* %b
@@ -205,11 +195,8 @@ define void @fcvt_v8f16_v8f64(<8 x half>* %a, <8 x double>* %b) #0 {
 define void @fcvt_v16f16_v16f64(<16 x half>* %a, <16 x double>* %b) #0 {
 ; VBITS_GE_1024-LABEL: fcvt_v16f16_v16f64:
 ; VBITS_GE_1024: // %bb.0:
-; VBITS_GE_1024-NEXT:    ptrue p0.h, vl16
-; VBITS_GE_1024-NEXT:    ld1h { z0.h }, p0/z, [x0]
 ; VBITS_GE_1024-NEXT:    ptrue p0.d, vl16
-; VBITS_GE_1024-NEXT:    uunpklo z0.s, z0.h
-; VBITS_GE_1024-NEXT:    uunpklo z0.d, z0.s
+; VBITS_GE_1024-NEXT:    ld1sh { z0.d }, p0/z, [x0]
 ; VBITS_GE_1024-NEXT:    fcvt z0.d, p0/m, z0.h
 ; VBITS_GE_1024-NEXT:    st1d { z0.d }, p0, [x1]
 ; VBITS_GE_1024-NEXT:    ret
@@ -222,11 +209,8 @@ define void @fcvt_v16f16_v16f64(<16 x half>* %a, <16 x double>* %b) #0 {
 define void @fcvt_v32f16_v32f64(<32 x half>* %a, <32 x double>* %b) #0 {
 ; VBITS_GE_2048-LABEL: fcvt_v32f16_v32f64:
 ; VBITS_GE_2048: // %bb.0:
-; VBITS_GE_2048-NEXT:    ptrue p0.h, vl32
-; VBITS_GE_2048-NEXT:    ld1h { z0.h }, p0/z, [x0]
 ; VBITS_GE_2048-NEXT:    ptrue p0.d, vl32
-; VBITS_GE_2048-NEXT:    uunpklo z0.s, z0.h
-; VBITS_GE_2048-NEXT:    uunpklo z0.d, z0.s
+; VBITS_GE_2048-NEXT:    ld1sh { z0.d }, p0/z, [x0]
 ; VBITS_GE_2048-NEXT:    fcvt z0.d, p0/m, z0.h
 ; VBITS_GE_2048-NEXT:    st1d { z0.d }, p0, [x1]
 ; VBITS_GE_2048-NEXT:    ret
@@ -264,9 +248,8 @@ define <2 x double> @fcvt_v2f32_v2f64(<2 x float> %op1) #0 {
 define void @fcvt_v4f32_v4f64(<4 x float>* %a, <4 x double>* %b) #0 {
 ; CHECK-LABEL: fcvt_v4f32_v4f64:
 ; CHECK: // %bb.0:
-; CHECK-NEXT:    ldr q0, [x0]
 ; CHECK-NEXT:    ptrue p0.d, vl4
-; CHECK-NEXT:    uunpklo z0.d, z0.s
+; CHECK-NEXT:    ld1sw { z0.d }, p0/z, [x0]
 ; CHECK-NEXT:    fcvt z0.d, p0/m, z0.s
 ; CHECK-NEXT:    st1d { z0.d }, p0, [x1]
 ; CHECK-NEXT:    ret
@@ -292,16 +275,15 @@ define void @fcvt_v8f32_v8f64(<8 x float>* %a, <8 x double>* %b) #0 {
 ; VBITS_EQ_256-NEXT:    st1d { z1.d }, p0, [x1]
 ; VBITS_EQ_256-NEXT:    st1d { z0.d }, p0, [x1, x8, lsl #3]
 ; VBITS_EQ_256-NEXT:    ret
-
+;
 ; VBITS_GE_512-LABEL: fcvt_v8f32_v8f64:
 ; VBITS_GE_512: // %bb.0:
-; VBITS_GE_512-NEXT:    ptrue p0.s, vl8
-; VBITS_GE_512-NEXT:    ld1w { z0.s }, p0/z, [x0]
 ; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
-; VBITS_GE_512-NEXT:    uunpklo z0.d, z0.s
+; VBITS_GE_512-NEXT:    ld1sw { z0.d }, p0/z, [x0]
 ; VBITS_GE_512-NEXT:    fcvt z0.d, p0/m, z0.s
 ; VBITS_GE_512-NEXT:    st1d { z0.d }, p0, [x1]
 ; VBITS_GE_512-NEXT:    ret
+
   %op1 = load <8 x float>, <8 x float>* %a
   %res = fpext <8 x float> %op1 to <8 x double>
   store <8 x double> %res, <8 x double>* %b
@@ -311,10 +293,8 @@ define void @fcvt_v8f32_v8f64(<8 x float>* %a, <8 x double>* %b) #0 {
 define void @fcvt_v16f32_v16f64(<16 x float>* %a, <16 x double>* %b) #0 {
 ; VBITS_GE_1024-LABEL: fcvt_v16f32_v16f64:
 ; VBITS_GE_1024: // %bb.0:
-; VBITS_GE_1024-NEXT:    ptrue p0.s, vl16
-; VBITS_GE_1024-NEXT:    ld1w { z0.s }, p0/z, [x0]
 ; VBITS_GE_1024-NEXT:    ptrue p0.d, vl16
-; VBITS_GE_1024-NEXT:    uunpklo z0.d, z0.s
+; VBITS_GE_1024-NEXT:    ld1sw { z0.d }, p0/z, [x0]
 ; VBITS_GE_1024-NEXT:    fcvt z0.d, p0/m, z0.s
 ; VBITS_GE_1024-NEXT:    st1d { z0.d }, p0, [x1]
 ; VBITS_GE_1024-NEXT:    ret
@@ -327,10 +307,8 @@ define void @fcvt_v16f32_v16f64(<16 x float>* %a, <16 x double>* %b) #0 {
 define void @fcvt_v32f32_v32f64(<32 x float>* %a, <32 x double>* %b) #0 {
 ; VBITS_GE_2048-LABEL: fcvt_v32f32_v32f64:
 ; VBITS_GE_2048: // %bb.0:
-; VBITS_GE_2048-NEXT:    ptrue p0.s, vl32
-; VBITS_GE_2048-NEXT:    ld1w { z0.s }, p0/z, [x0]
 ; VBITS_GE_2048-NEXT:    ptrue p0.d, vl32
-; VBITS_GE_2048-NEXT:    uunpklo z0.d, z0.s
+; VBITS_GE_2048-NEXT:    ld1sw { z0.d }, p0/z, [x0]
 ; VBITS_GE_2048-NEXT:    fcvt z0.d, p0/m, z0.s
 ; VBITS_GE_2048-NEXT:    st1d { z0.d }, p0, [x1]
 ; VBITS_GE_2048-NEXT:    ret
@@ -403,11 +381,8 @@ define void @fcvt_v16f32_v16f16(<16 x float>* %a, <16 x half>* %b) #0 {
 ; VBITS_GE_512: // %bb.0:
 ; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
 ; VBITS_GE_512-NEXT:    ld1w { z0.s }, p0/z, [x0]
-; VBITS_GE_512-NEXT:    ptrue p0.s
 ; VBITS_GE_512-NEXT:    fcvt z0.h, p0/m, z0.s
-; VBITS_GE_512-NEXT:    ptrue p0.h, vl16
-; VBITS_GE_512-NEXT:    uzp1 z0.h, z0.h, z0.h
-; VBITS_GE_512-NEXT:    st1h { z0.h }, p0, [x1]
+; VBITS_GE_512-NEXT:    st1h { z0.s }, p0, [x1]
 ; VBITS_GE_512-NEXT:    ret
   %op1 = load <16 x float>, <16 x float>* %a
   %res = fptrunc <16 x float> %op1 to <16 x half>
@@ -420,11 +395,8 @@ define void @fcvt_v32f32_v32f16(<32 x float>* %a, <32 x half>* %b) #0 {
 ; VBITS_GE_1024: // %bb.0:
 ; VBITS_GE_1024-NEXT:    ptrue p0.s, vl32
 ; VBITS_GE_1024-NEXT:    ld1w { z0.s }, p0/z, [x0]
-; VBITS_GE_1024-NEXT:    ptrue p0.s
 ; VBITS_GE_1024-NEXT:    fcvt z0.h, p0/m, z0.s
-; VBITS_GE_1024-NEXT:    ptrue p0.h, vl32
-; VBITS_GE_1024-NEXT:    uzp1 z0.h, z0.h, z0.h
-; VBITS_GE_1024-NEXT:    st1h { z0.h }, p0, [x1]
+; VBITS_GE_1024-NEXT:    st1h { z0.s }, p0, [x1]
 ; VBITS_GE_1024-NEXT:    ret
   %op1 = load <32 x float>, <32 x float>* %a
   %res = fptrunc <32 x float> %op1 to <32 x half>
@@ -437,11 +409,8 @@ define void @fcvt_v64f32_v64f16(<64 x float>* %a, <64 x half>* %b) #0 {
 ; VBITS_GE_2048: // %bb.0:
 ; VBITS_GE_2048-NEXT:    ptrue p0.s, vl64
 ; VBITS_GE_2048-NEXT:    ld1w { z0.s }, p0/z, [x0]
-; VBITS_GE_2048-NEXT:    ptrue p0.s
 ; VBITS_GE_2048-NEXT:    fcvt z0.h, p0/m, z0.s
-; VBITS_GE_2048-NEXT:    ptrue p0.h, vl64
-; VBITS_GE_2048-NEXT:    uzp1 z0.h, z0.h, z0.h
-; VBITS_GE_2048-NEXT:    st1h { z0.h }, p0, [x1]
+; VBITS_GE_2048-NEXT:    st1h { z0.s }, p0, [x1]
 ; VBITS_GE_2048-NEXT:    ret
   %op1 = load <64 x float>, <64 x float>* %a
   %res = fptrunc <64 x float> %op1 to <64 x half>
@@ -533,12 +502,8 @@ define void @fcvt_v16f64_v16f16(<16 x double>* %a, <16 x half>* %b) #0 {
 ; VBITS_GE_1024: // %bb.0:
 ; VBITS_GE_1024-NEXT:    ptrue p0.d, vl16
 ; VBITS_GE_1024-NEXT:    ld1d { z0.d }, p0/z, [x0]
-; VBITS_GE_1024-NEXT:    ptrue p0.d
 ; VBITS_GE_1024-NEXT:    fcvt z0.h, p0/m, z0.d
-; VBITS_GE_1024-NEXT:    ptrue p0.h, vl16
-; VBITS_GE_1024-NEXT:    uzp1 z0.s, z0.s, z0.s
-; VBITS_GE_1024-NEXT:    uzp1 z0.h, z0.h, z0.h
-; VBITS_GE_1024-NEXT:    st1h { z0.h }, p0, [x1]
+; VBITS_GE_1024-NEXT:    st1h { z0.d }, p0, [x1]
 ; VBITS_GE_1024-NEXT:    ret
   %op1 = load <16 x double>, <16 x double>* %a
   %res = fptrunc <16 x double> %op1 to <16 x half>
@@ -551,12 +516,8 @@ define void @fcvt_v32f64_v32f16(<32 x double>* %a, <32 x half>* %b) #0 {
 ; VBITS_GE_2048: // %bb.0:
 ; VBITS_GE_2048-NEXT:    ptrue p0.d, vl32
 ; VBITS_GE_2048-NEXT:    ld1d { z0.d }, p0/z, [x0]
-; VBITS_GE_2048-NEXT:    ptrue p0.d
 ; VBITS_GE_2048-NEXT:    fcvt z0.h, p0/m, z0.d
-; VBITS_GE_2048-NEXT:    ptrue p0.h, vl32
-; VBITS_GE_2048-NEXT:    uzp1 z0.s, z0.s, z0.s
-; VBITS_GE_2048-NEXT:    uzp1 z0.h, z0.h, z0.h
-; VBITS_GE_2048-NEXT:    st1h { z0.h }, p0, [x1]
+; VBITS_GE_2048-NEXT:    st1h { z0.d }, p0, [x1]
 ; VBITS_GE_2048-NEXT:    ret
   %op1 = load <32 x double>, <32 x double>* %a
   %res = fptrunc <32 x double> %op1 to <32 x half>
@@ -627,11 +588,8 @@ define void @fcvt_v8f64_v8f32(<8 x double>* %a, <8 x float>* %b) #0 {
 ; VBITS_GE_512: // %bb.0:
 ; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
 ; VBITS_GE_512-NEXT:    ld1d { z0.d }, p0/z, [x0]
-; VBITS_GE_512-NEXT:    ptrue p0.d
 ; VBITS_GE_512-NEXT:    fcvt z0.s, p0/m, z0.d
-; VBITS_GE_512-NEXT:    ptrue p0.s, vl8
-; VBITS_GE_512-NEXT:    uzp1 z0.s, z0.s, z0.s
-; VBITS_GE_512-NEXT:    st1w { z0.s }, p0, [x1]
+; VBITS_GE_512-NEXT:    st1w { z0.d }, p0, [x1]
 ; VBITS_GE_512-NEXT:    ret
   %op1 = load <8 x double>, <8 x double>* %a
   %res = fptrunc <8 x double> %op1 to <8 x float>
@@ -644,11 +602,8 @@ define void @fcvt_v16f64_v16f32(<16 x double>* %a, <16 x float>* %b) #0 {
 ; VBITS_GE_1024: // %bb.0:
 ; VBITS_GE_1024-NEXT:    ptrue p0.d, vl16
 ; VBITS_GE_1024-NEXT:    ld1d { z0.d }, p0/z, [x0]
-; VBITS_GE_1024-NEXT:    ptrue p0.d
 ; VBITS_GE_1024-NEXT:    fcvt z0.s, p0/m, z0.d
-; VBITS_GE_1024-NEXT:    ptrue p0.s, vl16
-; VBITS_GE_1024-NEXT:    uzp1 z0.s, z0.s, z0.s
-; VBITS_GE_1024-NEXT:    st1w { z0.s }, p0, [x1]
+; VBITS_GE_1024-NEXT:    st1w { z0.d }, p0, [x1]
 ; VBITS_GE_1024-NEXT:    ret
   %op1 = load <16 x double>, <16 x double>* %a
   %res = fptrunc <16 x double> %op1 to <16 x float>
@@ -661,11 +616,8 @@ define void @fcvt_v32f64_v32f32(<32 x double>* %a, <32 x float>* %b) #0 {
 ; VBITS_GE_2048: // %bb.0:
 ; VBITS_GE_2048-NEXT:    ptrue p0.d, vl32
 ; VBITS_GE_2048-NEXT:    ld1d { z0.d }, p0/z, [x0]
-; VBITS_GE_2048-NEXT:    ptrue p0.d
 ; VBITS_GE_2048-NEXT:    fcvt z0.s, p0/m, z0.d
-; VBITS_GE_2048-NEXT:    ptrue p0.s, vl32
-; VBITS_GE_2048-NEXT:    uzp1 z0.s, z0.s, z0.s
-; VBITS_GE_2048-NEXT:    st1w { z0.s }, p0, [x1]
+; VBITS_GE_2048-NEXT:    st1w { z0.d }, p0, [x1]
 ; VBITS_GE_2048-NEXT:    ret
   %op1 = load <32 x double>, <32 x double>* %a
   %res = fptrunc <32 x double> %op1 to <32 x float>
