Commit 1528a4d
[llvm][sve] Lowering for VLS truncating stores
This adds custom lowering for truncating stores when operating on fixed length
vectors in SVE. It also includes a DAG combine to fold extends followed by
truncating stores into non-truncating stores in order to prevent this pattern
appearing once truncating stores are supported.

Currently truncating stores are not used in certain cases where the size of
the vector is larger than the target vector width.

Differential Revision: https://reviews.llvm.org/D104471
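For illustration (a sketch mirroring the new tests, not part of the commit): with "+sve" and -aarch64-sve-vector-bits-min=512, a fixed-length truncate-and-store pair like the one below can now be selected as a single SVE truncating store (st1b) instead of a separate truncate and store.

; Illustrative example, not from this commit; based on the tests added below.
define void @store_trunc_example(<16 x i32>* %src, <16 x i8>* %dst) #0 {
  %v = load <16 x i32>, <16 x i32>* %src
  %t = trunc <16 x i32> %v to <16 x i8>  ; folded into the store
  store <16 x i8> %t, <16 x i8>* %dst    ; becomes st1b { z0.s }, p0, [x1]
  ret void
}
attributes #0 = { "target-features"="+sve" }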

9 files changed: 311 additions, 52 deletions

llvm/include/llvm/CodeGen/TargetLowering.h

Lines changed: 8 additions & 0 deletions
@@ -1271,6 +1271,14 @@ class TargetLoweringBase {
             getTruncStoreAction(ValVT, MemVT) == Custom);
   }
 
+  virtual bool canCombineTruncStore(EVT ValVT, EVT MemVT,
+                                    bool LegalOnly) const {
+    if (LegalOnly)
+      return isTruncStoreLegal(ValVT, MemVT);
+
+    return isTruncStoreLegalOrCustom(ValVT, MemVT);
+  }
+
   /// Return how the indexed load should be treated: either it is legal, needs
   /// to be promoted to a larger size, needs to be expanded to some other code
   /// sequence, or the target has a custom expander for it.

llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp

Lines changed: 5 additions & 4 deletions
@@ -18089,10 +18089,11 @@ SDValue DAGCombiner::visitSTORE(SDNode *N) {
 
   // If this is an FP_ROUND or TRUNC followed by a store, fold this into a
   // truncating store. We can do this even if this is already a truncstore.
-  if ((Value.getOpcode() == ISD::FP_ROUND || Value.getOpcode() == ISD::TRUNCATE)
-      && Value.getNode()->hasOneUse() && ST->isUnindexed() &&
-      TLI.isTruncStoreLegal(Value.getOperand(0).getValueType(),
-                            ST->getMemoryVT())) {
+  if ((Value.getOpcode() == ISD::FP_ROUND ||
+       Value.getOpcode() == ISD::TRUNCATE) &&
+      Value.getNode()->hasOneUse() && ST->isUnindexed() &&
+      TLI.canCombineTruncStore(Value.getOperand(0).getValueType(),
+                               ST->getMemoryVT(), LegalOperations)) {
     return DAG.getTruncStore(Chain, SDLoc(N), Value.getOperand(0),
                              Ptr, ST->getMemoryVT(), ST->getMemOperand());
   }
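A minimal illustration of the pattern this combine matches (a sketch, assuming a target where the i32-to-i16 truncating store is legal or custom): at the DAG level, the store of the truncate below folds into a single truncating store of the i32 value, with the new canCombineTruncStore hook deciding whether the fold is allowed.

; Illustrative example, not from this commit.
define void @trunc_then_store(i32 %x, i16* %p) {
  %t = trunc i32 %x to i16   ; DAG: (store (truncate x)) -> (truncstore x)
  store i16 %t, i16* %p
  ret void
}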

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

Lines changed: 47 additions & 2 deletions
@@ -1241,6 +1241,13 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
       }
     }
 
+    // SVE supports truncating stores of 64 and 128-bit vectors
+    setTruncStoreAction(MVT::v2i64, MVT::v2i8, Custom);
+    setTruncStoreAction(MVT::v2i64, MVT::v2i16, Custom);
+    setTruncStoreAction(MVT::v2i64, MVT::v2i32, Custom);
+    setTruncStoreAction(MVT::v2i32, MVT::v2i8, Custom);
+    setTruncStoreAction(MVT::v2i32, MVT::v2i16, Custom);
+
     for (auto VT : {MVT::nxv2f16, MVT::nxv4f16, MVT::nxv8f16, MVT::nxv2f32,
                     MVT::nxv4f32, MVT::nxv2f64}) {
       setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
@@ -1487,6 +1494,16 @@ void AArch64TargetLowering::addTypeForFixedLengthSVE(MVT VT) {
     setCondCodeAction(ISD::SETUNE, VT, Expand);
   }
 
+  // Mark integer truncating stores as having custom lowering
+  if (VT.isInteger()) {
+    MVT InnerVT = VT.changeVectorElementType(MVT::i8);
+    while (InnerVT != VT) {
+      setTruncStoreAction(VT, InnerVT, Custom);
+      InnerVT = InnerVT.changeVectorElementType(
+          MVT::getIntegerVT(2 * InnerVT.getScalarSizeInBits()));
+    }
+  }
+
   // Lower fixed length vector operations to scalable equivalents.
   setOperationAction(ISD::ABS, VT, Custom);
   setOperationAction(ISD::ADD, VT, Custom);
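To make the loop's effect concrete (an assumed example, not from the patch): for a legal fixed-length type such as v8i32, the loop marks truncating stores to v8i8 and v8i16 as Custom, so intermediate element widths are covered as well. Under -aarch64-sve-vector-bits-min=256 the function below should lower to a single st1h truncating store.

; Illustrative example, not from this commit.
define void @store_trunc_v8i32i16_example(<8 x i32>* %src, <8 x i16>* %dst) #0 {
  %v = load <8 x i32>, <8 x i32>* %src
  %t = trunc <8 x i32> %v to <8 x i16>   ; (v8i32, v8i16) marked Custom above
  store <8 x i16> %t, <8 x i16>* %dst
  ret void
}
attributes #0 = { "target-features"="+sve" }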
@@ -4530,7 +4547,7 @@ SDValue AArch64TargetLowering::LowerSTORE(SDValue Op,
   EVT MemVT = StoreNode->getMemoryVT();
 
   if (VT.isVector()) {
-    if (useSVEForFixedLengthVectorVT(VT))
+    if (useSVEForFixedLengthVectorVT(VT, true))
       return LowerFixedLengthVectorStoreToSVE(Op, DAG);
 
     unsigned AS = StoreNode->getAddressSpace();
@@ -4542,7 +4559,8 @@ SDValue AArch64TargetLowering::LowerSTORE(SDValue Op,
       return scalarizeVectorStore(StoreNode, DAG);
     }
 
-    if (StoreNode->isTruncatingStore()) {
+    if (StoreNode->isTruncatingStore() && VT == MVT::v4i16 &&
+        MemVT == MVT::v4i8) {
       return LowerTruncateVectorStore(Dl, StoreNode, VT, MemVT, DAG);
     }
     // 256 bit non-temporal stores can be lowered to STNP. Do this as part of
@@ -15122,6 +15140,30 @@ static bool performTBISimplification(SDValue Addr,
   return false;
 }
 
+static SDValue foldTruncStoreOfExt(SelectionDAG &DAG, SDNode *N) {
+  auto OpCode = N->getOpcode();
+  assert((OpCode == ISD::STORE || OpCode == ISD::MSTORE) &&
+         "Expected STORE dag node in input!");
+
+  if (auto Store = dyn_cast<StoreSDNode>(N)) {
+    if (!Store->isTruncatingStore() || Store->isIndexed())
+      return SDValue();
+    SDValue Ext = Store->getValue();
+    auto ExtOpCode = Ext.getOpcode();
+    if (ExtOpCode != ISD::ZERO_EXTEND && ExtOpCode != ISD::SIGN_EXTEND &&
+        ExtOpCode != ISD::ANY_EXTEND)
+      return SDValue();
+    SDValue Orig = Ext->getOperand(0);
+    if (Store->getMemoryVT() != Orig->getValueType(0))
+      return SDValue();
+    return DAG.getStore(Store->getChain(), SDLoc(Store), Orig,
+                        Store->getBasePtr(), Store->getPointerInfo(),
+                        Store->getAlign());
+  }
+
+  return SDValue();
+}
+
 static SDValue performSTORECombine(SDNode *N,
                                    TargetLowering::DAGCombinerInfo &DCI,
                                    SelectionDAG &DAG,
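An example of the pattern foldTruncStoreOfExt removes (a sketch, not from the patch): the value below is zero-extended and then stored back at its original element width, so the extend and the truncating store cancel out and a plain store of %v remains. This is the pattern the commit message notes would otherwise appear once truncating stores are supported.

; Illustrative example, not from this commit.
define void @ext_then_truncstore(<8 x i16> %v, <8 x i16>* %p) #0 {
  %e = zext <8 x i16> %v to <8 x i32>
  %t = trunc <8 x i32> %e to <8 x i16>   ; truncating store of %e at v8i16 ...
  store <8 x i16> %t, <8 x i16>* %p      ; ... folds to a plain store of %v
  ret void
}
attributes #0 = { "target-features"="+sve" }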
@@ -15133,6 +15175,9 @@ static SDValue performSTORECombine(SDNode *N,
       performTBISimplification(N->getOperand(2), DCI, DAG))
     return SDValue(N, 0);
 
+  if (SDValue Store = foldTruncStoreOfExt(DAG, N))
+    return Store;
+
   return SDValue();
 }
 

llvm/lib/Target/AMDGPU/R600ISelLowering.h

Lines changed: 9 additions & 0 deletions
@@ -54,6 +54,15 @@ class R600TargetLowering final : public AMDGPUTargetLowering {
                           MachineMemOperand::Flags Flags = MachineMemOperand::MONone,
                           bool *IsFast = nullptr) const override;
 
+  virtual bool canCombineTruncStore(EVT ValVT, EVT MemVT,
+                                    bool LegalOperations) const override {
+    // R600 has "custom" lowering for truncating stores despite not supporting
+    // those instructions. If we allow that custom lowering in the DAG combiner
+    // then all truncates are merged into truncating stores, giving worse code
+    // generation. This hook prevents the DAG combiner performing that combine.
+    return isTruncStoreLegal(ValVT, MemVT);
+  }
+
 private:
   unsigned Gen;
   /// Each OpenCL kernel has nine implicit parameters that are stored in the

llvm/test/CodeGen/AArch64/sve-fixed-length-masked-gather.ll

Lines changed: 3 additions & 10 deletions
@@ -36,10 +36,7 @@ define void @masked_gather_v2i8(<2 x i8>* %a, <2 x i8*>* %b) #0 {
 ; CHECK-NEXT: cmpne [[MASK:p[0-9]+]].s, [[PG0]]/z, z[[CMP]].s, #0
 ; CHECK-NEXT: ld1sb { z[[RES:[0-9]+]].d }, [[MASK]]/z, [z[[PTRS]].d]
 ; CHECK-NEXT: xtn v[[XTN:[0-9]+]].2s, v[[RES]].2d
-; CHECK-NEXT: mov [[RES_HI:w[0-9]+]], v[[XTN]].s[1]
-; CHECK-NEXT: fmov [[RES_LO:w[0-9]+]], s[[XTN]]
-; CHECK-NEXT: strb [[RES_LO]], [x0]
-; CHECK-NEXT: strb [[RES_HI]], [x0, #1]
+; CHECK-NEXT: st1b { z[[XTN]].s }, [[PG0]], [x0]
 ; CHECK-NEXT: ret
   %cval = load <2 x i8>, <2 x i8>* %a
   %ptrs = load <2 x i8*>, <2 x i8*>* %b
@@ -61,8 +58,7 @@ define void @masked_gather_v4i8(<4 x i8>* %a, <4 x i8*>* %b) #0 {
 ; CHECK-NEXT: ld1sb { [[RES:z[0-9]+]].d }, [[MASK]]/z, {{\[}}[[PTRS]].d]
 ; CHECK-NEXT: uzp1 [[UZP1:z[0-9]+]].s, [[RES]].s, [[RES]].s
 ; CHECK-NEXT: uzp1 z[[UZP2:[0-9]+]].h, [[UZP1]].h, [[UZP1]].h
-; CHECK-NEXT: uzp1 v[[UZP3:[0-9]+]].8b, v[[UZP2]].8b, v[[UZP2]].8b
-; CHECK-NEXT: str s[[UZP3]], [x0]
+; CHECK-NEXT: st1b { z[[UZP2]].h }, [[PG0]], [x0]
 ; CHECK-NEXT: ret
   %cval = load <4 x i8>, <4 x i8>* %a
   %ptrs = load <4 x i8*>, <4 x i8*>* %b
@@ -178,10 +174,7 @@ define void @masked_gather_v2i16(<2 x i16>* %a, <2 x i16*>* %b) #0 {
 ; CHECK-NEXT: cmpne [[MASK:p[0-9]+]].s, [[PG0]]/z, z[[CMP]].s, #0
 ; CHECK-NEXT: ld1sh { z[[RES:[0-9]+]].d }, [[MASK]]/z, [z[[PTRS]].d]
 ; CHECK-NEXT: xtn v[[XTN:[0-9]+]].2s, v[[RES]].2d
-; CHECK-NEXT: mov [[RES_HI:w[0-9]+]], v[[XTN]].s[1]
-; CHECK-NEXT: fmov [[RES_LO:w[0-9]+]], s[[XTN]]
-; CHECK-NEXT: strh [[RES_LO]], [x0]
-; CHECK-NEXT: strh [[RES_HI]], [x0, #2]
+; CHECK-NEXT: st1h { z[[RES]].s }, [[PG0]], [x0]
 ; CHECK-NEXT: ret
   %cval = load <2 x i16>, <2 x i16>* %a
   %ptrs = load <2 x i16*>, <2 x i16*>* %b
Lines changed: 218 additions & 0 deletions
@@ -0,0 +1,218 @@
+; RUN: llc -aarch64-sve-vector-bits-min=128 < %s | FileCheck %s -D#VBYTES=16 -check-prefix=NO_SVE
+; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -D#VBYTES=32 -check-prefixes=CHECK,VBITS_EQ_256
+; RUN: llc -aarch64-sve-vector-bits-min=384 < %s | FileCheck %s -D#VBYTES=32 -check-prefixes=CHECK
+; RUN: llc -aarch64-sve-vector-bits-min=512 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512
+; RUN: llc -aarch64-sve-vector-bits-min=640 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512
+; RUN: llc -aarch64-sve-vector-bits-min=768 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512
+; RUN: llc -aarch64-sve-vector-bits-min=896 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512
+; RUN: llc -aarch64-sve-vector-bits-min=1024 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_1024,VBITS_GE_512
+; RUN: llc -aarch64-sve-vector-bits-min=1152 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_1024,VBITS_GE_512
+; RUN: llc -aarch64-sve-vector-bits-min=1280 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_1024,VBITS_GE_512
+; RUN: llc -aarch64-sve-vector-bits-min=1408 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_1024,VBITS_GE_512
+; RUN: llc -aarch64-sve-vector-bits-min=1536 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_1024,VBITS_GE_512
+; RUN: llc -aarch64-sve-vector-bits-min=1664 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_1024,VBITS_GE_512
+; RUN: llc -aarch64-sve-vector-bits-min=1792 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_1024,VBITS_GE_512
+; RUN: llc -aarch64-sve-vector-bits-min=1920 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_1024,VBITS_GE_512
+; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -D#VBYTES=256 -check-prefixes=CHECK,VBITS_GE_2048,VBITS_GE_1024,VBITS_GE_512
+
+target triple = "aarch64-unknown-linux-gnu"
+
+; Don't use SVE when its registers are no bigger than NEON.
+; NO_SVE-NOT: ptrue
+
+define void @store_trunc_v2i64i8(<2 x i64>* %ap, <2 x i8>* %dest) #0 {
+; CHECK-LABEL: store_trunc_v2i64i8
+; CHECK: ldr q[[Q0:[0-9]+]], [x0]
+; CHECK: ptrue p[[P0:[0-9]+]].d, vl2
+; CHECK-NEXT: st1b { z[[Q0]].d }, p[[P0]], [x{{[0-9]+}}]
+; CHECK-NEXT: ret
+  %a = load <2 x i64>, <2 x i64>* %ap
+  %val = trunc <2 x i64> %a to <2 x i8>
+  store <2 x i8> %val, <2 x i8>* %dest
+  ret void
+}
+
+define void @store_trunc_v4i64i8(<4 x i64>* %ap, <4 x i8>* %dest) #0 {
+; CHECK-LABEL: store_trunc_v4i64i8
+; CHECK: ptrue p[[P0:[0-9]+]].d, vl4
+; CHECK-NEXT: ld1d { [[Z0:z[0-9]+]].d }, p0/z, [x0]
+; CHECK-NEXT: st1b { [[Z0]].d }, p[[P0]], [x{{[0-9]+}}]
+; CHECK-NEXT: ret
+  %a = load <4 x i64>, <4 x i64>* %ap
+  %val = trunc <4 x i64> %a to <4 x i8>
+  store <4 x i8> %val, <4 x i8>* %dest
+  ret void
+}
+
+define void @store_trunc_v8i64i8(<8 x i64>* %ap, <8 x i8>* %dest) #0 {
+; CHECK-LABEL: store_trunc_v8i64i8:
+; VBITS_GE_512: ptrue p[[P0:[0-9]+]].d, vl8
+; VBITS_GE_512-NEXT: ld1d { [[Z0:z[0-9]+]].d }, p0/z, [x0]
+; VBITS_GE_512-NEXT: st1b { [[Z0]].d }, p[[P0]], [x{{[0-9]+}}]
+; VBITS_GE_512-NEXT: ret
+
+; Ensure sensible type legalisation
+; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].d, vl4
+; VBITS_EQ_256-DAG: ld1d { [[Z0:z[0-9]+]].d }, [[PG]]/z, [x8]
+; VBITS_EQ_256-DAG: ld1d { [[Z1:z[0-9]+]].d }, [[PG]]/z, [x0]
+; VBITS_EQ_256-DAG: ptrue [[PG]].s, vl4
+; VBITS_EQ_256-DAG: uzp1 [[Z0]].s, [[Z0]].s, [[Z0]].s
+; VBITS_EQ_256-DAG: uzp1 [[Z1]].s, [[Z1]].s, [[Z1]].s
+; VBITS_EQ_256-DAG: splice [[Z1]].s, [[PG]], [[Z1]].s, [[Z0]].s
+; VBITS_EQ_256-DAG: ptrue [[PG]].s, vl8
+; VBITS_EQ_256-DAG: st1b { [[Z1]].s }, [[PG]], [x1]
+; VBITS_EQ_256-DAG: ret
+  %a = load <8 x i64>, <8 x i64>* %ap
+  %val = trunc <8 x i64> %a to <8 x i8>
+  store <8 x i8> %val, <8 x i8>* %dest
+  ret void
+}
+
+define void @store_trunc_v16i64i8(<16 x i64>* %ap, <16 x i8>* %dest) #0 {
+; CHECK-LABEL: store_trunc_v16i64i8:
+; VBITS_GE_1024: ptrue p[[P0:[0-9]+]].d, vl16
+; VBITS_GE_1024-NEXT: ld1d { [[Z0:z[0-9]+]].d }, p0/z, [x0]
+; VBITS_GE_1024-NEXT: st1b { [[Z0]].d }, p[[P0]], [x{{[0-9]+}}]
+; VBITS_GE_1024-NEXT: ret
+  %a = load <16 x i64>, <16 x i64>* %ap
+  %val = trunc <16 x i64> %a to <16 x i8>
+  store <16 x i8> %val, <16 x i8>* %dest
+  ret void
+}
+
+define void @store_trunc_v32i64i8(<32 x i64>* %ap, <32 x i8>* %dest) #0 {
+; CHECK-LABEL: store_trunc_v32i64i8:
+; VBITS_GE_2048: ptrue p[[P0:[0-9]+]].d, vl32
+; VBITS_GE_2048-NEXT: ld1d { [[Z0:z[0-9]+]].d }, p0/z, [x0]
+; VBITS_GE_2048-NEXT: st1b { [[Z0]].d }, p[[P0]], [x{{[0-9]+}}]
+; VBITS_GE_2048-NEXT: ret
+  %a = load <32 x i64>, <32 x i64>* %ap
+  %val = trunc <32 x i64> %a to <32 x i8>
+  store <32 x i8> %val, <32 x i8>* %dest
+  ret void
+}
+
+define void @store_trunc_v8i64i16(<8 x i64>* %ap, <8 x i16>* %dest) #0 {
+; CHECK-LABEL: store_trunc_v8i64i16:
+; VBITS_GE_512: ptrue p[[P0:[0-9]+]].d, vl8
+; VBITS_GE_512-NEXT: ld1d { [[Z0:z[0-9]+]].d }, p0/z, [x0]
+; VBITS_GE_512-NEXT: st1h { [[Z0]].d }, p[[P0]], [x{{[0-9]+}}]
+; VBITS_GE_512-NEXT: ret
+
+; Ensure sensible type legalisation.
+; Currently does not use the truncating store
+; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].d, vl4
+; VBITS_EQ_256-DAG: ld1d { [[Z0:z[0-9]+]].d }, [[PG]]/z, [x8]
+; VBITS_EQ_256-DAG: ld1d { [[Z1:z[0-9]+]].d }, [[PG]]/z, [x0]
+; VBITS_EQ_256-DAG: uzp1 [[Z0]].s, [[Z0]].s, [[Z0]].s
+; VBITS_EQ_256-DAG: uzp1 [[Z1]].s, [[Z1]].s, [[Z1]].s
+; VBITS_EQ_256-DAG: uzp1 [[Z1]].h, [[Z1]].h, [[Z1]].h
+; VBITS_EQ_256-DAG: uzp1 [[Z0]].h, [[Z0]].h, [[Z0]].h
+; VBITS_EQ_256-DAG: mov v[[V0:[0-9]+]].d[1], v{{[0-9]+}}.d[0]
+; VBITS_EQ_256-DAG: str q[[V0]], [x1]
+; VBITS_EQ_256-DAG: ret
+  %a = load <8 x i64>, <8 x i64>* %ap
+  %val = trunc <8 x i64> %a to <8 x i16>
+  store <8 x i16> %val, <8 x i16>* %dest
+  ret void
+}
+
+define void @store_trunc_v8i64i32(<8 x i64>* %ap, <8 x i32>* %dest) #0 {
+; CHECK-LABEL: store_trunc_v8i64i32:
+; VBITS_GE_512: ptrue p[[P0:[0-9]+]].d, vl8
+; VBITS_GE_512-NEXT: ld1d { [[Z0:z[0-9]+]].d }, p0/z, [x0]
+; VBITS_GE_512-NEXT: st1w { [[Z0]].d }, p[[P0]], [x{{[0-9]+}}]
+; VBITS_GE_512-NEXT: ret
+
+; Ensure sensible type legalisation
+; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].d, vl4
+; VBITS_EQ_256-DAG: ld1d { [[Z0:z[0-9]+]].d }, [[PG]]/z, [x8]
+; VBITS_EQ_256-DAG: ld1d { [[Z1:z[0-9]+]].d }, [[PG]]/z, [x0]
+; VBITS_EQ_256-DAG: ptrue [[PG]].s, vl4
+; VBITS_EQ_256-DAG: uzp1 [[Z0]].s, [[Z0]].s, [[Z0]].s
+; VBITS_EQ_256-DAG: uzp1 [[Z1]].s, [[Z1]].s, [[Z1]].s
+; VBITS_EQ_256-DAG: splice [[Z1]].s, [[PG]], [[Z1]].s, [[Z0]].s
+; VBITS_EQ_256-DAG: ptrue [[PG]].s, vl8
+; VBITS_EQ_256-DAG: st1w { [[Z1]].s }, [[PG]], [x1]
+; VBITS_EQ_256-DAG: ret
+  %a = load <8 x i64>, <8 x i64>* %ap
+  %val = trunc <8 x i64> %a to <8 x i32>
+  store <8 x i32> %val, <8 x i32>* %dest
+  ret void
+}
+
+define void @store_trunc_v16i32i8(<16 x i32>* %ap, <16 x i8>* %dest) #0 {
+; CHECK-LABEL: store_trunc_v16i32i8:
+; VBITS_GE_512: ptrue p[[P0:[0-9]+]].s, vl16
+; VBITS_GE_512-NEXT: ld1w { [[Z0:z[0-9]+]].s }, p0/z, [x0]
+; VBITS_GE_512-NEXT: st1b { [[Z0]].s }, p[[P0]], [x{{[0-9]+}}]
+; VBITS_GE_512-NEXT: ret
+
+; Ensure sensible type legalisation.
+; Currently does not use the truncating store
+; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].s, vl8
+; VBITS_EQ_256-DAG: ld1w { [[Z0:z[0-9]+]].s }, [[PG]]/z, [x8]
+; VBITS_EQ_256-DAG: ld1w { [[Z1:z[0-9]+]].s }, [[PG]]/z, [x0]
+; VBITS_EQ_256-DAG: uzp1 [[Z0]].h, [[Z0]].h, [[Z0]].h
+; VBITS_EQ_256-DAG: uzp1 [[Z1]].h, [[Z1]].h, [[Z1]].h
+; VBITS_EQ_256-DAG: uzp1 [[Z1]].b, [[Z1]].b, [[Z1]].b
+; VBITS_EQ_256-DAG: uzp1 [[Z0]].b, [[Z0]].b, [[Z0]].b
+; VBITS_EQ_256-DAG: mov v[[V0:[0-9]+]].d[1], v{{[0-9]+}}.d[0]
+; VBITS_EQ_256-DAG: str q[[V0]], [x1]
+; VBITS_EQ_256-DAG: ret
+  %a = load <16 x i32>, <16 x i32>* %ap
+  %val = trunc <16 x i32> %a to <16 x i8>
+  store <16 x i8> %val, <16 x i8>* %dest
+  ret void
+}
+
+define void @store_trunc_v16i32i16(<16 x i32>* %ap, <16 x i16>* %dest) #0 {
+; CHECK-LABEL: store_trunc_v16i32i16:
+; VBITS_GE_512: ptrue p[[P0:[0-9]+]].s, vl16
+; VBITS_GE_512-NEXT: ld1w { [[Z0:z[0-9]+]].s }, p0/z, [x0]
+; VBITS_GE_512-NEXT: st1h { [[Z0]].s }, p[[P0]], [x{{[0-9]+}}]
+; VBITS_GE_512-NEXT: ret
+
+; Ensure sensible type legalisation
+; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].s, vl8
+; VBITS_EQ_256-DAG: ld1w { [[Z0:z[0-9]+]].s }, [[PG]]/z, [x8]
+; VBITS_EQ_256-DAG: ld1w { [[Z1:z[0-9]+]].s }, [[PG]]/z, [x0]
+; VBITS_EQ_256-DAG: ptrue [[PG]].h, vl8
+; VBITS_EQ_256-DAG: uzp1 [[Z0]].h, [[Z0]].h, [[Z0]].h
+; VBITS_EQ_256-DAG: uzp1 [[Z1]].h, [[Z1]].h, [[Z1]].h
+; VBITS_EQ_256-DAG: splice [[Z1]].h, [[PG]], [[Z1]].h, [[Z0]].h
+; VBITS_EQ_256-DAG: ptrue [[PG]].h, vl16
+; VBITS_EQ_256-DAG: st1h { [[Z1]].h }, [[PG]], [x1]
+; VBITS_EQ_256-DAG: ret
+  %a = load <16 x i32>, <16 x i32>* %ap
+  %val = trunc <16 x i32> %a to <16 x i16>
+  store <16 x i16> %val, <16 x i16>* %dest
+  ret void
+}
+
+define void @store_trunc_v32i16i8(<32 x i16>* %ap, <32 x i8>* %dest) #0 {
+; CHECK-LABEL: store_trunc_v32i16i8:
+; VBITS_GE_512: ptrue p[[P0:[0-9]+]].h, vl32
+; VBITS_GE_512-NEXT: ld1h { [[Z0:z[0-9]+]].h }, p0/z, [x0]
+; VBITS_GE_512-NEXT: st1b { [[Z0]].h }, p[[P0]], [x{{[0-9]+}}]
+; VBITS_GE_512-NEXT: ret
+
+; Ensure sensible type legalisation
+; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].h, vl16
+; VBITS_EQ_256-DAG: ld1h { [[Z0:z[0-9]+]].h }, [[PG]]/z, [x8]
+; VBITS_EQ_256-DAG: ld1h { [[Z1:z[0-9]+]].h }, [[PG]]/z, [x0]
+; VBITS_EQ_256-DAG: ptrue [[PG]].b, vl16
+; VBITS_EQ_256-DAG: uzp1 [[Z0]].b, [[Z0]].b, [[Z0]].b
+; VBITS_EQ_256-DAG: uzp1 [[Z1]].b, [[Z1]].b, [[Z1]].b
+; VBITS_EQ_256-DAG: splice [[Z1]].b, [[PG]], [[Z1]].b, [[Z0]].b
+; VBITS_EQ_256-DAG: ptrue [[PG]].b, vl32
+; VBITS_EQ_256-DAG: st1b { [[Z1]].b }, [[PG]], [x1]
+; VBITS_EQ_256-DAG: ret
+  %a = load <32 x i16>, <32 x i16>* %ap
+  %val = trunc <32 x i16> %a to <32 x i8>
+  store <32 x i8> %val, <32 x i8>* %dest
+  ret void
+}
+
+
+attributes #0 = { "target-features"="+sve" }
