Commit 751c17b
[AArch64] Add custom lowering for v4i8 trunc store
This patch adds a custom truncating-store lowering for v4i8 vector types. Since there is no v.4b register, v4i8 is promoted to v4i16 (v.4h), and the default action for a v4i8 truncating store is to extract each element and issue four byte stores. A better strategy is to extend the promoted v4i16 to v8i16 (with undef elements), truncate, and then extract and store the word lane that represents the v4i8 subvector. The construct:

    define void @foo(<4 x i16> %x, i8* nocapture %p) {
      %0 = trunc <4 x i16> %x to <4 x i8>
      %1 = bitcast i8* %p to <4 x i8>*
      store <4 x i8> %0, <4 x i8>* %1, align 4, !tbaa !2
      ret void
    }

can then be optimized from:

    umov w8, v0.h[3]
    umov w9, v0.h[2]
    umov w10, v0.h[1]
    umov w11, v0.h[0]
    strb w8, [x0, #3]
    strb w9, [x0, #2]
    strb w10, [x0, #1]
    strb w11, [x0]
    ret

to:

    xtn v0.8b, v0.8h
    str s0, [x0]
    ret

The patch also adjusts the memory-op cost for autovectorization, so the C code:

    void foo (const int *src, int width, unsigned char *dst) {
      for (int i = 0; i < width; i++)
        *dst++ = *src++;
    }

can be vectorized to:

    .LBB0_4:                        // %vector.body
                                    // =>This Inner Loop Header: Depth=1
        ldr  q0, [x0], #16
        subs x12, x12, #4           // =4
        xtn  v0.4h, v0.4s
        xtn  v0.8b, v0.8h
        st1  { v0.s }[0], [x2], #4
        b.ne .LBB0_4

instead of byte operations.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@335735 91177308-0d34-0410-b5e6-96231b3b80d8
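As a reader's aid (not part of the commit), here is a minimal, self-contained C++ model of what the optimized sequence does on a little-endian target; the function name and test values are illustrative only. It performs per-lane truncation of the four halfword lanes (the xtn) followed by a single word-sized store (the str of an s register):

    // Minimal little-endian model (illustrative, not LLVM code): truncate
    // four u16 lanes to bytes (xtn v0.8b, v0.8h) and write them with one
    // 32-bit store (str s0, [x0]) instead of four strb instructions.
    #include <cstdint>
    #include <cstdio>
    #include <cstring>

    static void truncStoreV4i8(const uint16_t x[4], uint8_t *p) {
      uint8_t lanes[4];
      for (int i = 0; i < 4; ++i)
        lanes[i] = static_cast<uint8_t>(x[i]); // per-lane truncation
      std::memcpy(p, lanes, 4);                // single word-sized store
    }

    int main() {
      uint16_t v[4] = {0x1234, 0x00ab, 0xffee, 0x0042};
      uint8_t out[4];
      truncStoreV4i8(v, out);
      std::printf("%02x %02x %02x %02x\n", out[0], out[1], out[2], out[3]);
      // expected: 34 ab ee 42
    }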

6 files changed: +95 additions, -10 deletions

lib/Target/AArch64/AArch64ISelLowering.cpp

Lines changed: 66 additions & 0 deletions

@@ -742,6 +742,8 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
       setOperationAction(ISD::FTRUNC, Ty, Legal);
       setOperationAction(ISD::FROUND, Ty, Legal);
     }
+
+    setTruncStoreAction(MVT::v4i16, MVT::v4i8, Custom);
   }
 
   PredictableSelectIsExpensive = Subtarget->predictableSelectIsExpensive();

@@ -2673,6 +2675,68 @@ SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
   }
 }
 
+// Custom lower trunc store for v4i8 vectors, since it is promoted to v4i16.
+static SDValue LowerTruncateVectorStore(SDLoc DL, StoreSDNode *ST,
+                                        EVT VT, EVT MemVT,
+                                        SelectionDAG &DAG) {
+  assert(VT.isVector() && "VT should be a vector type");
+  assert(MemVT == MVT::v4i8 && VT == MVT::v4i16);
+
+  SDValue Value = ST->getValue();
+
+  // This extends the promoted v4i16 to v8i16, truncates to v8i8, and extracts
+  // the word lane which represents the v4i8 subvector. It optimizes the store
+  // to:
+  //
+  //   xtn  v0.8b, v0.8h
+  //   str  s0, [x0]
+
+  SDValue Undef = DAG.getUNDEF(MVT::i16);
+  SDValue UndefVec = DAG.getBuildVector(MVT::v4i16, DL,
+                                        {Undef, Undef, Undef, Undef});
+
+  SDValue TruncExt = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i16,
+                                 Value, UndefVec);
+  SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i8, TruncExt);
+
+  Trunc = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Trunc);
+  SDValue ExtractTrunc = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32,
+                                     Trunc, DAG.getConstant(0, DL, MVT::i64));
+
+  return DAG.getStore(ST->getChain(), DL, ExtractTrunc,
+                      ST->getBasePtr(), ST->getMemOperand());
+}
+
+// Custom lowering for any store, vector or scalar, default or truncating.
+// Currently we only custom lower the truncating store from vector v4i16 to
+// v4i8.
+SDValue AArch64TargetLowering::LowerSTORE(SDValue Op,
+                                          SelectionDAG &DAG) const {
+  SDLoc Dl(Op);
+  StoreSDNode *StoreNode = cast<StoreSDNode>(Op);
+  assert(StoreNode && "Can only custom lower store nodes");
+
+  SDValue Value = StoreNode->getValue();
+
+  EVT VT = Value.getValueType();
+  EVT MemVT = StoreNode->getMemoryVT();
+
+  assert(VT.isVector() && "Can only custom lower vector store types");
+
+  unsigned AS = StoreNode->getAddressSpace();
+  unsigned Align = StoreNode->getAlignment();
+  if (Align < MemVT.getStoreSize() &&
+      !allowsMisalignedMemoryAccesses(MemVT, AS, Align, nullptr)) {
+    return scalarizeVectorStore(StoreNode, DAG);
+  }
+
+  if (StoreNode->isTruncatingStore()) {
+    return LowerTruncateVectorStore(Dl, StoreNode, VT, MemVT, DAG);
+  }
+
+  return SDValue();
+}
+
 SDValue AArch64TargetLowering::LowerOperation(SDValue Op,
                                               SelectionDAG &DAG) const {
   LLVM_DEBUG(dbgs() << "Custom lowering: ");

@@ -2784,6 +2848,8 @@ SDValue AArch64TargetLowering::LowerOperation(SDValue Op,
     return LowerMULH(Op, DAG);
   case ISD::INTRINSIC_WO_CHAIN:
     return LowerINTRINSIC_WO_CHAIN(Op, DAG);
+  case ISD::STORE:
+    return LowerSTORE(Op, DAG);
   case ISD::VECREDUCE_ADD:
   case ISD::VECREDUCE_SMAX:
   case ISD::VECREDUCE_SMIN:
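As a reading aid (again not part of the commit), the node sequence in LowerTruncateVectorStore can be mirrored by scalar code: widen v4i16 to v8i16 with don't-care lanes, narrow every lane to a byte (the v8i8 truncate selects to xtn), reinterpret the result as v2i32, and keep element 0, which on a little-endian target holds exactly the four truncated bytes. A hypothetical stand-alone sketch:

    // Illustrative model of CONCAT_VECTORS -> TRUNCATE -> BITCAST ->
    // EXTRACT_VECTOR_ELT above; the undef lanes are arbitrarily zero here.
    #include <cstdint>
    #include <cstdio>
    #include <cstring>

    static uint32_t lowerTruncV4i16ToV4i8(const uint16_t v4[4]) {
      uint16_t v8[8] = {v4[0], v4[1], v4[2], v4[3], 0, 0, 0, 0}; // concat w/ undef
      uint8_t v8i8[8];
      for (int i = 0; i < 8; ++i)
        v8i8[i] = static_cast<uint8_t>(v8[i]); // ISD::TRUNCATE (xtn)
      uint32_t v2i32[2];
      std::memcpy(v2i32, v8i8, 8);             // ISD::BITCAST to v2i32
      return v2i32[0];                         // EXTRACT_VECTOR_ELT, lane 0
    }

    int main() {
      const uint16_t v[4] = {0x1234, 0x00ab, 0xffee, 0x0042};
      std::printf("0x%08x\n", lowerTruncV4i16ToV4i8(v)); // 0x42eeab34
    }

Storing the returned 32-bit value through the original base pointer is then a single word store, which is what the DAG.getStore call with the original memory operand expresses.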

lib/Target/AArch64/AArch64ISelLowering.h

Lines changed: 2 additions & 0 deletions

@@ -524,6 +524,8 @@ class AArch64TargetLowering : public TargetLowering {
                           SmallVectorImpl<SDValue> &InVals, bool isThisReturn,
                           SDValue ThisVal) const;
 
+  SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG) const;
+
   SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const;
 
   bool isEligibleForTailCallOptimization(

lib/Target/AArch64/AArch64TargetTransformInfo.cpp

Lines changed: 16 additions & 8 deletions

@@ -634,14 +634,22 @@ int AArch64TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Ty,
     return LT.first * 2 * AmortizationCost;
   }
 
-  if (Ty->isVectorTy() && Ty->getVectorElementType()->isIntegerTy(8) &&
-      Ty->getVectorNumElements() < 8) {
-    // We scalarize the loads/stores because there is no v.4b register and we
-    // have to promote the elements to v.4h.
-    unsigned NumVecElts = Ty->getVectorNumElements();
-    unsigned NumVectorizableInstsToAmortize = NumVecElts * 2;
-    // We generate 2 instructions per vector element.
-    return NumVectorizableInstsToAmortize * NumVecElts * 2;
+  if (Ty->isVectorTy() && Ty->getVectorElementType()->isIntegerTy(8)) {
+    unsigned ProfitableNumElements;
+    if (Opcode == Instruction::Store)
+      // We use a custom trunc store lowering so v.4b should be profitable.
+      ProfitableNumElements = 4;
+    else
+      // We scalarize the loads because there is no v.4b register and we
+      // have to promote the elements to v.4h.
+      ProfitableNumElements = 8;
+
+    if (Ty->getVectorNumElements() < ProfitableNumElements) {
+      unsigned NumVecElts = Ty->getVectorNumElements();
+      unsigned NumVectorizableInstsToAmortize = NumVecElts * 2;
+      // We generate 2 instructions per vector element.
+      return NumVectorizableInstsToAmortize * NumVecElts * 2;
+    }
   }
 
   return LT.first;
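To make the numbers in the test update below concrete, here is a stand-alone recomputation of the amortized scalarization penalty used above (a sketch with local names; the non-penalized path returns the legalization cost LT.first, which is 1 for these types):

    // Recomputes the scalarization penalty from getMemoryOpCost above.
    #include <cstdio>

    static unsigned scalarizedI8VectorCost(unsigned NumVecElts) {
      unsigned NumVectorizableInstsToAmortize = NumVecElts * 2;
      return NumVectorizableInstsToAmortize * NumVecElts * 2; // 2 insts/elt
    }

    int main() {
      std::printf("%u\n", scalarizedI8VectorCost(2)); // 16: <2 x i8> store/load
      std::printf("%u\n", scalarizedI8VectorCost(4)); // 64: old <4 x i8> store
    }

A <4 x i8> store now falls outside the penalized range and simply costs LT.first, which is the 64 -> 1 change in test/Analysis/CostModel/AArch64/store.ll below.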

test/Analysis/CostModel/AArch64/store.ll

Lines changed: 1 addition & 1 deletion

@@ -59,7 +59,7 @@ define void @getMemoryOpCost() {
 ; these types (they get extended to v.4h/v.2s).
 ; CHECK: cost of 16 {{.*}} store
   store <2 x i8> undef, <2 x i8> * undef
-; CHECK: cost of 64 {{.*}} store
+; CHECK: cost of 1 {{.*}} store
   store <4 x i8> undef, <4 x i8> * undef
 ; CHECK: cost of 16 {{.*}} load
   load <2 x i8> , <2 x i8> * undef

test/CodeGen/AArch64/neon-truncStore-extLoad.ll

Lines changed: 10 additions & 0 deletions

@@ -20,6 +20,16 @@ define void @truncStore.v4i32(<4 x i32> %a, <4 x i16>* %result) {
   ret void
 }
 
+define void @truncStore.v4i8(<4 x i32> %a, <4 x i8>* %result) {
+; CHECK-LABEL: truncStore.v4i8:
+; CHECK: xtn [[TMP:(v[0-9]+)]].4h, v{{[0-9]+}}.4s
+; CHECK-NEXT: xtn [[TMP2:(v[0-9]+)]].8b, [[TMP]].8h
+; CHECK-NEXT: str s{{[0-9]+}}, [x{{[0-9]+}}]
+  %b = trunc <4 x i32> %a to <4 x i8>
+  store <4 x i8> %b, <4 x i8>* %result
+  ret void
+}
+
 define void @truncStore.v8i16(<8 x i16> %a, <8 x i8>* %result) {
 ; CHECK-LABEL: truncStore.v8i16:
 ; CHECK: xtn v{{[0-9]+}}.8b, v{{[0-9]+}}.8h

test/Transforms/LoopVectorize/AArch64/interleaved-vs-scalar.ll

Lines changed: 0 additions & 1 deletion

@@ -15,7 +15,6 @@ target triple = "aarch64--linux-gnu"
 ; CHECK: Found an estimated cost of 0 for VF 2 For instruction: {{.*}} load i8
 ; CHECK: vector.body
 ; CHECK: load i8
-; CHECK: load i8
 ; CHECK: br i1 {{.*}}, label %middle.block, label %vector.body
 
 define void @test(%pair* %p, i64 %n) {
