[AArch64] Generalize integer FPR lane stores for all types #134117
Conversation
@llvm/pr-subscribers-backend-aarch64

Author: Benjamin Maxwell (MacDue)

Changes: This helps avoid some pointless `fmovs` in some cases.

Follow on from: #129756

Patch is 74.76 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/134117.diff

36 Files Affected:
diff --git a/llvm/include/llvm/CodeGen/ValueTypes.td b/llvm/include/llvm/CodeGen/ValueTypes.td
index fc1a95e33380b..42c4830e94220 100644
--- a/llvm/include/llvm/CodeGen/ValueTypes.td
+++ b/llvm/include/llvm/CodeGen/ValueTypes.td
@@ -338,6 +338,8 @@ def amdgpuBufferFatPointer : ValueType<160, 234>;
// FIXME: Remove this and the getPointerType() override if MVT::i82 is added.
def amdgpuBufferStridedPointer : ValueType<192, 235>;
+def vi8 : ValueType<8, 236>; // 8-bit integer in FPR (AArch64)
+
let isNormalValueType = false in {
def token : ValueType<0, 504>; // TokenTy
def MetadataVT : ValueType<0, 505> { // Metadata
diff --git a/llvm/lib/CodeGen/ValueTypes.cpp b/llvm/lib/CodeGen/ValueTypes.cpp
index 0554b6387c5e6..c769568253b12 100644
--- a/llvm/lib/CodeGen/ValueTypes.cpp
+++ b/llvm/lib/CodeGen/ValueTypes.cpp
@@ -198,6 +198,8 @@ std::string EVT::getEVTString() const {
return "amdgpuBufferFatPointer";
case MVT::amdgpuBufferStridedPointer:
return "amdgpuBufferStridedPointer";
+ case MVT::vi8:
+ return "vi8";
}
}
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 1c8e3afdfd718..bdbae15719fea 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -401,6 +401,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
}
if (Subtarget->hasFPARMv8()) {
+ addRegisterClass(MVT::vi8, &AArch64::FPR8RegClass);
addRegisterClass(MVT::f16, &AArch64::FPR16RegClass);
addRegisterClass(MVT::bf16, &AArch64::FPR16RegClass);
addRegisterClass(MVT::f32, &AArch64::FPR32RegClass);
@@ -1393,6 +1394,8 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
}
}
+ setTruncStoreAction(MVT::v1i64, MVT::v1i8, Legal);
+
for (auto Op :
{ISD::FFLOOR, ISD::FNEARBYINT, ISD::FCEIL, ISD::FRINT, ISD::FTRUNC,
ISD::FROUND, ISD::FROUNDEVEN, ISD::FMAXNUM_IEEE, ISD::FMINNUM_IEEE,
@@ -23988,6 +23991,22 @@ static unsigned getFPSubregForVT(EVT VT) {
}
}
+static EVT get64BitVector(EVT ElVT) {
+ assert(ElVT.isSimple() && "Expected simple VT");
+ switch (ElVT.getSimpleVT().SimpleTy) {
+ case MVT::i8:
+ return MVT::v8i8;
+ case MVT::i16:
+ return MVT::v4i16;
+ case MVT::i32:
+ return MVT::v2i32;
+ case MVT::i64:
+ return MVT::v1i64;
+ default:
+ llvm_unreachable("Unexpected VT!");
+ }
+}
+
static SDValue performSTORECombine(SDNode *N,
TargetLowering::DAGCombinerInfo &DCI,
SelectionDAG &DAG,
@@ -24066,11 +24085,44 @@ static SDValue performSTORECombine(SDNode *N,
SDValue ExtIdx = Value.getOperand(1);
EVT VectorVT = Vector.getValueType();
EVT ElemVT = VectorVT.getVectorElementType();
- if (!ValueVT.isInteger() || ElemVT == MVT::i8 || MemVT == MVT::i8)
+ if (!ValueVT.isInteger())
return SDValue();
if (ValueVT != MemVT && !ST->isTruncatingStore())
return SDValue();
+ if (MemVT == MVT::i8) {
+ auto *ExtCst = dyn_cast<ConstantSDNode>(ExtIdx);
+ if (Subtarget->isNeonAvailable() &&
+ (VectorVT == MVT::v8i8 || VectorVT == MVT::v16i8) && ExtCst &&
+ !ExtCst->isZero() && ST->getBasePtr().getOpcode() != ISD::ADD) {
+ // These can lower to st1.b, which is preferable if we're unlikely to
+ // fold the addressing into the store.
+ return SDValue();
+ }
+
+ // Lower as truncstore of v1i64 -> v1i8 (which can lower to a bsub store).
+ SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
+ SDValue ExtVector;
+ EVT VecVT64 = get64BitVector(ElemVT);
+ if (ExtCst && ExtCst->isZero()) {
+ ExtVector =
+ DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VecVT64, Vector, Zero);
+ } else {
+ SDValue Ext = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL,
+ Value.getValueType(), Vector, ExtIdx);
+ ExtVector = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VecVT64,
+ DAG.getUNDEF(VecVT64), Ext, Zero);
+ }
+
+ SDValue Cast = DAG.getNode(AArch64ISD::NVCAST, DL, MVT::v1i64, ExtVector);
+ return DAG.getTruncStore(ST->getChain(), DL, Cast, ST->getBasePtr(),
+ MVT::v1i8, ST->getMemOperand());
+ }
+
+ // TODO: Handle storing i8s to wider types.
+ if (ElemVT == MVT::i8)
+ return SDValue();
+
// Heuristic: If there are other users of integer scalars extracted from
// this vector that won't fold into the store -- abandon folding. Applying
// this fold may extend the vector lifetime and disrupt paired stores.
@@ -28825,6 +28877,10 @@ SDValue AArch64TargetLowering::LowerFixedLengthVectorStoreToSVE(
auto Pg = getPredicateForFixedLengthVector(DAG, DL, VT);
auto NewValue = convertToScalableVector(DAG, ContainerVT, Store->getValue());
+ // Can be lowered to a bsub store in ISEL.
+ if (VT == MVT::v1i64 && MemVT == MVT::v1i8)
+ return SDValue();
+
if (VT.isFloatingPoint() && Store->isTruncatingStore()) {
EVT TruncVT = ContainerVT.changeVectorElementType(
Store->getMemoryVT().getVectorElementType());
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index 6c61e3a613f6f..349391d17b95b 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -3575,7 +3575,7 @@ defm LDRW : LoadUI<0b10, 0, 0b01, GPR32z, uimm12s4, "ldr",
(load (am_indexed32 GPR64sp:$Rn, uimm12s4:$offset)))]>;
let Predicates = [HasFPARMv8] in {
defm LDRB : LoadUI<0b00, 1, 0b01, FPR8Op, uimm12s1, "ldr",
- [(set FPR8Op:$Rt,
+ [(set (i8 FPR8Op:$Rt),
(load (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset)))]>;
defm LDRH : LoadUI<0b01, 1, 0b01, FPR16Op, uimm12s2, "ldr",
[(set (f16 FPR16Op:$Rt),
@@ -3763,7 +3763,7 @@ defm LDURW : LoadUnscaled<0b10, 0, 0b01, GPR32z, "ldur",
(load (am_unscaled32 GPR64sp:$Rn, simm9:$offset)))]>;
let Predicates = [HasFPARMv8] in {
defm LDURB : LoadUnscaled<0b00, 1, 0b01, FPR8Op, "ldur",
- [(set FPR8Op:$Rt,
+ [(set (i8 FPR8Op:$Rt),
(load (am_unscaled8 GPR64sp:$Rn, simm9:$offset)))]>;
defm LDURH : LoadUnscaled<0b01, 1, 0b01, FPR16Op, "ldur",
[(set (f16 FPR16Op:$Rt),
@@ -4333,7 +4333,7 @@ defm STRW : StoreUIz<0b10, 0, 0b00, GPR32z, uimm12s4, "str",
(am_indexed32 GPR64sp:$Rn, uimm12s4:$offset))]>;
let Predicates = [HasFPARMv8] in {
defm STRB : StoreUI<0b00, 1, 0b00, FPR8Op, uimm12s1, "str",
- [(store FPR8Op:$Rt,
+ [(store (i8 FPR8Op:$Rt),
(am_indexed8 GPR64sp:$Rn, uimm12s1:$offset))]>;
defm STRH : StoreUI<0b01, 1, 0b00, FPR16Op, uimm12s2, "str",
[(store (f16 FPR16Op:$Rt),
@@ -4469,7 +4469,7 @@ defm STURW : StoreUnscaled<0b10, 0, 0b00, GPR32z, "stur",
(am_unscaled32 GPR64sp:$Rn, simm9:$offset))]>;
let Predicates = [HasFPARMv8] in {
defm STURB : StoreUnscaled<0b00, 1, 0b00, FPR8Op, "stur",
- [(store FPR8Op:$Rt,
+ [(store (i8 FPR8Op:$Rt),
(am_unscaled8 GPR64sp:$Rn, simm9:$offset))]>;
defm STURH : StoreUnscaled<0b01, 1, 0b00, FPR16Op, "stur",
[(store (f16 FPR16Op:$Rt),
@@ -4589,6 +4589,18 @@ def : Pat<(truncstorei16 GPR64:$Rt, (am_unscaled16 GPR64sp:$Rn, simm9:$offset)),
def : Pat<(truncstorei8 GPR64:$Rt, (am_unscaled8 GPR64sp:$Rn, simm9:$offset)),
(STURBBi (EXTRACT_SUBREG GPR64:$Rt, sub_32), GPR64sp:$Rn, simm9:$offset)>;
+// v1i64 -> bsub truncating stores
+// Supporting pattern lower f32/64 -> v8i8
+def : Pat<(v8i8 (vector_insert (v8i8 (undef)), (i32 FPR32:$src), 0)),
+ (INSERT_SUBREG (v8i8 (IMPLICIT_DEF)), FPR32:$src, ssub)>;
+def : Pat<(v8i8 (vector_insert (v8i8 (undef)), (i64 FPR64:$src), 0)),
+ (v8i8 (EXTRACT_SUBREG (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), FPR64:$src, dsub), dsub))>;
+// Lower v1i64 -> v1i8 truncstore to bsub store
+def : Pat<(truncstorevi8 v1i64:$VT, (am_unscaled8 GPR64sp:$Rn, simm9:$offset)),
+ (STURBi (vi8 (EXTRACT_SUBREG v1i64:$VT, bsub)), GPR64sp:$Rn, simm9:$offset)>;
+def : Pat<(truncstorevi8 v1i64:$VT, (am_indexed8 GPR64sp:$Rn, uimm12s4:$offset)),
+ (STRBui (vi8 (EXTRACT_SUBREG v1i64:$VT, bsub)), GPR64sp:$Rn, uimm12s4:$offset)>;
+
// Match stores from lane 0 to the appropriate subreg's store.
multiclass VecStoreULane0Pat<SDPatternOperator StoreOp,
ValueType VTy, ValueType STy,
diff --git a/llvm/lib/Target/AArch64/AArch64RegisterInfo.td b/llvm/lib/Target/AArch64/AArch64RegisterInfo.td
index fed9b7b173e9c..42ba1451650ed 100644
--- a/llvm/lib/Target/AArch64/AArch64RegisterInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64RegisterInfo.td
@@ -497,7 +497,7 @@ def Q30 : AArch64Reg<30, "q30", [D30, D30_HI], ["v30", ""]>, DwarfRegAlias<B30
def Q31 : AArch64Reg<31, "q31", [D31, D31_HI], ["v31", ""]>, DwarfRegAlias<B31>;
}
-def FPR8 : RegisterClass<"AArch64", [i8], 8, (sequence "B%u", 0, 31)> {
+def FPR8 : RegisterClass<"AArch64", [i8, vi8], 8, (sequence "B%u", 0, 31)> {
let Size = 8;
let DecoderMethod = "DecodeSimpleRegisterClass<AArch64::FPR8RegClassID, 0, 32>";
}
diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
index 3ee71c14c6bd4..8179b253a86de 100644
--- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -3208,6 +3208,8 @@ let Predicates = [HasSVE_or_SME] in {
// Insert scalar into undef[0]
def : Pat<(nxv16i8 (vector_insert (nxv16i8 (undef)), (i32 FPR32:$src), 0)),
(INSERT_SUBREG (nxv16i8 (IMPLICIT_DEF)), FPR32:$src, ssub)>;
+ def : Pat<(nxv16i8 (vector_insert (nxv16i8 (undef)), (i64 FPR64:$src), 0)),
+ (INSERT_SUBREG (nxv16i8 (IMPLICIT_DEF)), FPR64:$src, dsub)>;
def : Pat<(nxv8i16 (vector_insert (nxv8i16 (undef)), (i32 FPR32:$src), 0)),
(INSERT_SUBREG (nxv8i16 (IMPLICIT_DEF)), FPR32:$src, ssub)>;
def : Pat<(nxv4i32 (vector_insert (nxv4i32 (undef)), (i32 FPR32:$src), 0)),
diff --git a/llvm/test/CodeGen/AArch64/aarch64-sve-ldst-one.ll b/llvm/test/CodeGen/AArch64/aarch64-sve-ldst-one.ll
index d39c9bf760621..713ddd9aefe01 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-sve-ldst-one.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-sve-ldst-one.ll
@@ -108,17 +108,15 @@ entry:
define void @test_str_lane_s8(ptr %a, <vscale x 16 x i8> %b) {
; CHECK-NONSTREAMING-LABEL: test_str_lane_s8:
; CHECK-NONSTREAMING: // %bb.0: // %entry
-; CHECK-NONSTREAMING-NEXT: umov w8, v0.b[7]
-; CHECK-NONSTREAMING-NEXT: strb w8, [x0]
+; CHECK-NONSTREAMING-NEXT: mov v0.b[0], v0.b[7]
+; CHECK-NONSTREAMING-NEXT: str b0, [x0]
; CHECK-NONSTREAMING-NEXT: ret
;
; STREAMING-COMPAT-LABEL: test_str_lane_s8:
; STREAMING-COMPAT: // %bb.0: // %entry
; STREAMING-COMPAT-NEXT: mov z0.b, z0.b[7]
-; STREAMING-COMPAT-NEXT: fmov w8, s0
-; STREAMING-COMPAT-NEXT: strb w8, [x0]
+; STREAMING-COMPAT-NEXT: str b0, [x0]
; STREAMING-COMPAT-NEXT: ret
-
entry:
%0 = extractelement <vscale x 16 x i8> %b, i32 7
store i8 %0, ptr %a, align 1
@@ -128,10 +126,8 @@ entry:
define void @test_str_lane0_s8(ptr %a, <vscale x 16 x i8> %b) {
; CHECK-LABEL: test_str_lane0_s8:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: fmov w8, s0
-; CHECK-NEXT: strb w8, [x0]
+; CHECK-NEXT: str b0, [x0]
; CHECK-NEXT: ret
-
entry:
%0 = extractelement <vscale x 16 x i8> %b, i32 0
store i8 %0, ptr %a, align 1
@@ -201,6 +197,19 @@ define void @test_str_reduction_i32_to_i16(ptr %ptr, <vscale x 4 x i1> %p0, <vsc
ret void
}
+define void @test_str_reduction_i32_to_i8(ptr %ptr, <vscale x 4 x i1> %p0, <vscale x 4 x i32> %v) {
+; CHECK-LABEL: test_str_reduction_i32_to_i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: uaddv d0, p0, z0.s
+; CHECK-NEXT: str b0, [x0]
+; CHECK-NEXT: ret
+
+ %reduce = tail call i64 @llvm.aarch64.sve.uaddv.nxv4i32(<vscale x 4 x i1> %p0, <vscale x 4 x i32> %v)
+ %trunc = trunc i64 %reduce to i8
+ store i8 %trunc, ptr %ptr, align 1
+ ret void
+}
+
define void @test_str_reduction_i32_to_i32_negative_offset(ptr %ptr, <vscale x 4 x i1> %p0, <vscale x 4 x i32> %v) {
; CHECK-LABEL: test_str_reduction_i32_to_i32_negative_offset:
; CHECK: // %bb.0:
@@ -242,6 +251,20 @@ define void @test_str_reduction_i32_to_i16_negative_offset(ptr %ptr, <vscale x 4
ret void
}
+define void @test_str_reduction_i32_to_i8_negative_offset(ptr %ptr, <vscale x 4 x i1> %p0, <vscale x 4 x i32> %v) {
+; CHECK-LABEL: test_str_reduction_i32_to_i8_negative_offset:
+; CHECK: // %bb.0:
+; CHECK-NEXT: uaddv d0, p0, z0.s
+; CHECK-NEXT: stur b0, [x0, #-8]
+; CHECK-NEXT: ret
+
+ %reduce = tail call i64 @llvm.aarch64.sve.uaddv.nxv4i32(<vscale x 4 x i1> %p0, <vscale x 4 x i32> %v)
+ %trunc = trunc i64 %reduce to i8
+ %out_ptr = getelementptr inbounds i8, ptr %ptr, i64 -8
+ store i8 %trunc, ptr %out_ptr, align 1
+ ret void
+}
+
define void @test_str_lane_s32_negative_offset(ptr %a, <vscale x 4 x i32> %b) {
; CHECK-LABEL: test_str_lane_s32_negative_offset:
; CHECK: // %bb.0: // %entry
@@ -299,17 +322,15 @@ entry:
define void @test_str_lane_s8_negative_offset(ptr %a, <vscale x 16 x i8> %b) {
; CHECK-NONSTREAMING-LABEL: test_str_lane_s8_negative_offset:
; CHECK-NONSTREAMING: // %bb.0: // %entry
-; CHECK-NONSTREAMING-NEXT: umov w8, v0.b[7]
-; CHECK-NONSTREAMING-NEXT: sturb w8, [x0, #-8]
+; CHECK-NONSTREAMING-NEXT: mov v0.b[0], v0.b[7]
+; CHECK-NONSTREAMING-NEXT: stur b0, [x0, #-8]
; CHECK-NONSTREAMING-NEXT: ret
;
; STREAMING-COMPAT-LABEL: test_str_lane_s8_negative_offset:
; STREAMING-COMPAT: // %bb.0: // %entry
; STREAMING-COMPAT-NEXT: mov z0.b, z0.b[7]
-; STREAMING-COMPAT-NEXT: fmov w8, s0
-; STREAMING-COMPAT-NEXT: sturb w8, [x0, #-8]
+; STREAMING-COMPAT-NEXT: stur b0, [x0, #-8]
; STREAMING-COMPAT-NEXT: ret
-
entry:
%0 = extractelement <vscale x 16 x i8> %b, i32 7
%out_ptr = getelementptr inbounds i8, ptr %a, i64 -8
@@ -320,10 +341,8 @@ entry:
define void @test_str_lane0_s8_negative_offset(ptr %a, <vscale x 16 x i8> %b) {
; CHECK-LABEL: test_str_lane0_s8_negative_offset:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: fmov w8, s0
-; CHECK-NEXT: sturb w8, [x0, #-8]
+; CHECK-NEXT: stur b0, [x0, #-8]
; CHECK-NEXT: ret
-
entry:
%0 = extractelement <vscale x 16 x i8> %b, i32 0
%out_ptr = getelementptr inbounds i8, ptr %a, i64 -8
@@ -385,6 +404,53 @@ entry:
ret void
}
+
+define void @test_str_trunc_lane_s32_to_s8(ptr %a, <vscale x 4 x i32> %b) {
+; CHECK-NONSTREAMING-LABEL: test_str_trunc_lane_s32_to_s8:
+; CHECK-NONSTREAMING: // %bb.0: // %entry
+; CHECK-NONSTREAMING-NEXT: mov v0.s[0], v0.s[3]
+; CHECK-NONSTREAMING-NEXT: str b0, [x0]
+; CHECK-NONSTREAMING-NEXT: ret
+;
+; STREAMING-COMPAT-LABEL: test_str_trunc_lane_s32_to_s8:
+; STREAMING-COMPAT: // %bb.0: // %entry
+; STREAMING-COMPAT-NEXT: mov z0.s, z0.s[3]
+; STREAMING-COMPAT-NEXT: str b0, [x0]
+; STREAMING-COMPAT-NEXT: ret
+entry:
+ %0 = extractelement <vscale x 4 x i32> %b, i32 3
+ %trunc = trunc i32 %0 to i8
+ store i8 %trunc, ptr %a, align 1
+ ret void
+}
+
+define void @test_str_trunc_lane0_s32_to_s8(ptr %a, <vscale x 4 x i32> %b) {
+; CHECK-LABEL: test_str_trunc_lane0_s32_to_s8:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: str b0, [x0]
+; CHECK-NEXT: ret
+
+entry:
+ %0 = extractelement <vscale x 4 x i32> %b, i32 0
+ %trunc = trunc i32 %0 to i8
+ store i8 %trunc, ptr %a, align 1
+ ret void
+}
+
+define void @test_str_trunc_lane_s64_to_s8(ptr %a, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: test_str_trunc_lane_s64_to_s8:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: mov z0.d, z0.d[3]
+; CHECK-NEXT: str b0, [x0]
+; CHECK-NEXT: ret
+
+entry:
+ %0 = extractelement <vscale x 2 x i64> %b, i32 3
+ %trunc = trunc i64 %0 to i8
+ store i8 %trunc, ptr %a, align 1
+ ret void
+}
+
define void @test_str_trunc_lane_s32_to_s16_negative_offset(ptr %a, <vscale x 4 x i32> %b) {
; CHECK-LABEL: test_str_trunc_lane_s32_to_s16_negative_offset:
; CHECK: // %bb.0: // %entry
@@ -413,3 +479,52 @@ entry:
store i16 %trunc, ptr %out_ptr, align 2
ret void
}
+
+define void @test_str_trunc_lane_s32_to_s8_negative_offset(ptr %a, <vscale x 4 x i32> %b) {
+; CHECK-NONSTREAMING-LABEL: test_str_trunc_lane_s32_to_s8_negative_offset:
+; CHECK-NONSTREAMING: // %bb.0: // %entry
+; CHECK-NONSTREAMING-NEXT: mov v0.s[0], v0.s[3]
+; CHECK-NONSTREAMING-NEXT: stur b0, [x0, #-8]
+; CHECK-NONSTREAMING-NEXT: ret
+;
+; STREAMING-COMPAT-LABEL: test_str_trunc_lane_s32_to_s8_negative_offset:
+; STREAMING-COMPAT: // %bb.0: // %entry
+; STREAMING-COMPAT-NEXT: mov z0.s, z0.s[3]
+; STREAMING-COMPAT-NEXT: stur b0, [x0, #-8]
+; STREAMING-COMPAT-NEXT: ret
+entry:
+ %0 = extractelement <vscale x 4 x i32> %b, i32 3
+ %trunc = trunc i32 %0 to i8
+ %out_ptr = getelementptr inbounds i8, ptr %a, i64 -8
+ store i8 %trunc, ptr %out_ptr, align 1
+ ret void
+}
+
+define void @test_str_trunc_lane0_s32_to_s8_negative_offset(ptr %a, <vscale x 4 x i32> %b) {
+; CHECK-LABEL: test_str_trunc_lane0_s32_to_s8_negative_offset:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: stur b0, [x0, #-8]
+; CHECK-NEXT: ret
+
+entry:
+ %0 = extractelement <vscale x 4 x i32> %b, i32 0
+ %trunc = trunc i32 %0 to i8
+ %out_ptr = getelementptr inbounds i8, ptr %a, i64 -8
+ store i8 %trunc, ptr %out_ptr, align 1
+ ret void
+}
+
+define void @test_str_trunc_lane_s64_to_s8_negative_offset(ptr %a, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: test_str_trunc_lane_s64_to_s8_negative_offset:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: mov z0.d, z0.d[3]
+; CHECK-NEXT: stur b0, [x0, #-8]
+; CHECK-NEXT: ret
+
+entry:
+ %0 = extractelement <vscale x 2 x i64> %b, i32 3
+ %trunc = trunc i64 %0 to i8
+ %out_ptr = getelementptr inbounds i8, ptr %a, i64 -8
+ store i8 %trunc, ptr %out_ptr, align 1
+ ret void
+}
diff --git a/llvm/test/CodeGen/AArch64/add.ll b/llvm/test/CodeGen/AArch64/add.ll
index fc0ba336b21cc..ea5dbc03ca174 100644
--- a/llvm/test/CodeGen/AArch64/add.ll
+++ b/llvm/test/CodeGen/AArch64/add.ll
@@ -63,10 +63,9 @@ define void @v2i8(ptr %p1, ptr %p2) {
; CHECK-SD-NEXT: ld1 { v0.b }[4], [x8]
; CHECK-SD-NEXT: ld1 { v1.b }[4], [x9]
; CHECK-SD-NEXT: add v0.2s, v0.2s, v1.2s
-; CHECK-SD-NEXT: mov w8, v0.s[1]
-; CHECK-SD-NEXT: fmov w9, s0
-; CHECK-SD-NEXT: strb w9, [x0]
-; CHECK-SD-NEXT: strb w8, [x0, #1]
+; CHECK-SD-NEXT: mov v1.s[0], v0.s[1]
+; CHECK-SD-NEXT: str b0, [x0]
+; CHECK-SD-NEXT: stur b1, [x0, #1]
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: v2i8:
@@ -101,11 +100,11 @@ define void @v3i8(ptr %p1, ptr %p2) {
; CHECK-SD-NEXT: zip1 v1.8b, v1.8b, v0.8b
; CHECK-SD-NEXT: add v0.4h, v0.4h, v1.4h
; CHECK-SD-NEXT: uzp1 v1.8b, v0.8b, v0.8b
-; CHECK-SD-NEXT: umov w8, v0.h[2]
+; CHECK-SD-NEXT: mov v0.h[0], v0.h[2]
; CHECK-SD-NEXT: str s1, [sp, #12]
-; CHECK-SD-NEXT: ldrh w9, [sp, #12]
-; CHECK-SD-NEXT: strb w8, [x0, #2]
-; CHECK-SD-NEXT: strh w9, [x0]
+; CHECK-SD-NEXT: ldrh w8, [sp, #12]
+; CHECK-SD-NEXT: stur b0, [x0, #2]
+; CHECK-SD-NEXT: strh w8, [x0]
; CHECK-SD-NEXT: add sp, sp, #16
; CHECK-SD-NEXT: ret
;
diff --git a/llvm/test/CodeGen/AArch64/andorxor.ll b/llvm/test/CodeGen/AArch64/andorxor.ll
index 24f2549cce785..adfcf26f85ba4 100644
--- a/llvm/test/CodeGen/AArch64/andorxor.ll
+++ b/llvm/test/CodeGen/AArch64/andorxor.ll
@@ -183,10 +183,9 @@ define void @and_v2i8(ptr %p1, ptr %p2) {
; CHECK-SD-NEXT: ld1 { v0.b }[4], [x8]
; CHECK-SD-NEXT: ld1 { v1.b }[4], [x9]
; CHECK-SD-NEXT: and v0.8b, v0.8b, v1.8b
-; CHECK-SD-NEXT: mov w8, v0.s[1]
-; CHECK-SD-NEXT: fmov w9, s0
-; CHECK-SD-NEXT: strb w9, [x0]
-; CHECK-SD-NEXT: strb w8, [x0, #1]
+; CHECK-SD-NEXT: mov v1.s[0], v0.s[1]
+; CHECK-SD-NEXT: str b0, [x0]
+; CHECK-SD-NEXT: stur b1, [x0, #1]
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: and_v2i8:
@@ -220,1...
[truncated]
I believe the decision to have a dedicated MVT for mfp8 means we can simplify things further, because we don't need the truncating store (given that was my failed suggestion to remove the need for a dedicated MVT) and can instead have isel do the store directly.

Whilst the new MVT makes this slightly more awkward, I think we're only really talking about a small amount of extra logic, but beyond the VTs in play the resulting DAG will hopefully be the same across all the types.
It's not quite simplified (as the fold is a little more complex in general), but I've been able to rewrite this (and the previous fold) into a unified fold that works the same for all types. This has added quite a bit more test churn, though it mostly seems innocuous. I've split out one patch to regenerate the …
It might be easier if the generalization of the fold were a separate patch (and this PR landed the previous byte store combine). That way, the test changes due to changing the non-i8 fold would be in their own patch. WDYT?
It's not worth it. If we'd come to this design earlier, we would have just landed support for all types anyway.
This helps avoid some pointless `fmovs` in some cases. Currently, this is done in ISEL as FPR bytes are problematic in SDAG (as neither GPR nor FPR bytes are a legal type).
I think we're pretty much there now with the bones of the patch looking good. That said, there are a couple of idiom switches (most importantly the "int add" -> "vector mov") that I would like us to avoid if possible.
llvm/test/CodeGen/AArch64/add.ll (Outdated)
; CHECK-SD-NEXT: mov v1.h[0], v0.h[2]
; CHECK-SD-NEXT: str s0, [x0]
; CHECK-SD-NEXT: str h1, [x0, #4]
This looks like a negative change because the original integer add is likely to be cheaper than the vector mov.
I believe it could be an improvement, as `st1` is basically `str` + `mov`. Looking at the Neoverse V1 optimization guide, I get:

add => latency of 1
st1 (b/h/s) => latency of 4

vs

ins => latency of 2
str (b/h/s/d) => latency of 2
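For reference, the same kind of idiom switch can be seen in `test_str_lane_s8` from `aarch64-sve-ldst-one.ll` earlier in this diff; the IR below is quoted from that test, with the before/after codegen (taken from the patch's CHECK lines) shown as comments:

```llvm
; Before the patch: umov w8, v0.b[7]
;                   strb w8, [x0]
; After the patch:  mov  v0.b[0], v0.b[7]
;                   str  b0, [x0]
define void @test_str_lane_s8(ptr %a, <vscale x 16 x i8> %b) {
entry:
  %0 = extractelement <vscale x 16 x i8> %b, i32 7
  store i8 %0, ptr %a, align 1
  ret void
}
```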
I see what you mean.
One last comment.
// Propagate zero constants (applying this fold may miss optimizations).
if (ISD::isConstantSplatVectorAllZeros(Vector.getNode())) {
  SDValue ZeroElt = DAG.getConstant(0, DL, ValueVT);
  DAG.ReplaceAllUsesWith(Value, ZeroElt);
  return SDValue();
}
I'd say we're missing an obvious combine, and looking at `DAGCombiner::visitBITCAST` I see:

TODO: Support FP bitcasts after legalize types

It's not worth looking for trouble, so I'll take the current fix and we can circle back later if necessary. That said, `DAG.getConstant` is specific to integers, so the code needs moving after the `!ValueVT.isInteger()` bailout.
👍 I added this fold:
+ // extract_vector_elt of undef index -> UNDEF
+ if (Index.isUndef())
+ return DAG.getUNDEF(ScalarVT);
+
+ // extract_vector_elt of zero splat -> zero
+ if (ISD::isConstantSplatVectorAllZeros(VecOp.getNode()))
+ return ScalarVT.isFloatingPoint() ? DAG.getConstantFP(0.0, DL, ScalarVT)
+ : DAG.getConstant(0, DL, ScalarVT);
+
to `visitEXTRACT_VECTOR_ELT`, which also seemed to resolve this (and a few more cases). But I can maybe post that in a later patch (since it results in changes across a few targets).
Yep, definitely future work. It's worth looking at `visitBITCAST` first though, because I think it only affects floating-point build_vectors that are bitcast to integer. When the "redundant" bitcasting is removed, I believe existing combines will then take over.
This rewrites the fold from #129756 to apply to all types, including stores of i8s. This required adding a new `aarch64mfp8` MVT to represent FPR8 types on AArch64, which can be used to extract and store 8-bit values using b sub-registers.

Follow on from: #129756
Closes: #131793
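As a concrete illustration of the generalized fold, one of the new tests added to `aarch64-sve-ldst-one.ll` (shown in the diff above) truncates an extracted i64 lane to i8 and stores it with a single b sub-register store; the expected codegen from the patch's CHECK lines is included as comments:

```llvm
; Expected codegen: mov z0.d, z0.d[3]
;                   str b0, [x0]
define void @test_str_trunc_lane_s64_to_s8(ptr %a, <vscale x 2 x i64> %b) {
entry:
  %0 = extractelement <vscale x 2 x i64> %b, i32 3
  %trunc = trunc i64 %0 to i8
  store i8 %trunc, ptr %a, align 1
  ret void
}
```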