Skip to content

Commit 2f7cab8

Browse files
author
git apple-llvm automerger
committed
Merge commit '02a1d311bde4' from llvm.org/main into next
2 parents f9965c5 + 02a1d31 commit 2f7cab8

11 files changed

+838
-913
lines changed

llvm/lib/Target/AArch64/AArch64InstrFormats.td

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -177,6 +177,11 @@ def dup_v4f32 :
177177
[(v2f32 (extract_subvector (v4f32 (AArch64duplane32 (v4f32 node:$LHS), node:$RHS)), (i64 0))),
178178
(v2f32 (AArch64duplane32 (v4f32 node:$LHS), node:$RHS))]>;
179179

180+
// Match either a scalar_to_vector (from SDAG) or a vector_insert of undef (from GISel)
181+
def vec_ins_or_scal_vec : PatFrags<(ops node:$src),
182+
[(vector_insert undef, node:$src, (i64 0)),
183+
(scalar_to_vector node:$src)]>;
184+
180185
//===----------------------------------------------------------------------===//
181186
// Asm Operand Classes.
182187
//

llvm/lib/Target/AArch64/AArch64InstrInfo.td

Lines changed: 81 additions & 133 deletions
Original file line numberDiff line numberDiff line change
@@ -3324,63 +3324,6 @@ defm LDRSW : Load32RO<0b10, 0, 0b10, GPR64, "ldrsw", i64, sextloadi32>;
33243324
// Pre-fetch.
33253325
defm PRFM : PrefetchRO<0b11, 0, 0b10, "prfm">;
33263326

3327-
def vec_ins_or_scal_vec : PatFrags<(ops node:$src),
3328-
[(vector_insert undef, node:$src, (i64 0)),
3329-
(scalar_to_vector node:$src)]>;
3330-
3331-
// For regular load, we do not have any alignment requirement.
3332-
// Thus, it is safe to directly map the vector loads with interesting
3333-
// addressing modes.
3334-
// FIXME: We could do the same for bitconvert to floating point vectors.
3335-
multiclass ScalToVecROLoadPat<ROAddrMode ro, SDPatternOperator loadop,
3336-
ValueType ScalTy, ValueType VecTy,
3337-
Instruction LOADW, Instruction LOADX,
3338-
SubRegIndex sub> {
3339-
def : Pat<(VecTy (vec_ins_or_scal_vec (ScalTy
3340-
(loadop (ro.Wpat GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$offset))))),
3341-
(INSERT_SUBREG (VecTy (IMPLICIT_DEF)),
3342-
(LOADW GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$offset),
3343-
sub)>;
3344-
3345-
def : Pat<(VecTy (vec_ins_or_scal_vec (ScalTy
3346-
(loadop (ro.Xpat GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$offset))))),
3347-
(INSERT_SUBREG (VecTy (IMPLICIT_DEF)),
3348-
(LOADX GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$offset),
3349-
sub)>;
3350-
}
3351-
3352-
let AddedComplexity = 10 in {
3353-
defm : ScalToVecROLoadPat<ro8, extloadi8, i32, v8i8, LDRBroW, LDRBroX, bsub>;
3354-
defm : ScalToVecROLoadPat<ro8, extloadi8, i32, v16i8, LDRBroW, LDRBroX, bsub>;
3355-
3356-
defm : ScalToVecROLoadPat<ro16, extloadi16, i32, v4i16, LDRHroW, LDRHroX, hsub>;
3357-
defm : ScalToVecROLoadPat<ro16, extloadi16, i32, v8i16, LDRHroW, LDRHroX, hsub>;
3358-
3359-
defm : ScalToVecROLoadPat<ro16, load, i32, v4f16, LDRHroW, LDRHroX, hsub>;
3360-
defm : ScalToVecROLoadPat<ro16, load, i32, v8f16, LDRHroW, LDRHroX, hsub>;
3361-
3362-
defm : ScalToVecROLoadPat<ro32, load, i32, v2i32, LDRSroW, LDRSroX, ssub>;
3363-
defm : ScalToVecROLoadPat<ro32, load, i32, v4i32, LDRSroW, LDRSroX, ssub>;
3364-
3365-
defm : ScalToVecROLoadPat<ro32, load, f32, v2f32, LDRSroW, LDRSroX, ssub>;
3366-
defm : ScalToVecROLoadPat<ro32, load, f32, v4f32, LDRSroW, LDRSroX, ssub>;
3367-
3368-
defm : ScalToVecROLoadPat<ro64, load, i64, v2i64, LDRDroW, LDRDroX, dsub>;
3369-
3370-
defm : ScalToVecROLoadPat<ro64, load, f64, v2f64, LDRDroW, LDRDroX, dsub>;
3371-
3372-
3373-
def : Pat <(v1i64 (vec_ins_or_scal_vec (i64
3374-
(load (ro_Windexed64 GPR64sp:$Rn, GPR32:$Rm,
3375-
ro_Wextend64:$extend))))),
3376-
(LDRDroW GPR64sp:$Rn, GPR32:$Rm, ro_Wextend64:$extend)>;
3377-
3378-
def : Pat <(v1i64 (vec_ins_or_scal_vec (i64
3379-
(load (ro_Xindexed64 GPR64sp:$Rn, GPR64:$Rm,
3380-
ro_Xextend64:$extend))))),
3381-
(LDRDroX GPR64sp:$Rn, GPR64:$Rm, ro_Xextend64:$extend)>;
3382-
}
3383-
33843327
// Match all load 64 bits width whose type is compatible with FPR64
33853328
multiclass VecROLoadPat<ROAddrMode ro, ValueType VecTy,
33863329
Instruction LOADW, Instruction LOADX> {
@@ -3504,42 +3447,6 @@ defm LDRQ : LoadUI<0b00, 1, 0b11, FPR128Op, uimm12s16, "ldr",
35043447
def : Pat <(bf16 (load (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset))),
35053448
(LDRHui GPR64sp:$Rn, uimm12s2:$offset)>;
35063449

3507-
// For regular load, we do not have any alignment requirement.
3508-
// Thus, it is safe to directly map the vector loads with interesting
3509-
// addressing modes.
3510-
// FIXME: We could do the same for bitconvert to floating point vectors.
3511-
def : Pat <(v8i8 (vec_ins_or_scal_vec (i32
3512-
(extloadi8 (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset))))),
3513-
(INSERT_SUBREG (v8i8 (IMPLICIT_DEF)),
3514-
(LDRBui GPR64sp:$Rn, uimm12s1:$offset), bsub)>;
3515-
def : Pat <(v16i8 (vec_ins_or_scal_vec (i32
3516-
(extloadi8 (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset))))),
3517-
(INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
3518-
(LDRBui GPR64sp:$Rn, uimm12s1:$offset), bsub)>;
3519-
def : Pat <(v4i16 (vec_ins_or_scal_vec (i32
3520-
(extloadi16 (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset))))),
3521-
(INSERT_SUBREG (v4i16 (IMPLICIT_DEF)),
3522-
(LDRHui GPR64sp:$Rn, uimm12s2:$offset), hsub)>;
3523-
def : Pat <(v8i16 (vec_ins_or_scal_vec (i32
3524-
(extloadi16 (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset))))),
3525-
(INSERT_SUBREG (v8i16 (IMPLICIT_DEF)),
3526-
(LDRHui GPR64sp:$Rn, uimm12s2:$offset), hsub)>;
3527-
def : Pat <(v2i32 (vec_ins_or_scal_vec (i32
3528-
(load (am_indexed32 GPR64sp:$Rn, uimm12s4:$offset))))),
3529-
(INSERT_SUBREG (v2i32 (IMPLICIT_DEF)),
3530-
(LDRSui GPR64sp:$Rn, uimm12s4:$offset), ssub)>;
3531-
def : Pat <(v4i32 (vec_ins_or_scal_vec (i32
3532-
(load (am_indexed32 GPR64sp:$Rn, uimm12s4:$offset))))),
3533-
(INSERT_SUBREG (v4i32 (IMPLICIT_DEF)),
3534-
(LDRSui GPR64sp:$Rn, uimm12s4:$offset), ssub)>;
3535-
def : Pat <(v1i64 (vec_ins_or_scal_vec (i64
3536-
(load (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset))))),
3537-
(LDRDui GPR64sp:$Rn, uimm12s8:$offset)>;
3538-
def : Pat <(v2i64 (vec_ins_or_scal_vec (i64
3539-
(load (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset))))),
3540-
(INSERT_SUBREG (v2i64 (IMPLICIT_DEF)),
3541-
(LDRDui GPR64sp:$Rn, uimm12s8:$offset), dsub)>;
3542-
35433450
// Match all load 64 bits width whose type is compatible with FPR64
35443451
let Predicates = [IsLE] in {
35453452
// We must use LD1 to perform vector loads in big-endian.
@@ -3905,12 +3812,13 @@ def : InstAlias<"ldrsh $Rt, [$Rn, $offset]",
39053812
def : InstAlias<"ldrsw $Rt, [$Rn, $offset]",
39063813
(LDURSWi GPR64:$Rt, GPR64sp:$Rn, simm9_offset_fb32:$offset), 0>;
39073814

3908-
// A LDR will implicitly zero the rest of the vector, so vector_insert(zeros,
3909-
// load, 0) can use a single load.
3910-
multiclass LoadInsertZeroPatterns<SDPatternOperator LoadOp, ValueType VT, ValueType HVT, ValueType SVT,
3911-
ValueType ScalarVT, Instruction LoadInst, Instruction UnscaledLoadInst,
3912-
ComplexPattern Addr, ComplexPattern UnscaledAddr, Operand AddrImm,
3913-
SubRegIndex SubReg> {
3815+
// A LDR will implicitly zero the rest of the vector, so vector_insert(zeros, load, 0)
3816+
// can use a single load. Same for scalar_to_vector(load) or insert(undef, load, 0).
3817+
multiclass LoadInsertVTPatterns<SDPatternOperator LoadOp, ValueType VT, ValueType ScalarVT,
3818+
Instruction LoadInst, Instruction UnscaledLoadInst,
3819+
Instruction ROWLoadInst, Instruction ROXLoadInst,
3820+
ROAddrMode ro, ComplexPattern Addr, ComplexPattern UnscaledAddr,
3821+
Operand AddrImm, SubRegIndex SubReg> {
39143822
// Scaled
39153823
def : Pat <(vector_insert (VT immAllZerosV),
39163824
(ScalarVT (LoadOp (Addr GPR64sp:$Rn, AddrImm:$offset))), (i64 0)),
@@ -3919,42 +3827,82 @@ multiclass LoadInsertZeroPatterns<SDPatternOperator LoadOp, ValueType VT, ValueT
39193827
def : Pat <(vector_insert (VT immAllZerosV),
39203828
(ScalarVT (LoadOp (UnscaledAddr GPR64sp:$Rn, simm9:$offset))), (i64 0)),
39213829
(SUBREG_TO_REG (i64 0), (UnscaledLoadInst GPR64sp:$Rn, simm9:$offset), SubReg)>;
3830+
// roW
3831+
def : Pat <(vector_insert (VT immAllZerosV),
3832+
(ScalarVT (LoadOp (ro.Wpat GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$extend))), (i64 0)),
3833+
(SUBREG_TO_REG (i64 0), (ROWLoadInst GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$extend), SubReg)>;
3834+
// roX
3835+
def : Pat <(vector_insert (VT immAllZerosV),
3836+
(ScalarVT (LoadOp (ro.Xpat GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$extend))), (i64 0)),
3837+
(SUBREG_TO_REG (i64 0), (ROXLoadInst GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$extend), SubReg)>;
39223838

3923-
// Half-vector patterns
3924-
def : Pat <(vector_insert (HVT immAllZerosV),
3925-
(ScalarVT (LoadOp (Addr GPR64sp:$Rn, AddrImm:$offset))), (i64 0)),
3926-
(SUBREG_TO_REG (i64 0), (LoadInst GPR64sp:$Rn, AddrImm:$offset), SubReg)>;
3927-
// Unscaled
3928-
def : Pat <(vector_insert (HVT immAllZerosV),
3929-
(ScalarVT (LoadOp (UnscaledAddr GPR64sp:$Rn, simm9:$offset))), (i64 0)),
3930-
(SUBREG_TO_REG (i64 0), (UnscaledLoadInst GPR64sp:$Rn, simm9:$offset), SubReg)>;
3931-
3932-
// SVE patterns
3933-
def : Pat <(vector_insert (SVT immAllZerosV),
3934-
(ScalarVT (LoadOp (Addr GPR64sp:$Rn, AddrImm:$offset))), (i64 0)),
3935-
(SUBREG_TO_REG (i64 0), (LoadInst GPR64sp:$Rn, AddrImm:$offset), SubReg)>;
3936-
// Unscaled
3937-
def : Pat <(vector_insert (SVT immAllZerosV),
3938-
(ScalarVT (LoadOp (UnscaledAddr GPR64sp:$Rn, simm9:$offset))), (i64 0)),
3839+
// Undef equivalents of the patterns above.
3840+
def : Pat <(VT (vec_ins_or_scal_vec
3841+
(ScalarVT (LoadOp (Addr GPR64sp:$Rn, AddrImm:$offset))))),
3842+
(SUBREG_TO_REG (i64 0), (LoadInst GPR64sp:$Rn, AddrImm:$offset), SubReg)>;
3843+
def : Pat <(VT (vec_ins_or_scal_vec
3844+
(ScalarVT (LoadOp (UnscaledAddr GPR64sp:$Rn, simm9:$offset))))),
39393845
(SUBREG_TO_REG (i64 0), (UnscaledLoadInst GPR64sp:$Rn, simm9:$offset), SubReg)>;
3940-
}
3941-
3942-
defm : LoadInsertZeroPatterns<extloadi8, v16i8, v8i8, nxv16i8, i32, LDRBui, LDURBi,
3943-
am_indexed8, am_unscaled8, uimm12s1, bsub>;
3944-
defm : LoadInsertZeroPatterns<extloadi16, v8i16, v4i16, nxv8i16, i32, LDRHui, LDURHi,
3945-
am_indexed16, am_unscaled16, uimm12s2, hsub>;
3946-
defm : LoadInsertZeroPatterns<load, v4i32, v2i32, nxv4i32, i32, LDRSui, LDURSi,
3947-
am_indexed32, am_unscaled32, uimm12s4, ssub>;
3948-
defm : LoadInsertZeroPatterns<load, v2i64, v1i64, nxv2i64, i64, LDRDui, LDURDi,
3949-
am_indexed64, am_unscaled64, uimm12s8, dsub>;
3950-
defm : LoadInsertZeroPatterns<load, v8f16, v4f16, nxv8f16, f16, LDRHui, LDURHi,
3951-
am_indexed16, am_unscaled16, uimm12s2, hsub>;
3952-
defm : LoadInsertZeroPatterns<load, v8bf16, v4bf16, nxv8bf16, bf16, LDRHui, LDURHi,
3953-
am_indexed16, am_unscaled16, uimm12s2, hsub>;
3954-
defm : LoadInsertZeroPatterns<load, v4f32, v2f32, nxv4f32, f32, LDRSui, LDURSi,
3955-
am_indexed32, am_unscaled32, uimm12s4, ssub>;
3956-
defm : LoadInsertZeroPatterns<load, v2f64, v1f64, nxv2f64, f64, LDRDui, LDURDi,
3957-
am_indexed64, am_unscaled64, uimm12s8, dsub>;
3846+
def : Pat <(VT (vec_ins_or_scal_vec
3847+
(ScalarVT (LoadOp (ro.Wpat GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$extend))))),
3848+
(SUBREG_TO_REG (i64 0), (ROWLoadInst GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$extend), SubReg)>;
3849+
def : Pat <(VT (vec_ins_or_scal_vec
3850+
(ScalarVT (LoadOp (ro.Xpat GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$extend))))),
3851+
(SUBREG_TO_REG (i64 0), (ROXLoadInst GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$extend), SubReg)>;
3852+
}
3853+
3854+
multiclass LoadInsertPatterns<SDPatternOperator LoadOp, ValueType VT, ValueType HVT, ValueType SVT,
3855+
ValueType ScalarVT, Instruction LoadInst, Instruction UnscaledLoadInst,
3856+
Instruction ROWLoadInst, Instruction ROXLoadInst,
3857+
ROAddrMode ro, ComplexPattern Addr, ComplexPattern UnscaledAddr,
3858+
Operand AddrImm, SubRegIndex SubReg> {
3859+
defm : LoadInsertVTPatterns<LoadOp, VT, ScalarVT, LoadInst, UnscaledLoadInst, ROWLoadInst,
3860+
ROXLoadInst, ro, Addr, UnscaledAddr, AddrImm, SubReg>;
3861+
defm : LoadInsertVTPatterns<LoadOp, HVT, ScalarVT, LoadInst, UnscaledLoadInst, ROWLoadInst,
3862+
ROXLoadInst, ro, Addr, UnscaledAddr, AddrImm, SubReg>;
3863+
defm : LoadInsertVTPatterns<LoadOp, SVT, ScalarVT, LoadInst, UnscaledLoadInst, ROWLoadInst,
3864+
ROXLoadInst, ro, Addr, UnscaledAddr, AddrImm, SubReg>;
3865+
}
3866+
3867+
defm : LoadInsertPatterns<extloadi8, v16i8, v8i8, nxv16i8, i32,
3868+
LDRBui, LDURBi, LDRBroW, LDRBroX,
3869+
ro8, am_indexed8, am_unscaled8, uimm12s1, bsub>;
3870+
defm : LoadInsertPatterns<extloadi16, v8i16, v4i16, nxv8i16, i32,
3871+
LDRHui, LDURHi, LDRHroW, LDRHroX,
3872+
ro16, am_indexed16, am_unscaled16, uimm12s2, hsub>;
3873+
defm : LoadInsertPatterns<load, v4i32, v2i32, nxv4i32, i32,
3874+
LDRSui, LDURSi, LDRSroW, LDRSroX,
3875+
ro32, am_indexed32, am_unscaled32, uimm12s4, ssub>;
3876+
defm : LoadInsertPatterns<load, v2i64, isVoid, nxv2i64, i64,
3877+
LDRDui, LDURDi, LDRDroW, LDRDroX,
3878+
ro64, am_indexed64, am_unscaled64, uimm12s8, dsub>;
3879+
defm : LoadInsertPatterns<load, v8f16, v4f16, nxv8f16, f16,
3880+
LDRHui, LDURHi, LDRHroW, LDRHroX,
3881+
ro16, am_indexed16, am_unscaled16, uimm12s2, hsub>;
3882+
defm : LoadInsertPatterns<load, v8bf16, v4bf16, nxv8bf16, bf16,
3883+
LDRHui, LDURHi, LDRHroW, LDRHroX,
3884+
ro16, am_indexed16, am_unscaled16, uimm12s2, hsub>;
3885+
defm : LoadInsertPatterns<load, v4f32, v2f32, nxv4f32, f32,
3886+
LDRSui, LDURSi, LDRSroW, LDRSroX,
3887+
ro32, am_indexed32, am_unscaled32, uimm12s4, ssub>;
3888+
defm : LoadInsertPatterns<load, v2f64, isVoid, nxv2f64, f64,
3889+
LDRDui, LDURDi, LDRDroW, LDRDroX,
3890+
ro64, am_indexed64, am_unscaled64, uimm12s8, dsub>;
3891+
3892+
// Extra patterns for v1i64 scalar_to_vector(load), which need to avoid the
3893+
// SUBREG_TO_REG used above.
3894+
def : Pat <(v1i64 (scalar_to_vector (i64
3895+
(load (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset))))),
3896+
(LDRDui GPR64sp:$Rn, uimm12s8:$offset)>;
3897+
def : Pat <(v1i64 (scalar_to_vector (i64
3898+
(load (am_unscaled64 GPR64sp:$Rn, simm9:$offset))))),
3899+
(LDURDi GPR64sp:$Rn, simm9:$offset)>;
3900+
def : Pat <(v1i64 (scalar_to_vector (i64
3901+
(load (ro64.Wpat GPR64sp:$Rn, GPR32:$Rm, ro64.Wext:$extend))))),
3902+
(LDRDroW GPR64sp:$Rn, GPR32:$Rm, ro64.Wext:$extend)>;
3903+
def : Pat <(v1i64 (scalar_to_vector (i64
3904+
(load (ro64.Xpat GPR64sp:$Rn, GPR64:$Rm, ro64.Xext:$extend))))),
3905+
(LDRDroX GPR64sp:$Rn, GPR64:$Rm, ro64.Xext:$extend)>;
39583906

39593907
// Pre-fetch.
39603908
defm PRFUM : PrefetchUnscaled<0b11, 0, 0b10, "prfum",

llvm/test/CodeGen/AArch64/build-vector-two-dup.ll

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -78,9 +78,9 @@ entry:
7878
define <16 x i8> @test5(ptr nocapture noundef readonly %a, ptr nocapture noundef readonly %b) {
7979
; CHECK-LABEL: test5:
8080
; CHECK: // %bb.0: // %entry
81-
; CHECK-NEXT: ldr b0, [x0]
8281
; CHECK-NEXT: adrp x8, .LCPI4_0
8382
; CHECK-NEXT: ld1r { v1.16b }, [x1]
83+
; CHECK-NEXT: ldr b0, [x0]
8484
; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI4_0]
8585
; CHECK-NEXT: tbl v0.16b, { v0.16b, v1.16b }, v2.16b
8686
; CHECK-NEXT: ret

0 commit comments

Comments
 (0)