Commit 558d046

davemgreen authored and tmsri committed
[AArch64] Extend and rewrite load zero and load undef patterns (llvm#108185)
The ldr instructions implicitly zero any upper lanes, so we can use them for insert(zerovec, load, 0) patterns. Likewise insert(undef, load, 0) or scalar_to_vector(load) can reuse the scalar loads, as the top bits are undef. This patch makes sure there are patterns for each type and for each of the normal, unaligned, roW and roX addressing modes.
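For illustration (a hypothetical example, not taken from this patch's tests), both functions below should now select to a single "ldr s0, [x0]": the first because an s-register LDR implicitly zeroes lanes 1-3 of the 128-bit destination, the second because the remaining lanes are undef anyway.

define <4 x i32> @load_insert_zero(ptr %p) {
  ; expected codegen: a single ldr s0, [x0] (upper lanes zeroed by the load)
  %l = load i32, ptr %p, align 4
  %v = insertelement <4 x i32> zeroinitializer, i32 %l, i64 0
  ret <4 x i32> %v
}

define <4 x i32> @load_insert_undef(ptr %p) {
  ; expected codegen: also a single ldr s0, [x0] (upper lanes are undef)
  %l = load i32, ptr %p, align 4
  %v = insertelement <4 x i32> undef, i32 %l, i64 0
  ret <4 x i32> %v
}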
1 parent f5fa16c commit 558d046

11 files changed: +838 -913 lines changed

llvm/lib/Target/AArch64/AArch64InstrFormats.td

Lines changed: 5 additions & 0 deletions
@@ -177,6 +177,11 @@ def dup_v4f32 :
      [(v2f32 (extract_subvector (v4f32 (AArch64duplane32 (v4f32 node:$LHS), node:$RHS)), (i64 0))),
       (v2f32 (AArch64duplane32 (v4f32 node:$LHS), node:$RHS))]>;
 
+// Match either a scalar_to_vector (from SDAG) or a vector_insert of undef (from GISel)
+def vec_ins_or_scal_vec : PatFrags<(ops node:$src),
+                                   [(vector_insert undef, node:$src, (i64 0)),
+                                    (scalar_to_vector node:$src)]>;
+
 //===----------------------------------------------------------------------===//
 // Asm Operand Classes.
 //

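A note on how the PatFrags above behaves: TableGen tries each alternative in the list as an independent pattern, so every Pat written against vec_ins_or_scal_vec covers both the SDAG and the GlobalISel form of the operation. A sketch (hypothetical, not part of the patch):

// Hypothetical illustration: a pattern written once as
//   (VT (vec_ins_or_scal_vec (ScalarVT (LoadOp addr))))
// is matched as if both alternatives had been spelled out:
//   (VT (vector_insert undef, (ScalarVT (LoadOp addr)), (i64 0)))   // GISel form
//   (VT (scalar_to_vector (ScalarVT (LoadOp addr))))                // SDAG form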
llvm/lib/Target/AArch64/AArch64InstrInfo.td

Lines changed: 81 additions & 133 deletions
@@ -3321,63 +3321,6 @@ defm LDRSW  : Load32RO<0b10, 0, 0b10, GPR64, "ldrsw", i64, sextloadi32>;
 // Pre-fetch.
 defm PRFM : PrefetchRO<0b11, 0, 0b10, "prfm">;
 
-def vec_ins_or_scal_vec : PatFrags<(ops node:$src),
-                                   [(vector_insert undef, node:$src, (i64 0)),
-                                    (scalar_to_vector node:$src)]>;
-
-// For regular load, we do not have any alignment requirement.
-// Thus, it is safe to directly map the vector loads with interesting
-// addressing modes.
-// FIXME: We could do the same for bitconvert to floating point vectors.
-multiclass ScalToVecROLoadPat<ROAddrMode ro, SDPatternOperator loadop,
-                              ValueType ScalTy, ValueType VecTy,
-                              Instruction LOADW, Instruction LOADX,
-                              SubRegIndex sub> {
-  def : Pat<(VecTy (vec_ins_or_scal_vec (ScalTy
-                (loadop (ro.Wpat GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$offset))))),
-            (INSERT_SUBREG (VecTy (IMPLICIT_DEF)),
-                           (LOADW GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$offset),
-                           sub)>;
-
-  def : Pat<(VecTy (vec_ins_or_scal_vec (ScalTy
-                (loadop (ro.Xpat GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$offset))))),
-            (INSERT_SUBREG (VecTy (IMPLICIT_DEF)),
-                           (LOADX GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$offset),
-                           sub)>;
-}
-
-let AddedComplexity = 10 in {
-defm : ScalToVecROLoadPat<ro8,  extloadi8,  i32, v8i8,  LDRBroW, LDRBroX, bsub>;
-defm : ScalToVecROLoadPat<ro8,  extloadi8,  i32, v16i8, LDRBroW, LDRBroX, bsub>;
-
-defm : ScalToVecROLoadPat<ro16, extloadi16, i32, v4i16, LDRHroW, LDRHroX, hsub>;
-defm : ScalToVecROLoadPat<ro16, extloadi16, i32, v8i16, LDRHroW, LDRHroX, hsub>;
-
-defm : ScalToVecROLoadPat<ro16, load,       i32, v4f16, LDRHroW, LDRHroX, hsub>;
-defm : ScalToVecROLoadPat<ro16, load,       i32, v8f16, LDRHroW, LDRHroX, hsub>;
-
-defm : ScalToVecROLoadPat<ro32, load,       i32, v2i32, LDRSroW, LDRSroX, ssub>;
-defm : ScalToVecROLoadPat<ro32, load,       i32, v4i32, LDRSroW, LDRSroX, ssub>;
-
-defm : ScalToVecROLoadPat<ro32, load,       f32, v2f32, LDRSroW, LDRSroX, ssub>;
-defm : ScalToVecROLoadPat<ro32, load,       f32, v4f32, LDRSroW, LDRSroX, ssub>;
-
-defm : ScalToVecROLoadPat<ro64, load,       i64, v2i64, LDRDroW, LDRDroX, dsub>;
-
-defm : ScalToVecROLoadPat<ro64, load,       f64, v2f64, LDRDroW, LDRDroX, dsub>;
-
-
-def : Pat <(v1i64 (vec_ins_or_scal_vec (i64
-                      (load (ro_Windexed64 GPR64sp:$Rn, GPR32:$Rm,
-                                           ro_Wextend64:$extend))))),
-           (LDRDroW GPR64sp:$Rn, GPR32:$Rm, ro_Wextend64:$extend)>;
-
-def : Pat <(v1i64 (vec_ins_or_scal_vec (i64
-                      (load (ro_Xindexed64 GPR64sp:$Rn, GPR64:$Rm,
-                                           ro_Xextend64:$extend))))),
-           (LDRDroX GPR64sp:$Rn, GPR64:$Rm, ro_Xextend64:$extend)>;
-}
-
 // Match all load 64 bits width whose type is compatible with FPR64
 multiclass VecROLoadPat<ROAddrMode ro, ValueType VecTy,
                         Instruction LOADW, Instruction LOADX> {
@@ -3501,42 +3444,6 @@ defm LDRQ : LoadUI<0b00, 1, 0b11, FPR128Op, uimm12s16, "ldr",
 def : Pat <(bf16 (load (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset))),
            (LDRHui GPR64sp:$Rn, uimm12s2:$offset)>;
 
-// For regular load, we do not have any alignment requirement.
-// Thus, it is safe to directly map the vector loads with interesting
-// addressing modes.
-// FIXME: We could do the same for bitconvert to floating point vectors.
-def : Pat <(v8i8 (vec_ins_or_scal_vec (i32
-               (extloadi8 (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset))))),
-           (INSERT_SUBREG (v8i8 (IMPLICIT_DEF)),
-                          (LDRBui GPR64sp:$Rn, uimm12s1:$offset), bsub)>;
-def : Pat <(v16i8 (vec_ins_or_scal_vec (i32
-               (extloadi8 (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset))))),
-           (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
-                          (LDRBui GPR64sp:$Rn, uimm12s1:$offset), bsub)>;
-def : Pat <(v4i16 (vec_ins_or_scal_vec (i32
-               (extloadi16 (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset))))),
-           (INSERT_SUBREG (v4i16 (IMPLICIT_DEF)),
-                          (LDRHui GPR64sp:$Rn, uimm12s2:$offset), hsub)>;
-def : Pat <(v8i16 (vec_ins_or_scal_vec (i32
-               (extloadi16 (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset))))),
-           (INSERT_SUBREG (v8i16 (IMPLICIT_DEF)),
-                          (LDRHui GPR64sp:$Rn, uimm12s2:$offset), hsub)>;
-def : Pat <(v2i32 (vec_ins_or_scal_vec (i32
-               (load (am_indexed32 GPR64sp:$Rn, uimm12s4:$offset))))),
-           (INSERT_SUBREG (v2i32 (IMPLICIT_DEF)),
-                          (LDRSui GPR64sp:$Rn, uimm12s4:$offset), ssub)>;
-def : Pat <(v4i32 (vec_ins_or_scal_vec (i32
-               (load (am_indexed32 GPR64sp:$Rn, uimm12s4:$offset))))),
-           (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)),
-                          (LDRSui GPR64sp:$Rn, uimm12s4:$offset), ssub)>;
-def : Pat <(v1i64 (vec_ins_or_scal_vec (i64
-               (load (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset))))),
-           (LDRDui GPR64sp:$Rn, uimm12s8:$offset)>;
-def : Pat <(v2i64 (vec_ins_or_scal_vec (i64
-               (load (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset))))),
-           (INSERT_SUBREG (v2i64 (IMPLICIT_DEF)),
-                          (LDRDui GPR64sp:$Rn, uimm12s8:$offset), dsub)>;
-
 // Match all load 64 bits width whose type is compatible with FPR64
 let Predicates = [IsLE] in {
   // We must use LD1 to perform vector loads in big-endian.
@@ -3902,12 +3809,13 @@ def : InstAlias<"ldrsh $Rt, [$Rn, $offset]",
 def : InstAlias<"ldrsw $Rt, [$Rn, $offset]",
                 (LDURSWi GPR64:$Rt, GPR64sp:$Rn, simm9_offset_fb32:$offset), 0>;
 
-// A LDR will implicitly zero the rest of the vector, so vector_insert(zeros,
-// load, 0) can use a single load.
-multiclass LoadInsertZeroPatterns<SDPatternOperator LoadOp, ValueType VT, ValueType HVT, ValueType SVT,
-                                  ValueType ScalarVT, Instruction LoadInst, Instruction UnscaledLoadInst,
-                                  ComplexPattern Addr, ComplexPattern UnscaledAddr, Operand AddrImm,
-                                  SubRegIndex SubReg> {
+// A LDR will implicitly zero the rest of the vector, so vector_insert(zeros, load, 0)
+// can use a single load. Same for scalar_to_vector(load) or insert(undef, load, 0).
+multiclass LoadInsertVTPatterns<SDPatternOperator LoadOp, ValueType VT, ValueType ScalarVT,
+                                Instruction LoadInst, Instruction UnscaledLoadInst,
+                                Instruction ROWLoadInst, Instruction ROXLoadInst,
+                                ROAddrMode ro, ComplexPattern Addr, ComplexPattern UnscaledAddr,
+                                Operand AddrImm, SubRegIndex SubReg> {
   // Scaled
   def : Pat <(vector_insert (VT immAllZerosV),
                             (ScalarVT (LoadOp (Addr GPR64sp:$Rn, AddrImm:$offset))), (i64 0)),
@@ -3916,42 +3824,82 @@ multiclass LoadInsertZeroPatterns<SDPatternOperator LoadOp, ValueType VT, ValueT
   def : Pat <(vector_insert (VT immAllZerosV),
                             (ScalarVT (LoadOp (UnscaledAddr GPR64sp:$Rn, simm9:$offset))), (i64 0)),
              (SUBREG_TO_REG (i64 0), (UnscaledLoadInst GPR64sp:$Rn, simm9:$offset), SubReg)>;
+  // roW
+  def : Pat <(vector_insert (VT immAllZerosV),
+                            (ScalarVT (LoadOp (ro.Wpat GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$extend))), (i64 0)),
+             (SUBREG_TO_REG (i64 0), (ROWLoadInst GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$extend), SubReg)>;
+  // roX
+  def : Pat <(vector_insert (VT immAllZerosV),
+                            (ScalarVT (LoadOp (ro.Xpat GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$extend))), (i64 0)),
+             (SUBREG_TO_REG (i64 0), (ROXLoadInst GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$extend), SubReg)>;
 
-  // Half-vector patterns
-  def : Pat <(vector_insert (HVT immAllZerosV),
-                            (ScalarVT (LoadOp (Addr GPR64sp:$Rn, AddrImm:$offset))), (i64 0)),
-             (SUBREG_TO_REG (i64 0), (LoadInst GPR64sp:$Rn, AddrImm:$offset), SubReg)>;
-  // Unscaled
-  def : Pat <(vector_insert (HVT immAllZerosV),
-                            (ScalarVT (LoadOp (UnscaledAddr GPR64sp:$Rn, simm9:$offset))), (i64 0)),
-             (SUBREG_TO_REG (i64 0), (UnscaledLoadInst GPR64sp:$Rn, simm9:$offset), SubReg)>;
-
-  // SVE patterns
-  def : Pat <(vector_insert (SVT immAllZerosV),
-                            (ScalarVT (LoadOp (Addr GPR64sp:$Rn, AddrImm:$offset))), (i64 0)),
-             (SUBREG_TO_REG (i64 0), (LoadInst GPR64sp:$Rn, AddrImm:$offset), SubReg)>;
-  // Unscaled
-  def : Pat <(vector_insert (SVT immAllZerosV),
-                            (ScalarVT (LoadOp (UnscaledAddr GPR64sp:$Rn, simm9:$offset))), (i64 0)),
+  // Undef equivalents of the patterns above.
+  def : Pat <(VT (vec_ins_or_scal_vec
+                     (ScalarVT (LoadOp (Addr GPR64sp:$Rn, AddrImm:$offset))))),
+             (SUBREG_TO_REG (i64 0), (LoadInst GPR64sp:$Rn, AddrImm:$offset), SubReg)>;
+  def : Pat <(VT (vec_ins_or_scal_vec
+                     (ScalarVT (LoadOp (UnscaledAddr GPR64sp:$Rn, simm9:$offset))))),
              (SUBREG_TO_REG (i64 0), (UnscaledLoadInst GPR64sp:$Rn, simm9:$offset), SubReg)>;
-}
-
-defm : LoadInsertZeroPatterns<extloadi8,  v16i8,  v8i8,   nxv16i8,  i32, LDRBui, LDURBi,
-                              am_indexed8, am_unscaled8, uimm12s1, bsub>;
-defm : LoadInsertZeroPatterns<extloadi16, v8i16,  v4i16,  nxv8i16,  i32, LDRHui, LDURHi,
-                              am_indexed16, am_unscaled16, uimm12s2, hsub>;
-defm : LoadInsertZeroPatterns<load,       v4i32,  v2i32,  nxv4i32,  i32, LDRSui, LDURSi,
-                              am_indexed32, am_unscaled32, uimm12s4, ssub>;
-defm : LoadInsertZeroPatterns<load,       v2i64,  v1i64,  nxv2i64,  i64, LDRDui, LDURDi,
-                              am_indexed64, am_unscaled64, uimm12s8, dsub>;
-defm : LoadInsertZeroPatterns<load,       v8f16,  v4f16,  nxv8f16,  f16, LDRHui, LDURHi,
-                              am_indexed16, am_unscaled16, uimm12s2, hsub>;
-defm : LoadInsertZeroPatterns<load,       v8bf16, v4bf16, nxv8bf16, bf16, LDRHui, LDURHi,
-                              am_indexed16, am_unscaled16, uimm12s2, hsub>;
-defm : LoadInsertZeroPatterns<load,       v4f32,  v2f32,  nxv4f32,  f32, LDRSui, LDURSi,
-                              am_indexed32, am_unscaled32, uimm12s4, ssub>;
-defm : LoadInsertZeroPatterns<load,       v2f64,  v1f64,  nxv2f64,  f64, LDRDui, LDURDi,
-                              am_indexed64, am_unscaled64, uimm12s8, dsub>;
+  def : Pat <(VT (vec_ins_or_scal_vec
+                     (ScalarVT (LoadOp (ro.Wpat GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$extend))))),
+             (SUBREG_TO_REG (i64 0), (ROWLoadInst GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$extend), SubReg)>;
+  def : Pat <(VT (vec_ins_or_scal_vec
+                     (ScalarVT (LoadOp (ro.Xpat GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$extend))))),
+             (SUBREG_TO_REG (i64 0), (ROXLoadInst GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$extend), SubReg)>;
+}
+
+multiclass LoadInsertPatterns<SDPatternOperator LoadOp, ValueType VT, ValueType HVT, ValueType SVT,
+                              ValueType ScalarVT, Instruction LoadInst, Instruction UnscaledLoadInst,
+                              Instruction ROWLoadInst, Instruction ROXLoadInst,
+                              ROAddrMode ro, ComplexPattern Addr, ComplexPattern UnscaledAddr,
+                              Operand AddrImm, SubRegIndex SubReg> {
+  defm : LoadInsertVTPatterns<LoadOp, VT, ScalarVT, LoadInst, UnscaledLoadInst, ROWLoadInst,
+                              ROXLoadInst, ro, Addr, UnscaledAddr, AddrImm, SubReg>;
+  defm : LoadInsertVTPatterns<LoadOp, HVT, ScalarVT, LoadInst, UnscaledLoadInst, ROWLoadInst,
+                              ROXLoadInst, ro, Addr, UnscaledAddr, AddrImm, SubReg>;
+  defm : LoadInsertVTPatterns<LoadOp, SVT, ScalarVT, LoadInst, UnscaledLoadInst, ROWLoadInst,
+                              ROXLoadInst, ro, Addr, UnscaledAddr, AddrImm, SubReg>;
+}
+
+defm : LoadInsertPatterns<extloadi8,  v16i8,  v8i8,   nxv16i8,  i32,
+                          LDRBui, LDURBi, LDRBroW, LDRBroX,
+                          ro8,  am_indexed8,  am_unscaled8,  uimm12s1, bsub>;
+defm : LoadInsertPatterns<extloadi16, v8i16,  v4i16,  nxv8i16,  i32,
+                          LDRHui, LDURHi, LDRHroW, LDRHroX,
+                          ro16, am_indexed16, am_unscaled16, uimm12s2, hsub>;
+defm : LoadInsertPatterns<load,       v4i32,  v2i32,  nxv4i32,  i32,
+                          LDRSui, LDURSi, LDRSroW, LDRSroX,
+                          ro32, am_indexed32, am_unscaled32, uimm12s4, ssub>;
+defm : LoadInsertPatterns<load,       v2i64,  isVoid, nxv2i64,  i64,
+                          LDRDui, LDURDi, LDRDroW, LDRDroX,
+                          ro64, am_indexed64, am_unscaled64, uimm12s8, dsub>;
+defm : LoadInsertPatterns<load,       v8f16,  v4f16,  nxv8f16,  f16,
+                          LDRHui, LDURHi, LDRHroW, LDRHroX,
+                          ro16, am_indexed16, am_unscaled16, uimm12s2, hsub>;
+defm : LoadInsertPatterns<load,       v8bf16, v4bf16, nxv8bf16, bf16,
+                          LDRHui, LDURHi, LDRHroW, LDRHroX,
+                          ro16, am_indexed16, am_unscaled16, uimm12s2, hsub>;
+defm : LoadInsertPatterns<load,       v4f32,  v2f32,  nxv4f32,  f32,
+                          LDRSui, LDURSi, LDRSroW, LDRSroX,
+                          ro32, am_indexed32, am_unscaled32, uimm12s4, ssub>;
+defm : LoadInsertPatterns<load,       v2f64,  isVoid, nxv2f64,  f64,
+                          LDRDui, LDURDi, LDRDroW, LDRDroX,
+                          ro64, am_indexed64, am_unscaled64, uimm12s8, dsub>;
+
+// Extra patterns for v1f64 scalar_to_vector(load), which need to avoid the
+// SUBREG_TO_REG used above.
+def : Pat <(v1i64 (scalar_to_vector (i64
+                      (load (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset))))),
+           (LDRDui GPR64sp:$Rn, uimm12s8:$offset)>;
+def : Pat <(v1i64 (scalar_to_vector (i64
+                      (load (am_unscaled64 GPR64sp:$Rn, simm9:$offset))))),
+           (LDURDi GPR64sp:$Rn, simm9:$offset)>;
+def : Pat <(v1i64 (scalar_to_vector (i64
+                      (load (ro64.Wpat GPR64sp:$Rn, GPR32:$Rm, ro64.Wext:$extend))))),
+           (LDRDroW GPR64sp:$Rn, GPR32:$Rm, ro64.Wext:$extend)>;
+def : Pat <(v1i64 (scalar_to_vector (i64
+                      (load (ro64.Xpat GPR64sp:$Rn, GPR64:$Rm, ro64.Xext:$extend))))),
+           (LDRDroX GPR64sp:$Rn, GPR64:$Rm, ro64.Xext:$extend)>;
 
 // Pre-fetch.
 defm PRFUM : PrefetchUnscaled<0b11, 0, 0b10, "prfum",

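One detail worth noting from the hunk above: the trailing v1i64 patterns exist because, for <1 x i64>, the 64-bit D register already is the whole vector, so the loaded value is used directly rather than being wrapped in SUBREG_TO_REG. A minimal IR sketch (hypothetical, not from the patch's tests) that should select to a bare "ldr d0, [x0]":

define <1 x i64> @load_v1i64(ptr %p) {
  ; the single lane is the entire vector, so there are no upper lanes to zero
  %l = load i64, ptr %p, align 8
  %v = insertelement <1 x i64> undef, i64 %l, i64 0
  ret <1 x i64> %v
}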
llvm/test/CodeGen/AArch64/build-vector-two-dup.ll

Lines changed: 1 addition & 1 deletion
@@ -78,9 +78,9 @@ entry:
 define <16 x i8> @test5(ptr nocapture noundef readonly %a, ptr nocapture noundef readonly %b) {
 ; CHECK-LABEL: test5:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    ldr b0, [x0]
 ; CHECK-NEXT:    adrp x8, .LCPI4_0
 ; CHECK-NEXT:    ld1r { v1.16b }, [x1]
+; CHECK-NEXT:    ldr b0, [x0]
 ; CHECK-NEXT:    ldr q2, [x8, :lo12:.LCPI4_0]
 ; CHECK-NEXT:    tbl v0.16b, { v0.16b, v1.16b }, v2.16b
 ; CHECK-NEXT:    ret
