@@ -3324,63 +3324,6 @@ defm LDRSW : Load32RO<0b10, 0, 0b10, GPR64, "ldrsw", i64, sextloadi32>;
// Pre-fetch.
defm PRFM : PrefetchRO<0b11, 0, 0b10, "prfm">;

- def vec_ins_or_scal_vec : PatFrags<(ops node:$src),
-                                    [(vector_insert undef, node:$src, (i64 0)),
-                                     (scalar_to_vector node:$src)]>;
-
- // For regular load, we do not have any alignment requirement.
- // Thus, it is safe to directly map the vector loads with interesting
- // addressing modes.
- // FIXME: We could do the same for bitconvert to floating point vectors.
- multiclass ScalToVecROLoadPat<ROAddrMode ro, SDPatternOperator loadop,
-                               ValueType ScalTy, ValueType VecTy,
-                               Instruction LOADW, Instruction LOADX,
-                               SubRegIndex sub> {
-   def : Pat<(VecTy (vec_ins_or_scal_vec (ScalTy
-               (loadop (ro.Wpat GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$offset))))),
-             (INSERT_SUBREG (VecTy (IMPLICIT_DEF)),
-                            (LOADW GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$offset),
-                            sub)>;
-
-   def : Pat<(VecTy (vec_ins_or_scal_vec (ScalTy
-               (loadop (ro.Xpat GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$offset))))),
-             (INSERT_SUBREG (VecTy (IMPLICIT_DEF)),
-                            (LOADX GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$offset),
-                            sub)>;
- }
-
- let AddedComplexity = 10 in {
- defm : ScalToVecROLoadPat<ro8,  extloadi8,  i32, v8i8,  LDRBroW, LDRBroX, bsub>;
- defm : ScalToVecROLoadPat<ro8,  extloadi8,  i32, v16i8, LDRBroW, LDRBroX, bsub>;
-
- defm : ScalToVecROLoadPat<ro16, extloadi16, i32, v4i16, LDRHroW, LDRHroX, hsub>;
- defm : ScalToVecROLoadPat<ro16, extloadi16, i32, v8i16, LDRHroW, LDRHroX, hsub>;
-
- defm : ScalToVecROLoadPat<ro16, load,       i32, v4f16, LDRHroW, LDRHroX, hsub>;
- defm : ScalToVecROLoadPat<ro16, load,       i32, v8f16, LDRHroW, LDRHroX, hsub>;
-
- defm : ScalToVecROLoadPat<ro32, load,       i32, v2i32, LDRSroW, LDRSroX, ssub>;
- defm : ScalToVecROLoadPat<ro32, load,       i32, v4i32, LDRSroW, LDRSroX, ssub>;
-
- defm : ScalToVecROLoadPat<ro32, load,       f32, v2f32, LDRSroW, LDRSroX, ssub>;
- defm : ScalToVecROLoadPat<ro32, load,       f32, v4f32, LDRSroW, LDRSroX, ssub>;
-
- defm : ScalToVecROLoadPat<ro64, load,       i64, v2i64, LDRDroW, LDRDroX, dsub>;
-
- defm : ScalToVecROLoadPat<ro64, load,       f64, v2f64, LDRDroW, LDRDroX, dsub>;
-
-
- def : Pat <(v1i64 (vec_ins_or_scal_vec (i64
-                      (load (ro_Windexed64 GPR64sp:$Rn, GPR32:$Rm,
-                                           ro_Wextend64:$extend))))),
-            (LDRDroW GPR64sp:$Rn, GPR32:$Rm, ro_Wextend64:$extend)>;
-
- def : Pat <(v1i64 (vec_ins_or_scal_vec (i64
-                      (load (ro_Xindexed64 GPR64sp:$Rn, GPR64:$Rm,
-                                           ro_Xextend64:$extend))))),
-            (LDRDroX GPR64sp:$Rn, GPR64:$Rm, ro_Xextend64:$extend)>;
- }
-
// Match all load 64 bits width whose type is compatible with FPR64
multiclass VecROLoadPat<ROAddrMode ro, ValueType VecTy,
                        Instruction LOADW, Instruction LOADX> {
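The removed ScalToVecROLoadPat patterns above handle a scalar load whose address uses a register-offset ("interesting") addressing mode, folded directly into a lane-0 insert of a vector whose remaining lanes are don't-care. As a rough illustration of the kind of source that exercises them, here is a minimal C/NEON sketch; the function name and the exact instruction named in the comment are illustrative assumptions, not taken from the patch.

#include <arm_neon.h>

// Hypothetical example: load p[i] (a register-offset address) and place it in
// lane 0 of a vector whose other lanes we do not care about. With patterns
// like the ones above, the load plus the lane-0 insert can become a single
// scalar LDR into an FP/SIMD register, e.g. "ldr s0, [x0, x1, lsl #2]".
float32x4_t lane0_from_indexed_load(const float *p, uint64_t i) {
  float32x4_t v = vdupq_n_f32(0.0f);   // placeholder value for the other lanes
  return vsetq_lane_f32(p[i], v, 0);
}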
@@ -3504,42 +3447,6 @@ defm LDRQ : LoadUI<0b00, 1, 0b11, FPR128Op, uimm12s16, "ldr",
def : Pat <(bf16 (load (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset))),
           (LDRHui GPR64sp:$Rn, uimm12s2:$offset)>;

- // For regular load, we do not have any alignment requirement.
- // Thus, it is safe to directly map the vector loads with interesting
- // addressing modes.
- // FIXME: We could do the same for bitconvert to floating point vectors.
- def : Pat <(v8i8 (vec_ins_or_scal_vec (i32
-              (extloadi8 (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset))))),
-            (INSERT_SUBREG (v8i8 (IMPLICIT_DEF)),
-                           (LDRBui GPR64sp:$Rn, uimm12s1:$offset), bsub)>;
- def : Pat <(v16i8 (vec_ins_or_scal_vec (i32
-              (extloadi8 (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset))))),
-            (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
-                           (LDRBui GPR64sp:$Rn, uimm12s1:$offset), bsub)>;
- def : Pat <(v4i16 (vec_ins_or_scal_vec (i32
-              (extloadi16 (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset))))),
-            (INSERT_SUBREG (v4i16 (IMPLICIT_DEF)),
-                           (LDRHui GPR64sp:$Rn, uimm12s2:$offset), hsub)>;
- def : Pat <(v8i16 (vec_ins_or_scal_vec (i32
-              (extloadi16 (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset))))),
-            (INSERT_SUBREG (v8i16 (IMPLICIT_DEF)),
-                           (LDRHui GPR64sp:$Rn, uimm12s2:$offset), hsub)>;
- def : Pat <(v2i32 (vec_ins_or_scal_vec (i32
-              (load (am_indexed32 GPR64sp:$Rn, uimm12s4:$offset))))),
-            (INSERT_SUBREG (v2i32 (IMPLICIT_DEF)),
-                           (LDRSui GPR64sp:$Rn, uimm12s4:$offset), ssub)>;
- def : Pat <(v4i32 (vec_ins_or_scal_vec (i32
-              (load (am_indexed32 GPR64sp:$Rn, uimm12s4:$offset))))),
-            (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)),
-                           (LDRSui GPR64sp:$Rn, uimm12s4:$offset), ssub)>;
- def : Pat <(v1i64 (vec_ins_or_scal_vec (i64
-              (load (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset))))),
-            (LDRDui GPR64sp:$Rn, uimm12s8:$offset)>;
- def : Pat <(v2i64 (vec_ins_or_scal_vec (i64
-              (load (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset))))),
-            (INSERT_SUBREG (v2i64 (IMPLICIT_DEF)),
-                           (LDRDui GPR64sp:$Rn, uimm12s8:$offset), dsub)>;
-
// Match all load 64 bits width whose type is compatible with FPR64
let Predicates = [IsLE] in {
  // We must use LD1 to perform vector loads in big-endian.
@@ -3905,12 +3812,13 @@ def : InstAlias<"ldrsh $Rt, [$Rn, $offset]",
def : InstAlias<"ldrsw $Rt, [$Rn, $offset]",
                (LDURSWi GPR64:$Rt, GPR64sp:$Rn, simm9_offset_fb32:$offset), 0>;

- // A LDR will implicitly zero the rest of the vector, so vector_insert(zeros,
- // load, 0) can use a single load.
- multiclass LoadInsertZeroPatterns<SDPatternOperator LoadOp, ValueType VT, ValueType HVT, ValueType SVT,
-                                   ValueType ScalarVT, Instruction LoadInst, Instruction UnscaledLoadInst,
-                                   ComplexPattern Addr, ComplexPattern UnscaledAddr, Operand AddrImm,
-                                   SubRegIndex SubReg> {
+ // A LDR will implicitly zero the rest of the vector, so vector_insert(zeros, load, 0)
+ // can use a single load. Same for scalar_to_vector(load) or insert(undef, load, 0).
+ multiclass LoadInsertVTPatterns<SDPatternOperator LoadOp, ValueType VT, ValueType ScalarVT,
+                                 Instruction LoadInst, Instruction UnscaledLoadInst,
+                                 Instruction ROWLoadInst, Instruction ROXLoadInst,
+                                 ROAddrMode ro, ComplexPattern Addr, ComplexPattern UnscaledAddr,
+                                 Operand AddrImm, SubRegIndex SubReg> {
  // Scaled
  def : Pat <(vector_insert (VT immAllZerosV),
             (ScalarVT (LoadOp (Addr GPR64sp:$Rn, AddrImm:$offset))), (i64 0)),
@@ -3919,42 +3827,82 @@ multiclass LoadInsertZeroPatterns<SDPatternOperator LoadOp, ValueType VT, ValueT
  def : Pat <(vector_insert (VT immAllZerosV),
             (ScalarVT (LoadOp (UnscaledAddr GPR64sp:$Rn, simm9:$offset))), (i64 0)),
             (SUBREG_TO_REG (i64 0), (UnscaledLoadInst GPR64sp:$Rn, simm9:$offset), SubReg)>;
+   // roW
+   def : Pat <(vector_insert (VT immAllZerosV),
+              (ScalarVT (LoadOp (ro.Wpat GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$extend))), (i64 0)),
+              (SUBREG_TO_REG (i64 0), (ROWLoadInst GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$extend), SubReg)>;
+   // roX
+   def : Pat <(vector_insert (VT immAllZerosV),
+              (ScalarVT (LoadOp (ro.Xpat GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$extend))), (i64 0)),
+              (SUBREG_TO_REG (i64 0), (ROXLoadInst GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$extend), SubReg)>;

- // Half-vector patterns
- def : Pat <(vector_insert (HVT immAllZerosV),
-            (ScalarVT (LoadOp (Addr GPR64sp:$Rn, AddrImm:$offset))), (i64 0)),
-            (SUBREG_TO_REG (i64 0), (LoadInst GPR64sp:$Rn, AddrImm:$offset), SubReg)>;
- // Unscaled
- def : Pat <(vector_insert (HVT immAllZerosV),
-            (ScalarVT (LoadOp (UnscaledAddr GPR64sp:$Rn, simm9:$offset))), (i64 0)),
-            (SUBREG_TO_REG (i64 0), (UnscaledLoadInst GPR64sp:$Rn, simm9:$offset), SubReg)>;
-
- // SVE patterns
- def : Pat <(vector_insert (SVT immAllZerosV),
-            (ScalarVT (LoadOp (Addr GPR64sp:$Rn, AddrImm:$offset))), (i64 0)),
-            (SUBREG_TO_REG (i64 0), (LoadInst GPR64sp:$Rn, AddrImm:$offset), SubReg)>;
- // Unscaled
- def : Pat <(vector_insert (SVT immAllZerosV),
-            (ScalarVT (LoadOp (UnscaledAddr GPR64sp:$Rn, simm9:$offset))), (i64 0)),
+   // Undef equivalents of the patterns above.
+   def : Pat <(VT (vec_ins_or_scal_vec
+              (ScalarVT (LoadOp (Addr GPR64sp:$Rn, AddrImm:$offset))))),
+              (SUBREG_TO_REG (i64 0), (LoadInst GPR64sp:$Rn, AddrImm:$offset), SubReg)>;
+   def : Pat <(VT (vec_ins_or_scal_vec
+              (ScalarVT (LoadOp (UnscaledAddr GPR64sp:$Rn, simm9:$offset))))),
             (SUBREG_TO_REG (i64 0), (UnscaledLoadInst GPR64sp:$Rn, simm9:$offset), SubReg)>;
- }
-
- defm : LoadInsertZeroPatterns<extloadi8, v16i8, v8i8, nxv16i8, i32, LDRBui, LDURBi,
-                               am_indexed8, am_unscaled8, uimm12s1, bsub>;
- defm : LoadInsertZeroPatterns<extloadi16, v8i16, v4i16, nxv8i16, i32, LDRHui, LDURHi,
-                               am_indexed16, am_unscaled16, uimm12s2, hsub>;
- defm : LoadInsertZeroPatterns<load, v4i32, v2i32, nxv4i32, i32, LDRSui, LDURSi,
-                               am_indexed32, am_unscaled32, uimm12s4, ssub>;
- defm : LoadInsertZeroPatterns<load, v2i64, v1i64, nxv2i64, i64, LDRDui, LDURDi,
-                               am_indexed64, am_unscaled64, uimm12s8, dsub>;
- defm : LoadInsertZeroPatterns<load, v8f16, v4f16, nxv8f16, f16, LDRHui, LDURHi,
-                               am_indexed16, am_unscaled16, uimm12s2, hsub>;
- defm : LoadInsertZeroPatterns<load, v8bf16, v4bf16, nxv8bf16, bf16, LDRHui, LDURHi,
-                               am_indexed16, am_unscaled16, uimm12s2, hsub>;
- defm : LoadInsertZeroPatterns<load, v4f32, v2f32, nxv4f32, f32, LDRSui, LDURSi,
-                               am_indexed32, am_unscaled32, uimm12s4, ssub>;
- defm : LoadInsertZeroPatterns<load, v2f64, v1f64, nxv2f64, f64, LDRDui, LDURDi,
-                               am_indexed64, am_unscaled64, uimm12s8, dsub>;
+   def : Pat <(VT (vec_ins_or_scal_vec
+              (ScalarVT (LoadOp (ro.Wpat GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$extend))))),
+              (SUBREG_TO_REG (i64 0), (ROWLoadInst GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$extend), SubReg)>;
+   def : Pat <(VT (vec_ins_or_scal_vec
+              (ScalarVT (LoadOp (ro.Xpat GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$extend))))),
+              (SUBREG_TO_REG (i64 0), (ROXLoadInst GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$extend), SubReg)>;
+ }
+
+ multiclass LoadInsertPatterns<SDPatternOperator LoadOp, ValueType VT, ValueType HVT, ValueType SVT,
+                               ValueType ScalarVT, Instruction LoadInst, Instruction UnscaledLoadInst,
+                               Instruction ROWLoadInst, Instruction ROXLoadInst,
+                               ROAddrMode ro, ComplexPattern Addr, ComplexPattern UnscaledAddr,
+                               Operand AddrImm, SubRegIndex SubReg> {
+   defm : LoadInsertVTPatterns<LoadOp, VT, ScalarVT, LoadInst, UnscaledLoadInst, ROWLoadInst,
+                               ROXLoadInst, ro, Addr, UnscaledAddr, AddrImm, SubReg>;
+   defm : LoadInsertVTPatterns<LoadOp, HVT, ScalarVT, LoadInst, UnscaledLoadInst, ROWLoadInst,
+                               ROXLoadInst, ro, Addr, UnscaledAddr, AddrImm, SubReg>;
+   defm : LoadInsertVTPatterns<LoadOp, SVT, ScalarVT, LoadInst, UnscaledLoadInst, ROWLoadInst,
+                               ROXLoadInst, ro, Addr, UnscaledAddr, AddrImm, SubReg>;
+ }
+
+ defm : LoadInsertPatterns<extloadi8,  v16i8,  v8i8,   nxv16i8,  i32,
+                           LDRBui, LDURBi, LDRBroW, LDRBroX,
+                           ro8,  am_indexed8,  am_unscaled8,  uimm12s1, bsub>;
+ defm : LoadInsertPatterns<extloadi16, v8i16,  v4i16,  nxv8i16,  i32,
+                           LDRHui, LDURHi, LDRHroW, LDRHroX,
+                           ro16, am_indexed16, am_unscaled16, uimm12s2, hsub>;
+ defm : LoadInsertPatterns<load,       v4i32,  v2i32,  nxv4i32,  i32,
+                           LDRSui, LDURSi, LDRSroW, LDRSroX,
+                           ro32, am_indexed32, am_unscaled32, uimm12s4, ssub>;
+ defm : LoadInsertPatterns<load,       v2i64,  isVoid, nxv2i64,  i64,
+                           LDRDui, LDURDi, LDRDroW, LDRDroX,
+                           ro64, am_indexed64, am_unscaled64, uimm12s8, dsub>;
+ defm : LoadInsertPatterns<load,       v8f16,  v4f16,  nxv8f16,  f16,
+                           LDRHui, LDURHi, LDRHroW, LDRHroX,
+                           ro16, am_indexed16, am_unscaled16, uimm12s2, hsub>;
+ defm : LoadInsertPatterns<load,       v8bf16, v4bf16, nxv8bf16, bf16,
+                           LDRHui, LDURHi, LDRHroW, LDRHroX,
+                           ro16, am_indexed16, am_unscaled16, uimm12s2, hsub>;
+ defm : LoadInsertPatterns<load,       v4f32,  v2f32,  nxv4f32,  f32,
+                           LDRSui, LDURSi, LDRSroW, LDRSroX,
+                           ro32, am_indexed32, am_unscaled32, uimm12s4, ssub>;
+ defm : LoadInsertPatterns<load,       v2f64,  isVoid, nxv2f64,  f64,
+                           LDRDui, LDURDi, LDRDroW, LDRDroX,
+                           ro64, am_indexed64, am_unscaled64, uimm12s8, dsub>;
+
+ // Extra patterns for v1f64 scalar_to_vector(load), which need to avoid the
+ // SUBREG_TO_REG used above.
+ def : Pat <(v1i64 (scalar_to_vector (i64
+                      (load (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset))))),
+            (LDRDui GPR64sp:$Rn, uimm12s8:$offset)>;
+ def : Pat <(v1i64 (scalar_to_vector (i64
+                      (load (am_unscaled64 GPR64sp:$Rn, simm9:$offset))))),
+            (LDURDi GPR64sp:$Rn, simm9:$offset)>;
+ def : Pat <(v1i64 (scalar_to_vector (i64
+                      (load (ro64.Wpat GPR64sp:$Rn, GPR32:$Rm, ro64.Wext:$extend))))),
+            (LDRDroW GPR64sp:$Rn, GPR32:$Rm, ro64.Wext:$extend)>;
+ def : Pat <(v1i64 (scalar_to_vector (i64
+                      (load (ro64.Xpat GPR64sp:$Rn, GPR64:$Rm, ro64.Xext:$extend))))),
+            (LDRDroX GPR64sp:$Rn, GPR64:$Rm, ro64.Xext:$extend)>;

// Pre-fetch.
defm PRFUM : PrefetchUnscaled<0b11, 0, 0b10, "prfum",
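As a rough usage sketch of what the zeroing patterns above rely on: an AArch64 scalar FP/SIMD LDR writes zeros to the unused upper bits of the destination vector register, so inserting a loaded scalar into lane 0 of an all-zero vector needs no separate zeroing or insert instruction. The C/NEON example below illustrates this; the function name and the exact instructions mentioned in the comments are assumptions for illustration, not taken from the patch.

#include <arm_neon.h>

// Hypothetical example: insert a loaded scalar into lane 0 of a zero vector.
// Because a scalar LDR to an S register already zeroes the rest of the Q
// register, this whole function can in principle lower to a single load,
// e.g. "ldr s0, [x0]", rather than a load plus a zeroing MOVI and an INS.
uint32x4_t zero_insert_load(const uint32_t *p) {
  uint32x4_t zeros = vdupq_n_u32(0);     // corresponds to (VT immAllZerosV) in the patterns
  return vsetq_lane_u32(*p, zeros, 0);   // vector_insert(zeros, load, 0)
}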