@@ -3321,63 +3321,6 @@ defm LDRSW : Load32RO<0b10, 0, 0b10, GPR64, "ldrsw", i64, sextloadi32>;
// Pre-fetch.
defm PRFM : PrefetchRO<0b11, 0, 0b10, "prfm">;
- def vec_ins_or_scal_vec : PatFrags<(ops node:$src),
-                                    [(vector_insert undef, node:$src, (i64 0)),
-                                     (scalar_to_vector node:$src)]>;
-
- // For regular load, we do not have any alignment requirement.
- // Thus, it is safe to directly map the vector loads with interesting
- // addressing modes.
- // FIXME: We could do the same for bitconvert to floating point vectors.
- multiclass ScalToVecROLoadPat<ROAddrMode ro, SDPatternOperator loadop,
-                               ValueType ScalTy, ValueType VecTy,
-                               Instruction LOADW, Instruction LOADX,
-                               SubRegIndex sub> {
-   def : Pat<(VecTy (vec_ins_or_scal_vec (ScalTy
-               (loadop (ro.Wpat GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$offset))))),
-             (INSERT_SUBREG (VecTy (IMPLICIT_DEF)),
-                            (LOADW GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$offset),
-                            sub)>;
-
-   def : Pat<(VecTy (vec_ins_or_scal_vec (ScalTy
-               (loadop (ro.Xpat GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$offset))))),
-             (INSERT_SUBREG (VecTy (IMPLICIT_DEF)),
-                            (LOADX GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$offset),
-                            sub)>;
- }
-
- let AddedComplexity = 10 in {
- defm : ScalToVecROLoadPat<ro8, extloadi8, i32, v8i8, LDRBroW, LDRBroX, bsub>;
- defm : ScalToVecROLoadPat<ro8, extloadi8, i32, v16i8, LDRBroW, LDRBroX, bsub>;
-
- defm : ScalToVecROLoadPat<ro16, extloadi16, i32, v4i16, LDRHroW, LDRHroX, hsub>;
- defm : ScalToVecROLoadPat<ro16, extloadi16, i32, v8i16, LDRHroW, LDRHroX, hsub>;
-
- defm : ScalToVecROLoadPat<ro16, load, i32, v4f16, LDRHroW, LDRHroX, hsub>;
- defm : ScalToVecROLoadPat<ro16, load, i32, v8f16, LDRHroW, LDRHroX, hsub>;
-
- defm : ScalToVecROLoadPat<ro32, load, i32, v2i32, LDRSroW, LDRSroX, ssub>;
- defm : ScalToVecROLoadPat<ro32, load, i32, v4i32, LDRSroW, LDRSroX, ssub>;
-
- defm : ScalToVecROLoadPat<ro32, load, f32, v2f32, LDRSroW, LDRSroX, ssub>;
- defm : ScalToVecROLoadPat<ro32, load, f32, v4f32, LDRSroW, LDRSroX, ssub>;
-
- defm : ScalToVecROLoadPat<ro64, load, i64, v2i64, LDRDroW, LDRDroX, dsub>;
-
- defm : ScalToVecROLoadPat<ro64, load, f64, v2f64, LDRDroW, LDRDroX, dsub>;
-
-
- def : Pat <(v1i64 (vec_ins_or_scal_vec (i64
-              (load (ro_Windexed64 GPR64sp:$Rn, GPR32:$Rm,
-                                   ro_Wextend64:$extend))))),
-            (LDRDroW GPR64sp:$Rn, GPR32:$Rm, ro_Wextend64:$extend)>;
-
- def : Pat <(v1i64 (vec_ins_or_scal_vec (i64
-              (load (ro_Xindexed64 GPR64sp:$Rn, GPR64:$Rm,
-                                   ro_Xextend64:$extend))))),
-            (LDRDroX GPR64sp:$Rn, GPR64:$Rm, ro_Xextend64:$extend)>;
- }
-
// Match all load 64 bits width whose type is compatible with FPR64
multiclass VecROLoadPat<ROAddrMode ro, ValueType VecTy,
Instruction LOADW, Instruction LOADX> {
@@ -3501,42 +3444,6 @@ defm LDRQ : LoadUI<0b00, 1, 0b11, FPR128Op, uimm12s16, "ldr",
def : Pat <(bf16 (load (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset))),
           (LDRHui GPR64sp:$Rn, uimm12s2:$offset)>;
- // For regular load, we do not have any alignment requirement.
- // Thus, it is safe to directly map the vector loads with interesting
- // addressing modes.
- // FIXME: We could do the same for bitconvert to floating point vectors.
- def : Pat <(v8i8 (vec_ins_or_scal_vec (i32
-              (extloadi8 (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset))))),
-            (INSERT_SUBREG (v8i8 (IMPLICIT_DEF)),
-                           (LDRBui GPR64sp:$Rn, uimm12s1:$offset), bsub)>;
- def : Pat <(v16i8 (vec_ins_or_scal_vec (i32
-              (extloadi8 (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset))))),
-            (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
-                           (LDRBui GPR64sp:$Rn, uimm12s1:$offset), bsub)>;
- def : Pat <(v4i16 (vec_ins_or_scal_vec (i32
-              (extloadi16 (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset))))),
-            (INSERT_SUBREG (v4i16 (IMPLICIT_DEF)),
-                           (LDRHui GPR64sp:$Rn, uimm12s2:$offset), hsub)>;
- def : Pat <(v8i16 (vec_ins_or_scal_vec (i32
-              (extloadi16 (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset))))),
-            (INSERT_SUBREG (v8i16 (IMPLICIT_DEF)),
-                           (LDRHui GPR64sp:$Rn, uimm12s2:$offset), hsub)>;
- def : Pat <(v2i32 (vec_ins_or_scal_vec (i32
-              (load (am_indexed32 GPR64sp:$Rn, uimm12s4:$offset))))),
-            (INSERT_SUBREG (v2i32 (IMPLICIT_DEF)),
-                           (LDRSui GPR64sp:$Rn, uimm12s4:$offset), ssub)>;
- def : Pat <(v4i32 (vec_ins_or_scal_vec (i32
-              (load (am_indexed32 GPR64sp:$Rn, uimm12s4:$offset))))),
-            (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)),
-                           (LDRSui GPR64sp:$Rn, uimm12s4:$offset), ssub)>;
- def : Pat <(v1i64 (vec_ins_or_scal_vec (i64
-              (load (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset))))),
-            (LDRDui GPR64sp:$Rn, uimm12s8:$offset)>;
- def : Pat <(v2i64 (vec_ins_or_scal_vec (i64
-              (load (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset))))),
-            (INSERT_SUBREG (v2i64 (IMPLICIT_DEF)),
-                           (LDRDui GPR64sp:$Rn, uimm12s8:$offset), dsub)>;
-
// Match all load 64 bits width whose type is compatible with FPR64
let Predicates = [IsLE] in {
// We must use LD1 to perform vector loads in big-endian.
@@ -3902,12 +3809,13 @@ def : InstAlias<"ldrsh $Rt, [$Rn, $offset]",
def : InstAlias<"ldrsw $Rt, [$Rn, $offset]",
                (LDURSWi GPR64:$Rt, GPR64sp:$Rn, simm9_offset_fb32:$offset), 0>;
- // A LDR will implicitly zero the rest of the vector, so vector_insert(zeros,
- // load, 0) can use a single load.
- multiclass LoadInsertZeroPatterns<SDPatternOperator LoadOp, ValueType VT, ValueType HVT, ValueType SVT,
-                                   ValueType ScalarVT, Instruction LoadInst, Instruction UnscaledLoadInst,
-                                   ComplexPattern Addr, ComplexPattern UnscaledAddr, Operand AddrImm,
-                                   SubRegIndex SubReg> {
+ // A LDR will implicitly zero the rest of the vector, so vector_insert(zeros, load, 0)
+ // can use a single load. Same for scalar_to_vector(load) or insert(undef, load, 0).
+ multiclass LoadInsertVTPatterns<SDPatternOperator LoadOp, ValueType VT, ValueType ScalarVT,
+                                 Instruction LoadInst, Instruction UnscaledLoadInst,
+                                 Instruction ROWLoadInst, Instruction ROXLoadInst,
+                                 ROAddrMode ro, ComplexPattern Addr, ComplexPattern UnscaledAddr,
+                                 Operand AddrImm, SubRegIndex SubReg> {
// Scaled
def : Pat <(vector_insert (VT immAllZerosV),
           (ScalarVT (LoadOp (Addr GPR64sp:$Rn, AddrImm:$offset))), (i64 0)),
@@ -3916,42 +3824,82 @@ multiclass LoadInsertZeroPatterns<SDPatternOperator LoadOp, ValueType VT, ValueT
def : Pat <(vector_insert (VT immAllZerosV),
           (ScalarVT (LoadOp (UnscaledAddr GPR64sp:$Rn, simm9:$offset))), (i64 0)),
           (SUBREG_TO_REG (i64 0), (UnscaledLoadInst GPR64sp:$Rn, simm9:$offset), SubReg)>;
+ // roW
+ def : Pat <(vector_insert (VT immAllZerosV),
+            (ScalarVT (LoadOp (ro.Wpat GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$extend))), (i64 0)),
+            (SUBREG_TO_REG (i64 0), (ROWLoadInst GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$extend), SubReg)>;
+ // roX
+ def : Pat <(vector_insert (VT immAllZerosV),
+            (ScalarVT (LoadOp (ro.Xpat GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$extend))), (i64 0)),
+            (SUBREG_TO_REG (i64 0), (ROXLoadInst GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$extend), SubReg)>;
- // Half-vector patterns
- def : Pat <(vector_insert (HVT immAllZerosV),
-            (ScalarVT (LoadOp (Addr GPR64sp:$Rn, AddrImm:$offset))), (i64 0)),
-            (SUBREG_TO_REG (i64 0), (LoadInst GPR64sp:$Rn, AddrImm:$offset), SubReg)>;
- // Unscaled
- def : Pat <(vector_insert (HVT immAllZerosV),
-            (ScalarVT (LoadOp (UnscaledAddr GPR64sp:$Rn, simm9:$offset))), (i64 0)),
-            (SUBREG_TO_REG (i64 0), (UnscaledLoadInst GPR64sp:$Rn, simm9:$offset), SubReg)>;
-
- // SVE patterns
- def : Pat <(vector_insert (SVT immAllZerosV),
-            (ScalarVT (LoadOp (Addr GPR64sp:$Rn, AddrImm:$offset))), (i64 0)),
-            (SUBREG_TO_REG (i64 0), (LoadInst GPR64sp:$Rn, AddrImm:$offset), SubReg)>;
- // Unscaled
- def : Pat <(vector_insert (SVT immAllZerosV),
-            (ScalarVT (LoadOp (UnscaledAddr GPR64sp:$Rn, simm9:$offset))), (i64 0)),
+ // Undef equivalents of the patterns above.
+ def : Pat <(VT (vec_ins_or_scal_vec
+                  (ScalarVT (LoadOp (Addr GPR64sp:$Rn, AddrImm:$offset))))),
+            (SUBREG_TO_REG (i64 0), (LoadInst GPR64sp:$Rn, AddrImm:$offset), SubReg)>;
+ def : Pat <(VT (vec_ins_or_scal_vec
+                  (ScalarVT (LoadOp (UnscaledAddr GPR64sp:$Rn, simm9:$offset))))),
(SUBREG_TO_REG (i64 0), (UnscaledLoadInst GPR64sp:$Rn, simm9:$offset), SubReg)>;
- }
-
- defm : LoadInsertZeroPatterns<extloadi8, v16i8, v8i8, nxv16i8, i32, LDRBui, LDURBi,
-                               am_indexed8, am_unscaled8, uimm12s1, bsub>;
- defm : LoadInsertZeroPatterns<extloadi16, v8i16, v4i16, nxv8i16, i32, LDRHui, LDURHi,
-                               am_indexed16, am_unscaled16, uimm12s2, hsub>;
- defm : LoadInsertZeroPatterns<load, v4i32, v2i32, nxv4i32, i32, LDRSui, LDURSi,
-                               am_indexed32, am_unscaled32, uimm12s4, ssub>;
- defm : LoadInsertZeroPatterns<load, v2i64, v1i64, nxv2i64, i64, LDRDui, LDURDi,
-                               am_indexed64, am_unscaled64, uimm12s8, dsub>;
- defm : LoadInsertZeroPatterns<load, v8f16, v4f16, nxv8f16, f16, LDRHui, LDURHi,
-                               am_indexed16, am_unscaled16, uimm12s2, hsub>;
- defm : LoadInsertZeroPatterns<load, v8bf16, v4bf16, nxv8bf16, bf16, LDRHui, LDURHi,
-                               am_indexed16, am_unscaled16, uimm12s2, hsub>;
- defm : LoadInsertZeroPatterns<load, v4f32, v2f32, nxv4f32, f32, LDRSui, LDURSi,
-                               am_indexed32, am_unscaled32, uimm12s4, ssub>;
- defm : LoadInsertZeroPatterns<load, v2f64, v1f64, nxv2f64, f64, LDRDui, LDURDi,
-                               am_indexed64, am_unscaled64, uimm12s8, dsub>;
+ def : Pat <(VT (vec_ins_or_scal_vec
+                  (ScalarVT (LoadOp (ro.Wpat GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$extend))))),
+            (SUBREG_TO_REG (i64 0), (ROWLoadInst GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$extend), SubReg)>;
+ def : Pat <(VT (vec_ins_or_scal_vec
+                  (ScalarVT (LoadOp (ro.Xpat GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$extend))))),
+            (SUBREG_TO_REG (i64 0), (ROXLoadInst GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$extend), SubReg)>;
+ }
+
+ multiclass LoadInsertPatterns<SDPatternOperator LoadOp, ValueType VT, ValueType HVT, ValueType SVT,
+                               ValueType ScalarVT, Instruction LoadInst, Instruction UnscaledLoadInst,
+                               Instruction ROWLoadInst, Instruction ROXLoadInst,
+                               ROAddrMode ro, ComplexPattern Addr, ComplexPattern UnscaledAddr,
+                               Operand AddrImm, SubRegIndex SubReg> {
+   defm : LoadInsertVTPatterns<LoadOp, VT, ScalarVT, LoadInst, UnscaledLoadInst, ROWLoadInst,
+                               ROXLoadInst, ro, Addr, UnscaledAddr, AddrImm, SubReg>;
+   defm : LoadInsertVTPatterns<LoadOp, HVT, ScalarVT, LoadInst, UnscaledLoadInst, ROWLoadInst,
+                               ROXLoadInst, ro, Addr, UnscaledAddr, AddrImm, SubReg>;
+   defm : LoadInsertVTPatterns<LoadOp, SVT, ScalarVT, LoadInst, UnscaledLoadInst, ROWLoadInst,
+                               ROXLoadInst, ro, Addr, UnscaledAddr, AddrImm, SubReg>;
+ }
+
+ defm : LoadInsertPatterns<extloadi8, v16i8, v8i8, nxv16i8, i32,
+                           LDRBui, LDURBi, LDRBroW, LDRBroX,
+                           ro8, am_indexed8, am_unscaled8, uimm12s1, bsub>;
+ defm : LoadInsertPatterns<extloadi16, v8i16, v4i16, nxv8i16, i32,
+                           LDRHui, LDURHi, LDRHroW, LDRHroX,
+                           ro16, am_indexed16, am_unscaled16, uimm12s2, hsub>;
+ defm : LoadInsertPatterns<load, v4i32, v2i32, nxv4i32, i32,
+                           LDRSui, LDURSi, LDRSroW, LDRSroX,
+                           ro32, am_indexed32, am_unscaled32, uimm12s4, ssub>;
+ defm : LoadInsertPatterns<load, v2i64, isVoid, nxv2i64, i64,
+                           LDRDui, LDURDi, LDRDroW, LDRDroX,
+                           ro64, am_indexed64, am_unscaled64, uimm12s8, dsub>;
+ defm : LoadInsertPatterns<load, v8f16, v4f16, nxv8f16, f16,
+                           LDRHui, LDURHi, LDRHroW, LDRHroX,
+                           ro16, am_indexed16, am_unscaled16, uimm12s2, hsub>;
+ defm : LoadInsertPatterns<load, v8bf16, v4bf16, nxv8bf16, bf16,
+                           LDRHui, LDURHi, LDRHroW, LDRHroX,
+                           ro16, am_indexed16, am_unscaled16, uimm12s2, hsub>;
+ defm : LoadInsertPatterns<load, v4f32, v2f32, nxv4f32, f32,
+                           LDRSui, LDURSi, LDRSroW, LDRSroX,
+                           ro32, am_indexed32, am_unscaled32, uimm12s4, ssub>;
+ defm : LoadInsertPatterns<load, v2f64, isVoid, nxv2f64, f64,
+                           LDRDui, LDURDi, LDRDroW, LDRDroX,
+                           ro64, am_indexed64, am_unscaled64, uimm12s8, dsub>;
+
+ // Extra patterns for v1f64 scalar_to_vector(load), which need to avoid the
+ // SUBREG_TO_REG used above.
+ def : Pat <(v1i64 (scalar_to_vector (i64
+              (load (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset))))),
+            (LDRDui GPR64sp:$Rn, uimm12s8:$offset)>;
+ def : Pat <(v1i64 (scalar_to_vector (i64
+              (load (am_unscaled64 GPR64sp:$Rn, simm9:$offset))))),
+            (LDURDi GPR64sp:$Rn, simm9:$offset)>;
+ def : Pat <(v1i64 (scalar_to_vector (i64
+              (load (ro64.Wpat GPR64sp:$Rn, GPR32:$Rm, ro64.Wext:$extend))))),
+            (LDRDroW GPR64sp:$Rn, GPR32:$Rm, ro64.Wext:$extend)>;
+ def : Pat <(v1i64 (scalar_to_vector (i64
+              (load (ro64.Xpat GPR64sp:$Rn, GPR64:$Rm, ro64.Xext:$extend))))),
+            (LDRDroX GPR64sp:$Rn, GPR64:$Rm, ro64.Xext:$extend)>;
// Pre-fetch.
defm PRFUM : PrefetchUnscaled<0b11, 0, 0b10, "prfum",