
Commit f335883

Authored by momchil-velikov, CarolineConcatto, and hassnaaHamdi
[AArch64][SVE2.1] Add intrinsics for quadword loads/stores with unscaled offset (#70474)
This patch adds a set of SVE2.1 quadword load/store intrinsics:

* Contiguous zero-extending load to quadword (single vector)

    sv<type>_t svld1uwq[_<typ>](svbool_t, const <type>_t *ptr);
    sv<type>_t svld1uwq_vnum[_<typ>](svbool_t, const <type>_t *ptr, int64_t vnum);

    sv<type>_t svld1udq[_<typ>](svbool_t, const <type>_t *ptr);
    sv<type>_t svld1udq_vnum[_<typ>](svbool_t, const <type>_t *ptr, int64_t vnum);

* Contiguous truncating store of single vector operand

    void svst1uwq[_<typ>](svbool_t, <type>_t *ptr, sv<type>_t data);
    void svst1uwq_vnum[_<typ>](svbool_t, <type>_t *ptr, int64_t vnum, sv<type>_t data);

    void svst1udq[_<typ>](svbool_t, <type>_t *ptr, sv<type>_t data);
    void svst1udq_vnum[_<typ>](svbool_t, <type>_t *ptr, int64_t vnum, sv<type>_t data);

* Gather load quadword

    sv<type>_t svld1q_gather[_u64base]_<typ>(svbool_t pg, svuint64_t zn);
    sv<type>_t svld1q_gather[_u64base]_offset_<typ>(svbool_t pg, svuint64_t zn, int64_t offset);

* Scatter store quadword

    void svst1q_scatter[_u64base][_<typ>](svbool_t pg, svuint64_t zn, sv<type>_t data);
    void svst1q_scatter[_u64base]_offset[_<typ>](svbool_t pg, svuint64_t zn, int64_t offset, sv<type>_t data);

* Contiguous load of two, three or four quadword structures

    sv<type>x2_t svld2q[_<typ>](svbool_t pg, const <type>_t *rn);
    sv<type>x2_t svld2q_vnum[_<typ>](svbool_t pg, const <type>_t *rn, uint64_t vnum);
    sv<type>x3_t svld3q[_<typ>](svbool_t pg, const <type>_t *rn);
    sv<type>x3_t svld3q_vnum[_<typ>](svbool_t pg, const <type>_t *rn, uint64_t vnum);
    sv<type>x4_t svld4q[_<typ>](svbool_t pg, const <type>_t *rn);
    sv<type>x4_t svld4q_vnum[_<typ>](svbool_t pg, const <type>_t *rn, uint64_t vnum);

* Contiguous store of two, three or four quadword structures

    void svst2q[_<typ>](svbool_t pg, <type>_t *rn, sv<type>x2_t zt);
    void svst2q_vnum[_<typ>](svbool_t pg, <type>_t *rn, int64_t vnum, sv<type>x2_t zt);
    void svst3q[_<typ>](svbool_t pg, <type>_t *rn, sv<type>x3_t zt);
    void svst3q_vnum[_<typ>](svbool_t pg, <type>_t *rn, int64_t vnum, sv<type>x3_t zt);
    void svst4q[_<typ>](svbool_t pg, <type>_t *rn, sv<type>x4_t zt);
    void svst4q_vnum[_<typ>](svbool_t pg, <type>_t *rn, int64_t vnum, sv<type>x4_t zt);

ACLE spec: ARM-software/acle#257

Co-authored-by: Caroline Concatto <[email protected]>
Co-authored-by: Hassnaa Hamdi <[email protected]>
1 parent: a3908d3

18 files changed, +7666 -16 lines
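For orientation, a minimal usage sketch of a few of the new intrinsics. This is illustrative only, assuming a toolchain with SVE2.1 support (e.g. -march=armv9-a+sve2p1); the function and buffer names are invented:

    #include <arm_sve.h>

    // Contiguous zero-extending load to quadword (single vector).
    svuint32_t load_uwq(const uint32_t *src) {
      return svld1uwq_u32(svptrue_b32(), src);
    }

    // Gather load of quadwords: a vector of 64-bit bases plus a byte offset.
    svfloat64_t gather_q(svuint64_t bases, int64_t off) {
      return svld1q_gather_u64base_offset_f64(svptrue_b64(), bases, off);
    }

    // Load, then re-store, a two-vector quadword structure.
    void copy_2q(const int32_t *src, int32_t *dst) {
      svbool_t pg = svptrue_b32();
      svint32x2_t v = svld2q_s32(pg, src);
      svst2q_s32(pg, dst, v);
    }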

clang/include/clang/Basic/arm_sve.td

Lines changed: 46 additions & 0 deletions
@@ -298,6 +298,29 @@ let TargetGuard = "sve,bf16" in {
 def SVBFMLALT_LANE : SInst<"svbfmlalt_lane[_{0}]", "MMddi", "b", MergeNone, "aarch64_sve_bfmlalt_lane_v2", [IsOverloadNone], [ImmCheck<3, ImmCheck0_7>]>;
 }
 
+let TargetGuard = "sve2p1" in {
+// Contiguous zero-extending load to quadword (single vector).
+def SVLD1UWQ : MInst<"svld1uwq[_{d}]", "dPc", "iUif", [IsLoad], MemEltTyInt32, "aarch64_sve_ld1uwq">;
+def SVLD1UWQ_VNUM : MInst<"svld1uwq_vnum[_{d}]", "dPcl", "iUif", [IsLoad], MemEltTyInt32, "aarch64_sve_ld1uwq">;
+
+def SVLD1UDQ : MInst<"svld1udq[_{d}]", "dPc", "lUld", [IsLoad], MemEltTyInt64, "aarch64_sve_ld1udq">;
+def SVLD1UDQ_VNUM : MInst<"svld1udq_vnum[_{d}]", "dPcl", "lUld", [IsLoad], MemEltTyInt64, "aarch64_sve_ld1udq">;
+
+// Load one vector (vector base + scalar offset)
+def SVLD1Q_GATHER_U64BASE_OFFSET : MInst<"svld1q_gather[_{2}base]_offset_{d}", "dPgl", "cUcsUsiUilUlfhdb", [IsGatherLoad, IsByteIndexed], MemEltTyDefault, "aarch64_sve_ld1q_gather_scalar_offset">;
+def SVLD1Q_GATHER_U64BASE : MInst<"svld1q_gather[_{2}base]_{d}", "dPg", "cUcsUsiUilUlfhdb", [IsGatherLoad, IsByteIndexed], MemEltTyDefault, "aarch64_sve_ld1q_gather_scalar_offset">;
+
+// Load N-element structure into N vectors (scalar base)
+defm SVLD2Q : StructLoad<"svld2q[_{2}]", "2Pc", "aarch64_sve_ld2q_sret">;
+defm SVLD3Q : StructLoad<"svld3q[_{2}]", "3Pc", "aarch64_sve_ld3q_sret">;
+defm SVLD4Q : StructLoad<"svld4q[_{2}]", "4Pc", "aarch64_sve_ld4q_sret">;
+
+// Load N-element structure into N vectors (scalar base, VL displacement)
+defm SVLD2Q_VNUM : StructLoad<"svld2q_vnum[_{2}]", "2Pcl", "aarch64_sve_ld2q_sret">;
+defm SVLD3Q_VNUM : StructLoad<"svld3q_vnum[_{2}]", "3Pcl", "aarch64_sve_ld3q_sret">;
+defm SVLD4Q_VNUM : StructLoad<"svld4q_vnum[_{2}]", "4Pcl", "aarch64_sve_ld4q_sret">;
+}
+
 ////////////////////////////////////////////////////////////////////////////////
 // Stores

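The _vnum forms take a vector-length displacement. A hedged sketch follows; the displacement unit (one load's worth of elements) is inferred from the GEP on the quad memory type in the CGBuiltin.cpp changes below, not from the spec:

    #include <arm_sve.h>

    // Load two consecutive blocks with the zero-extending doubleword
    // quad load; vnum = 1 appears to advance the base by exactly the
    // elements consumed by one load (an inference from this patch's
    // codegen).
    svuint64_t sum_two_blocks(const uint64_t *base) {
      svbool_t pg = svptrue_b64();
      svuint64_t v0 = svld1udq_u64(pg, base);
      svuint64_t v1 = svld1udq_vnum_u64(pg, base, 1);
      return svadd_u64_x(pg, v0, v1);
    }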
@@ -420,6 +443,29 @@ let TargetGuard = "sve,bf16" in {
 def SVSTNT1_VNUM_BF : MInst<"svstnt1_vnum[_{d}]", "vPpld", "b", [IsStore], MemEltTyDefault, "aarch64_sve_stnt1">;
 }
 
+let TargetGuard = "sve2p1" in {
+// Contiguous truncating store from quadword (single vector).
+def SVST1UWQ : MInst<"svst1uwq[_{d}]", "vPcd", "iUif", [IsStore], MemEltTyInt32, "aarch64_sve_st1uwq">;
+def SVST1UWQ_VNUM : MInst<"svst1uwq_vnum[_{d}]", "vPcld", "iUif", [IsStore], MemEltTyInt32, "aarch64_sve_st1uwq">;
+
+def SVST1UDQ : MInst<"svst1udq[_{d}]", "vPcd", "lUld", [IsStore], MemEltTyInt64, "aarch64_sve_st1udq">;
+def SVST1UDQ_VNUM : MInst<"svst1udq_vnum[_{d}]", "vPcld", "lUld", [IsStore], MemEltTyInt64, "aarch64_sve_st1udq">;
+
+// Store one vector (vector base + scalar offset)
+def SVST1Q_SCATTER_U64BASE_OFFSET : MInst<"svst1q_scatter[_{2}base]_offset[_{d}]", "vPgld", "cUcsUsiUilUlfhdb", [IsScatterStore, IsByteIndexed], MemEltTyDefault, "aarch64_sve_st1q_scatter_scalar_offset">;
+def SVST1Q_SCATTER_U64BASE : MInst<"svst1q_scatter[_{2}base][_{d}]", "vPgd", "cUcsUsiUilUlfhdb", [IsScatterStore, IsByteIndexed], MemEltTyDefault, "aarch64_sve_st1q_scatter_scalar_offset">;
+
+// Store N vectors into N-element structure (scalar base)
+defm SVST2Q : StructStore<"svst2q[_{d}]", "vPc2", "aarch64_sve_st2q">;
+defm SVST3Q : StructStore<"svst3q[_{d}]", "vPc3", "aarch64_sve_st3q">;
+defm SVST4Q : StructStore<"svst4q[_{d}]", "vPc4", "aarch64_sve_st4q">;
+
+// Store N vectors into N-element structure (scalar base, VL displacement)
+defm SVST2Q_VNUM : StructStore<"svst2q_vnum[_{d}]", "vPcl2", "aarch64_sve_st2q">;
+defm SVST3Q_VNUM : StructStore<"svst3q_vnum[_{d}]", "vPcl3", "aarch64_sve_st3q">;
+defm SVST4Q_VNUM : StructStore<"svst4q_vnum[_{d}]", "vPcl4", "aarch64_sve_st4q">;
+}
+
 ////////////////////////////////////////////////////////////////////////////////
 // Prefetches

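A hedged sketch of the scatter side (names invented; the _offset form is byte-indexed, per the IsByteIndexed flag above):

    #include <arm_sve.h>

    // Scatter quadwords of data to addresses formed from a vector of
    // 64-bit bases plus a scalar byte offset.
    void scatter_q(svuint64_t bases, int64_t off, svfloat32_t data) {
      svst1q_scatter_u64base_offset_f32(svptrue_b64(), bases, off, data);
    }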
clang/lib/CodeGen/CGBuiltin.cpp

Lines changed: 51 additions & 10 deletions
@@ -9652,14 +9652,17 @@ Value *CodeGenFunction::EmitSVEStructLoad(const SVETypeFlags &TypeFlags,
   case Intrinsic::aarch64_sve_ld2_sret:
   case Intrinsic::aarch64_sve_ld1_pn_x2:
   case Intrinsic::aarch64_sve_ldnt1_pn_x2:
+  case Intrinsic::aarch64_sve_ld2q_sret:
     N = 2;
     break;
   case Intrinsic::aarch64_sve_ld3_sret:
+  case Intrinsic::aarch64_sve_ld3q_sret:
     N = 3;
     break;
   case Intrinsic::aarch64_sve_ld4_sret:
   case Intrinsic::aarch64_sve_ld1_pn_x4:
   case Intrinsic::aarch64_sve_ldnt1_pn_x4:
+  case Intrinsic::aarch64_sve_ld4q_sret:
     N = 4;
     break;
   default:
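For reference, a sketch of a call that reaches the new N = 3 case above (assuming SVE2.1 support; names invented):

    #include <arm_sve.h>

    // svld3q lowers to aarch64_sve_ld3q_sret, a struct-return load of
    // three vectors, de-interleaved at quadword granularity.
    svfloat32_t third_vector(const float32_t *src) {
      svfloat32x3_t v = svld3q_f32(svptrue_b32(), src);
      return svget3_f32(v, 2);  // third element of the tuple
    }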
@@ -9697,14 +9700,17 @@ Value *CodeGenFunction::EmitSVEStructStore(const SVETypeFlags &TypeFlags,
   case Intrinsic::aarch64_sve_st2:
   case Intrinsic::aarch64_sve_st1_pn_x2:
   case Intrinsic::aarch64_sve_stnt1_pn_x2:
+  case Intrinsic::aarch64_sve_st2q:
     N = 2;
     break;
   case Intrinsic::aarch64_sve_st3:
+  case Intrinsic::aarch64_sve_st3q:
     N = 3;
     break;
   case Intrinsic::aarch64_sve_st4:
   case Intrinsic::aarch64_sve_st1_pn_x4:
   case Intrinsic::aarch64_sve_stnt1_pn_x4:
+  case Intrinsic::aarch64_sve_st4q:
     N = 4;
     break;
   default:
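And the store-side counterpart, reaching the new N = 4 case above (sketch, names invented):

    #include <arm_sve.h>

    // svst4q lowers to aarch64_sve_st4q; the four vectors are
    // interleaved at quadword granularity on store.
    void store_4q(int32_t *dst, svint32x4_t v) {
      svst4q_s32(svptrue_b32(), dst, v);
    }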
@@ -9780,7 +9786,7 @@ Value *CodeGenFunction::EmitSVEPrefetchLoad(const SVETypeFlags &TypeFlags,
 Value *CodeGenFunction::EmitSVEMaskedLoad(const CallExpr *E,
                                           llvm::Type *ReturnTy,
                                           SmallVectorImpl<Value *> &Ops,
-                                          unsigned BuiltinID,
+                                          unsigned IntrinsicID,
                                           bool IsZExtReturn) {
   QualType LangPTy = E->getArg(1)->getType();
   llvm::Type *MemEltTy = CGM.getTypes().ConvertType(
@@ -9789,28 +9795,46 @@ Value *CodeGenFunction::EmitSVEMaskedLoad(const CallExpr *E,
   // The vector type that is returned may be different from the
   // eventual type loaded from memory.
   auto VectorTy = cast<llvm::ScalableVectorType>(ReturnTy);
-  auto MemoryTy = llvm::ScalableVectorType::get(MemEltTy, VectorTy);
+  llvm::ScalableVectorType *MemoryTy = nullptr;
+  llvm::ScalableVectorType *PredTy = nullptr;
+  bool IsQuadLoad = false;
+  switch (IntrinsicID) {
+  case Intrinsic::aarch64_sve_ld1uwq:
+  case Intrinsic::aarch64_sve_ld1udq:
+    MemoryTy = llvm::ScalableVectorType::get(MemEltTy, 1);
+    PredTy = llvm::ScalableVectorType::get(
+        llvm::Type::getInt1Ty(getLLVMContext()), 1);
+    IsQuadLoad = true;
+    break;
+  default:
+    MemoryTy = llvm::ScalableVectorType::get(MemEltTy, VectorTy);
+    PredTy = MemoryTy;
+    break;
+  }
 
-  Value *Predicate = EmitSVEPredicateCast(Ops[0], MemoryTy);
+  Value *Predicate = EmitSVEPredicateCast(Ops[0], PredTy);
   Value *BasePtr = Ops[1];
 
   // Does the load have an offset?
   if (Ops.size() > 2)
     BasePtr = Builder.CreateGEP(MemoryTy, BasePtr, Ops[2]);
 
-  Function *F = CGM.getIntrinsic(BuiltinID, MemoryTy);
+  Function *F = CGM.getIntrinsic(IntrinsicID, IsQuadLoad ? VectorTy : MemoryTy);
   auto *Load =
       cast<llvm::Instruction>(Builder.CreateCall(F, {Predicate, BasePtr}));
   auto TBAAInfo = CGM.getTBAAAccessInfo(LangPTy->getPointeeType());
   CGM.DecorateInstructionWithTBAA(Load, TBAAInfo);
 
+  if (IsQuadLoad)
+    return Load;
+
   return IsZExtReturn ? Builder.CreateZExt(Load, VectorTy)
-                     : Builder.CreateSExt(Load, VectorTy);
+                      : Builder.CreateSExt(Load, VectorTy);
 }
 
 Value *CodeGenFunction::EmitSVEMaskedStore(const CallExpr *E,
                                            SmallVectorImpl<Value *> &Ops,
-                                           unsigned BuiltinID) {
+                                           unsigned IntrinsicID) {
   QualType LangPTy = E->getArg(1)->getType();
   llvm::Type *MemEltTy = CGM.getTypes().ConvertType(
       LangPTy->castAs<PointerType>()->getPointeeType());
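To make the quad-load path concrete: for ld1uwq/ld1udq the predicate is cast to the <vscale x 1 x i1> granule and the intrinsic is overloaded on the full return type, so no zext/sext widening is emitted. A sketch (the IR in the comment is inferred from the code above, not compiler output):

    #include <arm_sve.h>

    // Expected shape of the lowering (assumption):
    //   %v = call <vscale x 4 x i32> @llvm.aarch64.sve.ld1uwq.nxv4i32(
    //            <vscale x 1 x i1> %pg, ptr %p)
    // and %v is returned directly.
    svuint32_t quad_load(const uint32_t *p) {
      return svld1uwq_u32(svptrue_b32(), p);
    }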
@@ -9820,17 +9844,34 @@ Value *CodeGenFunction::EmitSVEMaskedStore(const CallExpr *E,
   auto VectorTy = cast<llvm::ScalableVectorType>(Ops.back()->getType());
   auto MemoryTy = llvm::ScalableVectorType::get(MemEltTy, VectorTy);
 
-  Value *Predicate = EmitSVEPredicateCast(Ops[0], MemoryTy);
+  auto PredTy = MemoryTy;
+  auto AddrMemoryTy = MemoryTy;
+  bool IsQuadStore = false;
+
+  switch (IntrinsicID) {
+  case Intrinsic::aarch64_sve_st1uwq:
+  case Intrinsic::aarch64_sve_st1udq:
+    AddrMemoryTy = llvm::ScalableVectorType::get(MemEltTy, 1);
+    PredTy =
+        llvm::ScalableVectorType::get(IntegerType::get(getLLVMContext(), 1), 1);
+    IsQuadStore = true;
+    break;
+  default:
+    break;
+  }
+  Value *Predicate = EmitSVEPredicateCast(Ops[0], PredTy);
   Value *BasePtr = Ops[1];
 
   // Does the store have an offset?
   if (Ops.size() == 4)
-    BasePtr = Builder.CreateGEP(MemoryTy, BasePtr, Ops[2]);
+    BasePtr = Builder.CreateGEP(AddrMemoryTy, BasePtr, Ops[2]);
 
   // Last value is always the data
-  llvm::Value *Val = Builder.CreateTrunc(Ops.back(), MemoryTy);
+  Value *Val =
+      IsQuadStore ? Ops.back() : Builder.CreateTrunc(Ops.back(), MemoryTy);
 
-  Function *F = CGM.getIntrinsic(BuiltinID, MemoryTy);
+  Function *F =
+      CGM.getIntrinsic(IntrinsicID, IsQuadStore ? VectorTy : MemoryTy);
   auto *Store =
       cast<llvm::Instruction>(Builder.CreateCall(F, {Val, Predicate, BasePtr}));
   auto TBAAInfo = CGM.getTBAAAccessInfo(LangPTy->getPointeeType());
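The store path mirrors this: for the quad forms the data operand is passed through untruncated. A sketch (the IR comment is again inferred, not compiler output):

    #include <arm_sve.h>

    // Expected shape of the lowering (assumption):
    //   call void @llvm.aarch64.sve.st1udq.nxv2f64(
    //       <vscale x 2 x double> %v, <vscale x 1 x i1> %pg, ptr %p)
    void quad_store(float64_t *p, svfloat64_t v) {
      svst1udq_f64(svptrue_b64(), p, v);
    }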
