[AArch64][SVE2.1] Add intrinsics for quadword loads/stores with unscaled offset #70474

Merged (5 commits, Nov 21, 2023)
46 changes: 46 additions & 0 deletions clang/include/clang/Basic/arm_sve.td
@@ -298,6 +298,29 @@ let TargetGuard = "sve,bf16" in {
def SVBFMLALT_LANE : SInst<"svbfmlalt_lane[_{0}]", "MMddi", "b", MergeNone, "aarch64_sve_bfmlalt_lane_v2", [IsOverloadNone], [ImmCheck<3, ImmCheck0_7>]>;
}

let TargetGuard = "sve2p1" in {
// Contiguous zero-extending load to quadword (single vector).
def SVLD1UWQ : MInst<"svld1uwq[_{d}]", "dPc", "iUif", [IsLoad], MemEltTyInt32, "aarch64_sve_ld1uwq">;
def SVLD1UWQ_VNUM : MInst<"svld1uwq_vnum[_{d}]", "dPcl", "iUif", [IsLoad], MemEltTyInt32, "aarch64_sve_ld1uwq">;

def SVLD1UDQ : MInst<"svld1udq[_{d}]", "dPc", "lUld", [IsLoad], MemEltTyInt64, "aarch64_sve_ld1udq">;
def SVLD1UDQ_VNUM : MInst<"svld1udq_vnum[_{d}]", "dPcl", "lUld", [IsLoad], MemEltTyInt64, "aarch64_sve_ld1udq">;

// Load one vector (vector base + scalar offset)
def SVLD1Q_GATHER_U64BASE_OFFSET : MInst<"svld1q_gather[_{2}base]_offset_{d}", "dPgl", "cUcsUsiUilUlfhdb", [IsGatherLoad, IsByteIndexed], MemEltTyDefault, "aarch64_sve_ld1q_gather_scalar_offset">;
def SVLD1Q_GATHER_U64BASE : MInst<"svld1q_gather[_{2}base]_{d}", "dPg", "cUcsUsiUilUlfhdb", [IsGatherLoad, IsByteIndexed], MemEltTyDefault, "aarch64_sve_ld1q_gather_scalar_offset">;

// Load N-element structure into N vectors (scalar base)
defm SVLD2Q : StructLoad<"svld2q[_{2}]", "2Pc", "aarch64_sve_ld2q_sret">;
defm SVLD3Q : StructLoad<"svld3q[_{2}]", "3Pc", "aarch64_sve_ld3q_sret">;
defm SVLD4Q : StructLoad<"svld4q[_{2}]", "4Pc", "aarch64_sve_ld4q_sret">;

// Load N-element structure into N vectors (scalar base, VL displacement)
defm SVLD2Q_VNUM : StructLoad<"svld2q_vnum[_{2}]", "2Pcl", "aarch64_sve_ld2q_sret">;
defm SVLD3Q_VNUM : StructLoad<"svld3q_vnum[_{2}]", "3Pcl", "aarch64_sve_ld3q_sret">;
defm SVLD4Q_VNUM : StructLoad<"svld4q_vnum[_{2}]", "4Pcl", "aarch64_sve_ld4q_sret">;
}
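
For reference, a minimal usage sketch of the C-level intrinsics these definitions expand to. The prototypes, the placeholder function names, and the `-march=armv9-a+sve2p1` flag are inferred from the `.td` entries and the usual ACLE naming scheme rather than copied from a generated header, so treat them as an approximation.

```cpp
// Hedged sketch: names/prototypes inferred from the svld1uwq, svld1q_gather
// and svld2q definitions above; compile with something like
//   clang -O2 -march=armv9-a+sve2p1 ...
#include <arm_sve.h>
#include <stdint.h>

// Contiguous quadword load of 32-bit elements (single vector).
svfloat32_t load_quad_f32(svbool_t pg, const float *base) {
  return svld1uwq_f32(pg, base);
}

// Gather of 128-bit elements from a vector of 64-bit base addresses plus a
// scalar byte offset.
svint64_t gather_quad_s64(svbool_t pg, svuint64_t bases, int64_t offset) {
  return svld1q_gather_u64base_offset_s64(pg, bases, offset);
}

// Two-vector structured quadword load; the result is an svuint8x2_t tuple.
svuint8x2_t load_two_quads_u8(svbool_t pg, const uint8_t *base) {
  return svld2q_u8(pg, base);
}
```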

////////////////////////////////////////////////////////////////////////////////
// Stores

@@ -420,6 +443,29 @@ let TargetGuard = "sve,bf16" in {
def SVSTNT1_VNUM_BF : MInst<"svstnt1_vnum[_{d}]", "vPpld", "b", [IsStore], MemEltTyDefault, "aarch64_sve_stnt1">;
}

let TargetGuard = "sve2p1" in {
// Contiguous truncating store from quadword (single vector).
def SVST1UWQ : MInst<"svst1uwq[_{d}]", "vPcd", "iUif", [IsStore], MemEltTyInt32, "aarch64_sve_st1uwq">;
def SVST1UWQ_VNUM : MInst<"svst1uwq_vnum[_{d}]", "vPcld", "iUif", [IsStore], MemEltTyInt32, "aarch64_sve_st1uwq">;

def SVST1UDQ : MInst<"svst1udq[_{d}]", "vPcd", "lUld", [IsStore], MemEltTyInt64, "aarch64_sve_st1udq">;
def SVST1UDQ_VNUM : MInst<"svst1udq_vnum[_{d}]", "vPcld", "lUld", [IsStore], MemEltTyInt64, "aarch64_sve_st1udq">;

// Store one vector (vector base + scalar offset)
def SVST1Q_SCATTER_U64BASE_OFFSET : MInst<"svst1q_scatter[_{2}base]_offset[_{d}]", "vPgld", "cUcsUsiUilUlfhdb", [IsScatterStore, IsByteIndexed], MemEltTyDefault, "aarch64_sve_st1q_scatter_scalar_offset">;
def SVST1Q_SCATTER_U64BASE : MInst<"svst1q_scatter[_{2}base][_{d}]", "vPgd", "cUcsUsiUilUlfhdb", [IsScatterStore, IsByteIndexed], MemEltTyDefault, "aarch64_sve_st1q_scatter_scalar_offset">;

// Store N vectors into N-element structure (scalar base)
defm SVST2Q : StructStore<"svst2q[_{d}]", "vPc2", "aarch64_sve_st2q">;
defm SVST3Q : StructStore<"svst3q[_{d}]", "vPc3", "aarch64_sve_st3q">;
defm SVST4Q : StructStore<"svst4q[_{d}]", "vPc4", "aarch64_sve_st4q">;

// Store N vectors into N-element structure (scalar base, VL displacement)
defm SVST2Q_VNUM : StructStore<"svst2q_vnum[_{d}]", "vPcl2", "aarch64_sve_st2q">;
defm SVST3Q_VNUM : StructStore<"svst3q_vnum[_{d}]", "vPcl3", "aarch64_sve_st3q">;
defm SVST4Q_VNUM : StructStore<"svst4q_vnum[_{d}]", "vPcl4", "aarch64_sve_st4q">;
}
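
And the corresponding store side, again a hedged sketch with placeholder function names and prototypes inferred from the definitions above rather than from a generated arm_sve.h:

```cpp
// Hedged sketch: names/prototypes inferred from the svst1uwq, svst1q_scatter
// and svst2q definitions above; compile with something like
//   clang -O2 -march=armv9-a+sve2p1 ...
#include <arm_sve.h>
#include <stdint.h>

// Contiguous quadword store of 32-bit elements (single vector).
void store_quad_f32(svbool_t pg, float *base, svfloat32_t data) {
  svst1uwq_f32(pg, base, data);
}

// Scatter of 128-bit elements to a vector of 64-bit base addresses plus a
// scalar byte offset.
void scatter_quad_s64(svbool_t pg, svuint64_t bases, int64_t offset,
                      svint64_t data) {
  svst1q_scatter_u64base_offset_s64(pg, bases, offset, data);
}

// Two-vector structured quadword store; the data is an svuint8x2_t tuple.
void store_two_quads_u8(svbool_t pg, uint8_t *base, svuint8x2_t data) {
  svst2q_u8(pg, base, data);
}
```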

////////////////////////////////////////////////////////////////////////////////
// Prefetches

61 changes: 51 additions & 10 deletions clang/lib/CodeGen/CGBuiltin.cpp
@@ -9629,14 +9629,17 @@ Value *CodeGenFunction::EmitSVEStructLoad(const SVETypeFlags &TypeFlags,
case Intrinsic::aarch64_sve_ld2_sret:
case Intrinsic::aarch64_sve_ld1_pn_x2:
case Intrinsic::aarch64_sve_ldnt1_pn_x2:
case Intrinsic::aarch64_sve_ld2q_sret:
N = 2;
break;
case Intrinsic::aarch64_sve_ld3_sret:
case Intrinsic::aarch64_sve_ld3q_sret:
N = 3;
break;
case Intrinsic::aarch64_sve_ld4_sret:
case Intrinsic::aarch64_sve_ld1_pn_x4:
case Intrinsic::aarch64_sve_ldnt1_pn_x4:
case Intrinsic::aarch64_sve_ld4q_sret:
N = 4;
break;
default:
@@ -9674,14 +9677,17 @@ Value *CodeGenFunction::EmitSVEStructStore(const SVETypeFlags &TypeFlags,
case Intrinsic::aarch64_sve_st2:
case Intrinsic::aarch64_sve_st1_pn_x2:
case Intrinsic::aarch64_sve_stnt1_pn_x2:
case Intrinsic::aarch64_sve_st2q:
N = 2;
break;
case Intrinsic::aarch64_sve_st3:
case Intrinsic::aarch64_sve_st3q:
N = 3;
break;
case Intrinsic::aarch64_sve_st4:
case Intrinsic::aarch64_sve_st1_pn_x4:
case Intrinsic::aarch64_sve_stnt1_pn_x4:
case Intrinsic::aarch64_sve_st4q:
N = 4;
break;
default:
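
The quadword struct forms reuse the generic StructLoad/StructStore codegen; the only per-intrinsic fact these two switches contribute is the vector count N. A compact restatement of that mapping follows, illustrative only and not the actual CGBuiltin helper.

```cpp
// Illustrative restatement of the switches above: ld{2,3,4}q_sret and
// st{2,3,4}q simply join the existing buckets that pick N.
#include "llvm/IR/IntrinsicsAArch64.h"
#include "llvm/Support/ErrorHandling.h"

static unsigned getStructVectorCount(unsigned IntrinsicID) {
  using namespace llvm;
  switch (IntrinsicID) {
  case Intrinsic::aarch64_sve_ld2_sret:
  case Intrinsic::aarch64_sve_ld2q_sret:
  case Intrinsic::aarch64_sve_st2:
  case Intrinsic::aarch64_sve_st2q:
    return 2;
  case Intrinsic::aarch64_sve_ld3_sret:
  case Intrinsic::aarch64_sve_ld3q_sret:
  case Intrinsic::aarch64_sve_st3:
  case Intrinsic::aarch64_sve_st3q:
    return 3;
  case Intrinsic::aarch64_sve_ld4_sret:
  case Intrinsic::aarch64_sve_ld4q_sret:
  case Intrinsic::aarch64_sve_st4:
  case Intrinsic::aarch64_sve_st4q:
    return 4;
  default:
    llvm_unreachable("unexpected struct load/store intrinsic");
  }
}
```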
@@ -9757,7 +9763,7 @@ Value *CodeGenFunction::EmitSVEPrefetchLoad(const SVETypeFlags &TypeFlags,
Value *CodeGenFunction::EmitSVEMaskedLoad(const CallExpr *E,
llvm::Type *ReturnTy,
SmallVectorImpl<Value *> &Ops,
unsigned BuiltinID,
unsigned IntrinsicID,
bool IsZExtReturn) {
QualType LangPTy = E->getArg(1)->getType();
llvm::Type *MemEltTy = CGM.getTypes().ConvertType(
@@ -9766,28 +9772,46 @@ Value *CodeGenFunction::EmitSVEMaskedLoad(const CallExpr *E,
// The vector type that is returned may be different from the
// eventual type loaded from memory.
auto VectorTy = cast<llvm::ScalableVectorType>(ReturnTy);
auto MemoryTy = llvm::ScalableVectorType::get(MemEltTy, VectorTy);
llvm::ScalableVectorType *MemoryTy = nullptr;
llvm::ScalableVectorType *PredTy = nullptr;
bool IsQuadLoad = false;
switch (IntrinsicID) {
case Intrinsic::aarch64_sve_ld1uwq:
case Intrinsic::aarch64_sve_ld1udq:
MemoryTy = llvm::ScalableVectorType::get(MemEltTy, 1);
PredTy = llvm::ScalableVectorType::get(
llvm::Type::getInt1Ty(getLLVMContext()), 1);
IsQuadLoad = true;
break;
default:
MemoryTy = llvm::ScalableVectorType::get(MemEltTy, VectorTy);
PredTy = MemoryTy;
break;
}

Value *Predicate = EmitSVEPredicateCast(Ops[0], MemoryTy);
Value *Predicate = EmitSVEPredicateCast(Ops[0], PredTy);
Value *BasePtr = Ops[1];

// Does the load have an offset?
if (Ops.size() > 2)
BasePtr = Builder.CreateGEP(MemoryTy, BasePtr, Ops[2]);

Function *F = CGM.getIntrinsic(BuiltinID, MemoryTy);
Function *F = CGM.getIntrinsic(IntrinsicID, IsQuadLoad ? VectorTy : MemoryTy);
auto *Load =
cast<llvm::Instruction>(Builder.CreateCall(F, {Predicate, BasePtr}));
auto TBAAInfo = CGM.getTBAAAccessInfo(LangPTy->getPointeeType());
CGM.DecorateInstructionWithTBAA(Load, TBAAInfo);

if (IsQuadLoad)
return Load;

return IsZExtReturn ? Builder.CreateZExt(Load, VectorTy)
: Builder.CreateSExt(Load, VectorTy);
}
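
The essential change for the quadword loads is the type bookkeeping: the memory and predicate types become one-element scalable vectors, the intrinsic is overloaded on the full result type rather than the memory type, and the final zero/sign extension is skipped. Below is a distilled sketch of that selection; it is not the actual CGBuiltin code, and it assumes only ld1uwq/ld1udq take the quad path.

```cpp
// Distilled sketch of the load-side type selection above (illustrative only).
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/IntrinsicsAArch64.h"

struct MaskedLoadTypes {
  llvm::ScalableVectorType *MemoryTy; // element type as seen in memory
  llvm::ScalableVectorType *PredTy;   // what the svbool predicate is cast to
  bool IsQuadLoad;                    // true => no zext/sext of the result
};

static MaskedLoadTypes selectLoadTypes(llvm::LLVMContext &Ctx,
                                       unsigned IntrinsicID,
                                       llvm::Type *MemEltTy,
                                       llvm::ScalableVectorType *VectorTy) {
  switch (IntrinsicID) {
  case llvm::Intrinsic::aarch64_sve_ld1uwq:
  case llvm::Intrinsic::aarch64_sve_ld1udq: {
    // Quadword forms: <vscale x 1 x eltTy> memory view, <vscale x 1 x i1>
    // predicate, intrinsic overloaded on the full return vector type.
    auto *MemTy = llvm::ScalableVectorType::get(MemEltTy, 1);
    auto *PredTy =
        llvm::ScalableVectorType::get(llvm::Type::getInt1Ty(Ctx), 1);
    return {MemTy, PredTy, /*IsQuadLoad=*/true};
  }
  default: {
    // Ordinary masked loads: predicate and memory type share the element
    // count of the returned vector.
    auto *MemTy = llvm::ScalableVectorType::get(MemEltTy, VectorTy);
    return {MemTy, MemTy, /*IsQuadLoad=*/false};
  }
  }
}
```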

Value *CodeGenFunction::EmitSVEMaskedStore(const CallExpr *E,
SmallVectorImpl<Value *> &Ops,
unsigned BuiltinID) {
unsigned IntrinsicID) {
QualType LangPTy = E->getArg(1)->getType();
llvm::Type *MemEltTy = CGM.getTypes().ConvertType(
LangPTy->castAs<PointerType>()->getPointeeType());
@@ -9797,17 +9821,34 @@ Value *CodeGenFunction::EmitSVEMaskedStore(const CallExpr *E,
auto VectorTy = cast<llvm::ScalableVectorType>(Ops.back()->getType());
auto MemoryTy = llvm::ScalableVectorType::get(MemEltTy, VectorTy);

Value *Predicate = EmitSVEPredicateCast(Ops[0], MemoryTy);
auto PredTy = MemoryTy;
auto AddrMemoryTy = MemoryTy;
bool IsQuadStore = false;

switch (IntrinsicID) {
case Intrinsic::aarch64_sve_st1uwq:
case Intrinsic::aarch64_sve_st1udq:
AddrMemoryTy = llvm::ScalableVectorType::get(MemEltTy, 1);
PredTy =
llvm::ScalableVectorType::get(IntegerType::get(getLLVMContext(), 1), 1);
IsQuadStore = true;
break;
default:
break;
}
Value *Predicate = EmitSVEPredicateCast(Ops[0], PredTy);
Value *BasePtr = Ops[1];

// Does the store have an offset?
if (Ops.size() == 4)
BasePtr = Builder.CreateGEP(MemoryTy, BasePtr, Ops[2]);
BasePtr = Builder.CreateGEP(AddrMemoryTy, BasePtr, Ops[2]);

// Last value is always the data
llvm::Value *Val = Builder.CreateTrunc(Ops.back(), MemoryTy);
Value *Val =
IsQuadStore ? Ops.back() : Builder.CreateTrunc(Ops.back(), MemoryTy);

Function *F = CGM.getIntrinsic(BuiltinID, MemoryTy);
Function *F =
CGM.getIntrinsic(IntrinsicID, IsQuadStore ? VectorTy : MemoryTy);
auto *Store =
cast<llvm::Instruction>(Builder.CreateCall(F, {Val, Predicate, BasePtr}));
auto TBAAInfo = CGM.getTBAAAccessInfo(LangPTy->getPointeeType());
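
Finally, the _vnum variants added in arm_sve.td carry an extra int64_t displacement, which both EmitSVEMaskedLoad and EmitSVEMaskedStore fold into the base pointer with a GEP over the memory type before calling the intrinsic (the CreateGEP calls above). A hedged usage sketch, with placeholder function names and prototypes again inferred from the `.td` entries:

```cpp
// Hedged sketch of the _vnum forms: the extra int64_t argument is a
// displacement that codegen turns into a GEP over the memory type before the
// intrinsic call.
#include <arm_sve.h>
#include <stdint.h>

svfloat32_t load_quad_vnum(svbool_t pg, const float *base, int64_t vnum) {
  return svld1uwq_vnum_f32(pg, base, vnum);
}

void store_quad_vnum(svbool_t pg, float *base, int64_t vnum, svfloat32_t v) {
  svst1uwq_vnum_f32(pg, base, vnum, v);
}
```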