
Commit 29b263a

[Clang][AArch64] Inline assembly support for the ACLE type 'data512_t'
In LLVM IR terms, the ACLE type 'data512_t' is essentially an aggregate type { [8 x i64] }. When emitting code for inline assembly operands, clang scalarizes aggregate types to an integer of the equivalent width where it can; otherwise it passes them by reference. This patch adds a target hook that reports whether a given inline assembly operand is scalarizable, so that clang can emit code to pass/return it by value.

Differential Revision: https://reviews.llvm.org/D94098
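As a quick illustration of what the by-value path enables, here is a minimal sketch of user code that passes a 512-bit aggregate straight through an inline-asm operand. It assumes an LS64-capable AArch64 target (e.g. compiled with '-target-feature +ls64', as the new test below is), and 'struct d512' is a hypothetical stand-in for the ACLE's 'data512_t':

    /* Sketch only: a 512-bit aggregate used directly as an inline-asm operand.
     * 'struct d512' stands in for data512_t; requires an LS64-capable target. */
    struct d512 { unsigned long long v[8]; };

    static void store_block(const struct d512 *data, void *addr)
    {
        /* With the new target hook, *data is scalarized to an i512 and passed
         * to the asm by value rather than being forced through memory. */
        __asm__ volatile ("st64b %0, [%1]" : : "r" (*data), "r" (addr) : "memory");
    }

The test added at the bottom of this commit exercises exactly this pattern (a load, a store, and a store through a locally built struct) and checks the resulting i512 asm operands.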

5 files changed: +137 and -17 lines


clang/lib/Basic/Targets/AArch64.cpp

Lines changed: 5 additions & 1 deletion
@@ -431,7 +431,8 @@ bool AArch64TargetInfo::hasFeature(StringRef Feature) const {
           Feature == "sve2-aes" || Feature == "sve2-sha3" ||
           Feature == "sve2-sm4" || Feature == "f64mm" || Feature == "f32mm" ||
           Feature == "i8mm" || Feature == "bf16") &&
-          (FPU & SveMode));
+          (FPU & SveMode)) ||
+         (Feature == "ls64" && HasLS64);
 }
 
 bool AArch64TargetInfo::handleTargetFeatures(std::vector<std::string> &Features,
@@ -752,6 +753,9 @@ bool AArch64TargetInfo::validateConstraintModifier(
     if (Size == 64)
       return true;
 
+    if (Size == 512)
+      return HasLS64;
+
     SuggestedModifier = "w";
     return false;
   }

clang/lib/CodeGen/CGStmt.cpp

Lines changed: 27 additions & 16 deletions
@@ -2097,7 +2097,8 @@ CodeGenFunction::EmitAsmInputLValue(const TargetInfo::ConstraintInfo &Info,
   } else {
     llvm::Type *Ty = ConvertType(InputType);
     uint64_t Size = CGM.getDataLayout().getTypeSizeInBits(Ty);
-    if (Size <= 64 && llvm::isPowerOf2_64(Size)) {
+    if ((Size <= 64 && llvm::isPowerOf2_64(Size)) ||
+        getTargetHooks().isScalarizableAsmOperand(*this, Ty)) {
       Ty = llvm::IntegerType::get(getLLVMContext(), Size);
       Ty = llvm::PointerType::getUnqual(Ty);
 
@@ -2320,23 +2321,28 @@ void CodeGenFunction::EmitAsmStmt(const AsmStmt &S) {
 
     // If this is a register output, then make the inline asm return it
     // by-value. If this is a memory result, return the value by-reference.
-    bool isScalarizableAggregate =
-        hasAggregateEvaluationKind(OutExpr->getType());
-    if (!Info.allowsMemory() && (hasScalarEvaluationKind(OutExpr->getType()) ||
-                                 isScalarizableAggregate)) {
+    QualType QTy = OutExpr->getType();
+    const bool IsScalarOrAggregate = hasScalarEvaluationKind(QTy) ||
+                                     hasAggregateEvaluationKind(QTy);
+    if (!Info.allowsMemory() && IsScalarOrAggregate) {
+
       Constraints += "=" + OutputConstraint;
-      ResultRegQualTys.push_back(OutExpr->getType());
+      ResultRegQualTys.push_back(QTy);
       ResultRegDests.push_back(Dest);
-      ResultTruncRegTypes.push_back(ConvertTypeForMem(OutExpr->getType()));
-      if (Info.allowsRegister() && isScalarizableAggregate) {
-        ResultTypeRequiresCast.push_back(true);
-        unsigned Size = getContext().getTypeSize(OutExpr->getType());
-        llvm::Type *ConvTy = llvm::IntegerType::get(getLLVMContext(), Size);
-        ResultRegTypes.push_back(ConvTy);
-      } else {
-        ResultTypeRequiresCast.push_back(false);
-        ResultRegTypes.push_back(ResultTruncRegTypes.back());
+
+      llvm::Type *Ty = ConvertTypeForMem(QTy);
+      const bool RequiresCast = Info.allowsRegister() &&
+          (getTargetHooks().isScalarizableAsmOperand(*this, Ty) ||
+           Ty->isAggregateType());
+
+      ResultTruncRegTypes.push_back(Ty);
+      ResultTypeRequiresCast.push_back(RequiresCast);
+
+      if (RequiresCast) {
+        unsigned Size = getContext().getTypeSize(QTy);
+        Ty = llvm::IntegerType::get(getLLVMContext(), Size);
       }
+      ResultRegTypes.push_back(Ty);
       // If this output is tied to an input, and if the input is larger, then
       // we need to set the actual result type of the inline asm node to be the
       // same as the input type.
@@ -2638,11 +2644,11 @@ void CodeGenFunction::EmitAsmStmt(const AsmStmt &S) {
   assert(ResultTypeRequiresCast.size() <= ResultRegDests.size());
   for (unsigned i = 0, e = RegResults.size(); i != e; ++i) {
     llvm::Value *Tmp = RegResults[i];
+    llvm::Type *TruncTy = ResultTruncRegTypes[i];
 
     // If the result type of the LLVM IR asm doesn't match the result type of
     // the expression, do the conversion.
     if (ResultRegTypes[i] != ResultTruncRegTypes[i]) {
-      llvm::Type *TruncTy = ResultTruncRegTypes[i];
 
       // Truncate the integer result to the right size, note that TruncTy can be
       // a pointer.
@@ -2672,6 +2678,11 @@ void CodeGenFunction::EmitAsmStmt(const AsmStmt &S) {
       unsigned Size = getContext().getTypeSize(ResultRegQualTys[i]);
       Address A = Builder.CreateBitCast(Dest.getAddress(*this),
                                         ResultRegTypes[i]->getPointerTo());
+      if (getTargetHooks().isScalarizableAsmOperand(*this, TruncTy)) {
+        Builder.CreateStore(Tmp, A);
+        continue;
+      }
+
       QualType Ty = getContext().getIntTypeForBitwidth(Size, /*Signed*/ false);
       if (Ty.isNull()) {
         const Expr *OutExpr = S.getOutputExpr(i);

clang/lib/CodeGen/TargetInfo.cpp

Lines changed: 14 additions & 0 deletions
@@ -5526,6 +5526,20 @@ class AArch64TargetCodeGenInfo : public TargetCodeGenInfo {
     Fn->addFnAttr("branch-target-enforcement",
                   BPI.BranchTargetEnforcement ? "true" : "false");
   }
+
+  bool isScalarizableAsmOperand(CodeGen::CodeGenFunction &CGF,
+                                llvm::Type *Ty) const override {
+    if (CGF.getTarget().hasFeature("ls64")) {
+      auto *ST = dyn_cast<llvm::StructType>(Ty);
+      if (ST && ST->getNumElements() == 1) {
+        auto *AT = dyn_cast<llvm::ArrayType>(ST->getElementType(0));
+        if (AT && AT->getNumElements() == 8 &&
+            AT->getElementType()->isIntegerTy(64))
+          return true;
+      }
+    }
+    return TargetCodeGenInfo::isScalarizableAsmOperand(CGF, Ty);
+  }
 };
 
 class WindowsAArch64TargetCodeGenInfo : public AArch64TargetCodeGenInfo {

clang/lib/CodeGen/TargetInfo.h

Lines changed: 7 additions & 0 deletions
@@ -148,6 +148,13 @@ class TargetCodeGenInfo {
     return Ty;
   }
 
+  /// Target hook to decide whether an inline asm operand can be passed
+  /// by value.
+  virtual bool isScalarizableAsmOperand(CodeGen::CodeGenFunction &CGF,
+                                        llvm::Type *Ty) const {
+    return false;
+  }
+
   /// Adds constraints and types for result registers.
   virtual void addReturnRegisterOutputs(
       CodeGen::CodeGenFunction &CGF, CodeGen::LValue ReturnValue,
New test file (AArch64 LS64 inline-assembly CodeGen test)

Lines changed: 84 additions & 0 deletions
@@ -0,0 +1,84 @@
// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
// RUN: %clang_cc1 -triple aarch64-eabi -target-feature +ls64 -O1 -S -emit-llvm -x c %s -o - | FileCheck %s

struct foo { unsigned long long x[8]; };

// CHECK-LABEL: @load(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[TMP0:%.*]] = call i512 asm sideeffect "ld64b $0,[$1]", "=r,r,~{memory}"(i8* [[ADDR:%.*]]) #[[ATTR1:[0-9]+]], !srcloc !6
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast %struct.foo* [[OUTPUT:%.*]] to i512*
// CHECK-NEXT:    store i512 [[TMP0]], i512* [[TMP1]], align 8
// CHECK-NEXT:    ret void
//
void load(struct foo *output, void *addr)
{
    __asm__ volatile ("ld64b %0,[%1]" : "=r" (*output) : "r" (addr) : "memory");
}

// CHECK-LABEL: @store(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast %struct.foo* [[INPUT:%.*]] to i512*
// CHECK-NEXT:    [[TMP1:%.*]] = load i512, i512* [[TMP0]], align 8
// CHECK-NEXT:    call void asm sideeffect "st64b $0,[$1]", "r,r,~{memory}"(i512 [[TMP1]], i8* [[ADDR:%.*]]) #[[ATTR1]], !srcloc !7
// CHECK-NEXT:    ret void
//
void store(const struct foo *input, void *addr)
{
    __asm__ volatile ("st64b %0,[%1]" : : "r" (*input), "r" (addr) : "memory" );
}

// CHECK-LABEL: @store2(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* [[IN:%.*]], align 4, !tbaa [[TBAA8:![0-9]+]]
// CHECK-NEXT:    [[CONV:%.*]] = sext i32 [[TMP0]] to i64
// CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, i32* [[IN]], i64 1
// CHECK-NEXT:    [[TMP1:%.*]] = load i32, i32* [[ARRAYIDX1]], align 4, !tbaa [[TBAA8]]
// CHECK-NEXT:    [[CONV2:%.*]] = sext i32 [[TMP1]] to i64
// CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, i32* [[IN]], i64 4
// CHECK-NEXT:    [[TMP2:%.*]] = load i32, i32* [[ARRAYIDX4]], align 4, !tbaa [[TBAA8]]
// CHECK-NEXT:    [[CONV5:%.*]] = sext i32 [[TMP2]] to i64
// CHECK-NEXT:    [[ARRAYIDX7:%.*]] = getelementptr inbounds i32, i32* [[IN]], i64 16
// CHECK-NEXT:    [[TMP3:%.*]] = load i32, i32* [[ARRAYIDX7]], align 4, !tbaa [[TBAA8]]
// CHECK-NEXT:    [[CONV8:%.*]] = sext i32 [[TMP3]] to i64
// CHECK-NEXT:    [[ARRAYIDX10:%.*]] = getelementptr inbounds i32, i32* [[IN]], i64 25
// CHECK-NEXT:    [[TMP4:%.*]] = load i32, i32* [[ARRAYIDX10]], align 4, !tbaa [[TBAA8]]
// CHECK-NEXT:    [[CONV11:%.*]] = sext i32 [[TMP4]] to i64
// CHECK-NEXT:    [[ARRAYIDX13:%.*]] = getelementptr inbounds i32, i32* [[IN]], i64 36
// CHECK-NEXT:    [[TMP5:%.*]] = load i32, i32* [[ARRAYIDX13]], align 4, !tbaa [[TBAA8]]
// CHECK-NEXT:    [[CONV14:%.*]] = sext i32 [[TMP5]] to i64
// CHECK-NEXT:    [[ARRAYIDX16:%.*]] = getelementptr inbounds i32, i32* [[IN]], i64 49
// CHECK-NEXT:    [[TMP6:%.*]] = load i32, i32* [[ARRAYIDX16]], align 4, !tbaa [[TBAA8]]
// CHECK-NEXT:    [[CONV17:%.*]] = sext i32 [[TMP6]] to i64
// CHECK-NEXT:    [[ARRAYIDX19:%.*]] = getelementptr inbounds i32, i32* [[IN]], i64 64
// CHECK-NEXT:    [[TMP7:%.*]] = load i32, i32* [[ARRAYIDX19]], align 4, !tbaa [[TBAA8]]
// CHECK-NEXT:    [[CONV20:%.*]] = sext i32 [[TMP7]] to i64
// CHECK-NEXT:    [[S_SROA_10_0_INSERT_EXT:%.*]] = zext i64 [[CONV20]] to i512
// CHECK-NEXT:    [[S_SROA_10_0_INSERT_SHIFT:%.*]] = shl nuw i512 [[S_SROA_10_0_INSERT_EXT]], 448
// CHECK-NEXT:    [[S_SROA_9_0_INSERT_EXT:%.*]] = zext i64 [[CONV17]] to i512
// CHECK-NEXT:    [[S_SROA_9_0_INSERT_SHIFT:%.*]] = shl nuw nsw i512 [[S_SROA_9_0_INSERT_EXT]], 384
// CHECK-NEXT:    [[S_SROA_9_0_INSERT_INSERT:%.*]] = or i512 [[S_SROA_10_0_INSERT_SHIFT]], [[S_SROA_9_0_INSERT_SHIFT]]
// CHECK-NEXT:    [[S_SROA_8_0_INSERT_EXT:%.*]] = zext i64 [[CONV14]] to i512
// CHECK-NEXT:    [[S_SROA_8_0_INSERT_SHIFT:%.*]] = shl nuw nsw i512 [[S_SROA_8_0_INSERT_EXT]], 320
// CHECK-NEXT:    [[S_SROA_8_0_INSERT_INSERT:%.*]] = or i512 [[S_SROA_9_0_INSERT_INSERT]], [[S_SROA_8_0_INSERT_SHIFT]]
// CHECK-NEXT:    [[S_SROA_7_0_INSERT_EXT:%.*]] = zext i64 [[CONV11]] to i512
// CHECK-NEXT:    [[S_SROA_7_0_INSERT_SHIFT:%.*]] = shl nuw nsw i512 [[S_SROA_7_0_INSERT_EXT]], 256
// CHECK-NEXT:    [[S_SROA_7_0_INSERT_INSERT:%.*]] = or i512 [[S_SROA_8_0_INSERT_INSERT]], [[S_SROA_7_0_INSERT_SHIFT]]
// CHECK-NEXT:    [[S_SROA_6_0_INSERT_EXT:%.*]] = zext i64 [[CONV8]] to i512
// CHECK-NEXT:    [[S_SROA_6_0_INSERT_SHIFT:%.*]] = shl nuw nsw i512 [[S_SROA_6_0_INSERT_EXT]], 192
// CHECK-NEXT:    [[S_SROA_6_0_INSERT_INSERT:%.*]] = or i512 [[S_SROA_7_0_INSERT_INSERT]], [[S_SROA_6_0_INSERT_SHIFT]]
// CHECK-NEXT:    [[S_SROA_5_0_INSERT_EXT:%.*]] = zext i64 [[CONV5]] to i512
// CHECK-NEXT:    [[S_SROA_5_0_INSERT_SHIFT:%.*]] = shl nuw nsw i512 [[S_SROA_5_0_INSERT_EXT]], 128
// CHECK-NEXT:    [[S_SROA_4_0_INSERT_EXT:%.*]] = zext i64 [[CONV2]] to i512
// CHECK-NEXT:    [[S_SROA_4_0_INSERT_SHIFT:%.*]] = shl nuw nsw i512 [[S_SROA_4_0_INSERT_EXT]], 64
// CHECK-NEXT:    [[S_SROA_4_0_INSERT_MASK:%.*]] = or i512 [[S_SROA_6_0_INSERT_INSERT]], [[S_SROA_5_0_INSERT_SHIFT]]
// CHECK-NEXT:    [[S_SROA_0_0_INSERT_EXT:%.*]] = zext i64 [[CONV]] to i512
// CHECK-NEXT:    [[S_SROA_0_0_INSERT_MASK:%.*]] = or i512 [[S_SROA_4_0_INSERT_MASK]], [[S_SROA_4_0_INSERT_SHIFT]]
// CHECK-NEXT:    [[S_SROA_0_0_INSERT_INSERT:%.*]] = or i512 [[S_SROA_0_0_INSERT_MASK]], [[S_SROA_0_0_INSERT_EXT]]
// CHECK-NEXT:    call void asm sideeffect "st64b $0,[$1]", "r,r,~{memory}"(i512 [[S_SROA_0_0_INSERT_INSERT]], i8* [[ADDR:%.*]]) #[[ATTR1]], !srcloc !12
// CHECK-NEXT:    ret void
//
void store2(int *in, void *addr)
{
    struct foo s = { in[0], in[1], in[4], in[16], in[25], in[36], in[49], in[64] };
    __asm__ volatile ("st64b %0,[%1]" : : "r" (s), "r" (addr) : "memory" );
}
