Skip to content

[Clang][AArch64][ARM]: Fix Inefficient loads/stores of _BitInt(N) #93495

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions clang/include/clang/Basic/TargetInfo.h
Original file line number Diff line number Diff line change
Expand Up @@ -667,6 +667,11 @@ class TargetInfo : public TransferrableTargetInfo,
return false;
}

// Different targets may support different machine type width for the _BitInt
virtual unsigned getBitIntLegalWidth(unsigned Width) const { return Width; }

virtual bool isBitIntSignExtended(bool IsSigned) const { return false; }

// Different targets may support a different maximum width for the _BitInt
// type, depending on what operations are supported.
virtual size_t getMaxBitIntWidth() const {
Expand Down
4 changes: 4 additions & 0 deletions clang/lib/Basic/Targets/AArch64.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -221,6 +221,10 @@ bool AArch64TargetInfo::validateTarget(DiagnosticsEngine &Diags) const {
return true;
}

unsigned AArch64TargetInfo::getBitIntLegalWidth(unsigned Width) const {
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This function is likely unnecessary (also it's incorrect in a couple of ways[1]). I think you should be using instead ASTConext::getTypeInfo(T).Width.

[1] Representation might be in less than 32-bits (could be also 8 or 16) and _BitInt(N), N > 128 is not N bits wide, it's in multiples of i128.

return getBitIntWidth(Width);
}

bool AArch64TargetInfo::validateBranchProtection(StringRef Spec, StringRef,
BranchProtectionInfo &BPI,
StringRef &Err) const {
Expand Down
2 changes: 2 additions & 0 deletions clang/lib/Basic/Targets/AArch64.h
Original file line number Diff line number Diff line change
Expand Up @@ -202,6 +202,8 @@ class LLVM_LIBRARY_VISIBILITY AArch64TargetInfo : public TargetInfo {
bool hasBitIntType() const override { return true; }

bool validateTarget(DiagnosticsEngine &Diags) const override;

unsigned getBitIntLegalWidth(unsigned Width) const override;
};

class LLVM_LIBRARY_VISIBILITY AArch64leTargetInfo : public AArch64TargetInfo {
Expand Down
10 changes: 10 additions & 0 deletions clang/lib/Basic/Targets/ARM.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1344,6 +1344,16 @@ int ARMTargetInfo::getEHDataRegisterNumber(unsigned RegNo) const {

bool ARMTargetInfo::hasSjLjLowering() const { return true; }

unsigned ARMTargetInfo::getBitIntLegalWidth(unsigned Width) const {
return getBitIntWidth(Width);
}

bool ARMTargetInfo::isBitIntSignExtended(bool IsSigned) const {
if (IsSigned)
return true;
return false;
}

ARMleTargetInfo::ARMleTargetInfo(const llvm::Triple &Triple,
const TargetOptions &Opts)
: ARMTargetInfo(Triple, Opts) {}
Expand Down
4 changes: 4 additions & 0 deletions clang/lib/Basic/Targets/ARM.h
Original file line number Diff line number Diff line change
Expand Up @@ -229,6 +229,10 @@ class LLVM_LIBRARY_VISIBILITY ARMTargetInfo : public TargetInfo {
std::pair<unsigned, unsigned> hardwareInterferenceSizes() const override {
return std::make_pair(getTriple().isArch64Bit() ? 256 : 64, 64);
}

unsigned getBitIntLegalWidth(unsigned Width) const override;

bool isBitIntSignExtended(bool IsSigned) const override;
};

class LLVM_LIBRARY_VISIBILITY ARMleTargetInfo : public ARMTargetInfo {
Expand Down
17 changes: 16 additions & 1 deletion clang/lib/CodeGen/CGExpr.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1989,7 +1989,14 @@ llvm::Value *CodeGenFunction::EmitLoadOfScalar(Address Addr, bool Volatile,
return EmitAtomicLoad(AtomicLValue, Loc).getScalarVal();
}

llvm::LoadInst *Load = Builder.CreateLoad(Addr, Volatile);
llvm::LoadInst *Load = nullptr;
if (const auto *BitintTy = Ty->getAs<BitIntType>()) {
Address TempAddr(Addr.getBasePointer(), ConvertTypeForMem(Ty),
Addr.getAlignment());
Load = Builder.CreateLoad(TempAddr, Volatile);
} else
Load = Builder.CreateLoad(Addr, Volatile);

if (isNontemporal) {
llvm::MDNode *Node = llvm::MDNode::get(
Load->getContext(), llvm::ConstantAsMetadata::get(Builder.getInt32(1)));
Expand Down Expand Up @@ -2021,6 +2028,12 @@ llvm::Value *CodeGenFunction::EmitToMemory(llvm::Value *Value, QualType Ty) {
assert(Value->getType()->isIntegerTy(getContext().getTypeSize(Ty)) &&
"wrong value rep of bool");
}
if (auto *BitIntTy = Ty->getAs<BitIntType>()) {
if (CGM.getTarget().isBitIntSignExtended(BitIntTy->isSigned()))
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We might be introducing changes that are not desirable (or correct) for non-Arm targets.
Instead, I would suggest to add a description of the in-memory padding for the _BitInt types (e.g. via a member function in TargetInfo).
One reasonable approach is lilke this:

enum class TargetBitInitPaddingKind {
    None,
    ZeroOrSignExtend,
    AnyExtend
};

where

  • None will be the default and will result in identical code as the one that Clang generates now, i.e. no sext or zext, load/stores use LLVM type iN for _BitInt(N).
  • ZeroOrSignExtend would mean in-memory representation is padded with 0 for unsigned _BitInt(N) and with the sign bit for signed _BitInt(N). This will be the value for AArch32
  • AnyExtend would mean in-memory representation is padded with unspecified bits. This will be the value for AArch64. Since AFAIK we don't have such an operation in LLVM IR, one way to implement this would be identically to ZeroOrSignExtend or, alternatively, do zero-extend regardless of the signedness of the _BitInt(N) type.

return Builder.CreateSExt(Value, ConvertTypeForMem(Ty), "sext_bitint");
else
return Builder.CreateZExt(Value, ConvertTypeForMem(Ty), "zext_bitint");
}

return Value;
}
Expand All @@ -2043,6 +2056,8 @@ llvm::Value *CodeGenFunction::EmitFromMemory(llvm::Value *Value, QualType Ty) {
unsigned ValNumElems = cast<llvm::FixedVectorType>(ValTy)->getNumElements();
return emitBoolVecConversion(V, ValNumElems, "extractvec");
}
if (Ty->getAs<BitIntType>())
return Builder.CreateTrunc(Value, ConvertType(Ty), "to_bitint");

return Value;
}
Expand Down
4 changes: 4 additions & 0 deletions clang/lib/CodeGen/CodeGenTypes.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -114,6 +114,10 @@ llvm::Type *CodeGenTypes::ConvertTypeForMem(QualType T, bool ForBitField) {
return llvm::IntegerType::get(getLLVMContext(),
(unsigned)Context.getTypeSize(T));

if (T->isBitIntType())
return llvm::IntegerType::get(getLLVMContext(),
CGM.getTarget().getBitIntLegalWidth(
T->getAs<BitIntType>()->getNumBits()));
// Else, don't map it.
return R;
}
Expand Down
35 changes: 35 additions & 0 deletions clang/test/CodeGen/AArch64/BitInt.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
// RUN: %clang_cc1 -triple aarch64 -emit-llvm -o - %s | FileCheck %s
// RUN: %clang_cc1 -triple aarch64 -S -o /dev/null %s

_BitInt(18) signed_src;
_BitInt(18) signed_dst;

// CHECK-LABEL: define dso_local void @test_signed(
// CHECK-SAME: ) #[[ATTR0:[0-9]+]] {
// CHECK-NEXT: [[ENTRY:.*:]]
// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr @signed_dst, align 4
// CHECK-NEXT: [[TOBITINT:%.*]] = trunc i32 [[TMP0]] to i18
// CHECK-NEXT: [[ZEXTBITINT:%.*]] = zext i18 [[TOBITINT]] to i32
// CHECK-NEXT: store i32 [[ZEXTBITINT]], ptr @signed_src, align 4
// CHECK-NEXT: ret void
//
void test_signed() {
signed_src = signed_dst;
}

unsigned _BitInt(18) src;
unsigned _BitInt(18) dst;

// CHECK-LABEL: define dso_local void @test_unsigned(
// CHECK-SAME: ) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr @dst, align 4
// CHECK-NEXT: [[TOBITINT:%.*]] = trunc i32 [[TMP0]] to i18
// CHECK-NEXT: [[ZEXTBITINT:%.*]] = zext i18 [[TOBITINT]] to i32
// CHECK-NEXT: store i32 [[ZEXTBITINT]], ptr @src, align 4
// CHECK-NEXT: ret void
//
void test_unsigned() {
src = dst;
}
36 changes: 36 additions & 0 deletions clang/test/CodeGen/Arm/BitInt.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4
// RUN: %clang_cc1 -triple arm -emit-llvm -o - %s | FileCheck %s
// RUN: %clang_cc1 -triple arm -S -o /dev/null %s

_BitInt(18) signed_src;
_BitInt(18) signed_dst;

// CHECK-LABEL: define dso_local arm_aapcscc void @test_signed(
// CHECK-SAME: ) #[[ATTR0:[0-9]+]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr @signed_dst, align 4
// CHECK-NEXT: [[TOBITINT:%.*]] = trunc i32 [[TMP0]] to i18
// CHECK-NEXT: [[SEXTBITINT:%.*]] = sext i18 [[TOBITINT]] to i32
// CHECK-NEXT: store i32 [[SEXTBITINT]], ptr @signed_src, align 4
// CHECK-NEXT: ret void
//
void test_signed() {
signed_src = signed_dst;
}

unsigned _BitInt(18) src;
unsigned _BitInt(18) dst;

// CHECK-LABEL: define dso_local arm_aapcscc void @test_unsigned(
// CHECK-SAME: ) #[[ATTR0]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr @dst, align 4
// CHECK-NEXT: [[TOBITINT:%.*]] = trunc i32 [[TMP0]] to i18
// CHECK-NEXT: [[ZEXTBITINT:%.*]] = zext i18 [[TOBITINT]] to i32
// CHECK-NEXT: store i32 [[ZEXTBITINT]], ptr @src, align 4
// CHECK-NEXT: ret void
//
void test_unsigned() {
src = dst;
}

6 changes: 3 additions & 3 deletions clang/test/CodeGen/attr-noundef.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -159,9 +159,9 @@ void pass_large_BitInt(_BitInt(127) e) {

// TODO: for now, ExtInt is only noundef if it is sign/zero-extended
// CHECK-INTEL: [[DEF]] noundef signext i3 @{{.*}}ret_BitInt{{.*}}()
// CHECK-AARCH: [[DEF]] i3 @{{.*}}ret_BitInt{{.*}}()
// CHECK-AARCH: [[DEF]] noundef i3 @{{.*}}ret_BitInt{{.*}}()
// CHECK-INTEL: [[DEF]] void @{{.*}}pass_BitInt{{.*}}(i3 noundef signext %
// CHECK-AARCH: [[DEF]] void @{{.*}}pass_BitInt{{.*}}(i3 %
// CHECK-AARCH: [[DEF]] void @{{.*}}pass_BitInt{{.*}}(i3 noundef %
// CHECK-INTEL: [[DEF]] void @{{.*}}pass_large_BitInt{{.*}}(i64 %{{.*}}, i64 %
// CHECK-AARCH: [[DEF]] void @{{.*}}pass_large_BitInt{{.*}}(i127 %
// CHECK-AARCH: [[DEF]] void @{{.*}}pass_large_BitInt{{.*}}(i127 noundef %
} // namespace check_exotic
54 changes: 30 additions & 24 deletions clang/test/CodeGen/builtins-bitint.c
Original file line number Diff line number Diff line change
Expand Up @@ -8,10 +8,11 @@
// CHECK-O0-LABEL: define dso_local arm_aapcscc i32 @test_popcountg_ubi1(
// CHECK-O0-SAME: ) #[[ATTR0:[0-9]+]] {
// CHECK-O0-NEXT: entry:
// CHECK-O0-NEXT: [[A:%.*]] = alloca i1, align 1
// CHECK-O0-NEXT: store i1 true, ptr [[A]], align 1
// CHECK-O0-NEXT: [[TMP0:%.*]] = load i1, ptr [[A]], align 1
// CHECK-O0-NEXT: [[TMP1:%.*]] = call i1 @llvm.ctpop.i1(i1 [[TMP0]])
// CHECK-O0-NEXT: [[A:%.*]] = alloca i8, align 1
// CHECK-O0-NEXT: store i8 1, ptr [[A]], align 1
// CHECK-O0-NEXT: [[TMP0:%.*]] = load i8, ptr [[A]], align 1
// CHECK-O0-NEXT: [[TO_BITINT:%.*]] = trunc i8 [[TMP0]] to i1
// CHECK-O0-NEXT: [[TMP1:%.*]] = call i1 @llvm.ctpop.i1(i1 [[TO_BITINT]])
// CHECK-O0-NEXT: [[CAST:%.*]] = zext i1 [[TMP1]] to i32
// CHECK-O0-NEXT: ret i32 [[CAST]]
//
Expand All @@ -28,10 +29,11 @@ int test_popcountg_ubi1() {
// CHECK-O0-LABEL: define dso_local arm_aapcscc i32 @test_popcountg_ubi2(
// CHECK-O0-SAME: ) #[[ATTR0]] {
// CHECK-O0-NEXT: entry:
// CHECK-O0-NEXT: [[A:%.*]] = alloca i2, align 1
// CHECK-O0-NEXT: store i2 -1, ptr [[A]], align 1
// CHECK-O0-NEXT: [[TMP0:%.*]] = load i2, ptr [[A]], align 1
// CHECK-O0-NEXT: [[TMP1:%.*]] = call i2 @llvm.ctpop.i2(i2 [[TMP0]])
// CHECK-O0-NEXT: [[A:%.*]] = alloca i8, align 1
// CHECK-O0-NEXT: store i8 3, ptr [[A]], align 1
// CHECK-O0-NEXT: [[TMP0:%.*]] = load i8, ptr [[A]], align 1
// CHECK-O0-NEXT: [[TO_BITINT:%.*]] = trunc i8 [[TMP0]] to i2
// CHECK-O0-NEXT: [[TMP1:%.*]] = call i2 @llvm.ctpop.i2(i2 [[TO_BITINT]])
// CHECK-O0-NEXT: [[CAST:%.*]] = zext i2 [[TMP1]] to i32
// CHECK-O0-NEXT: ret i32 [[CAST]]
//
Expand All @@ -48,10 +50,11 @@ int test_popcountg_ubi2() {
// CHECK-O0-LABEL: define dso_local arm_aapcscc i32 @test_ctzg_ubi1(
// CHECK-O0-SAME: ) #[[ATTR0]] {
// CHECK-O0-NEXT: entry:
// CHECK-O0-NEXT: [[A:%.*]] = alloca i1, align 1
// CHECK-O0-NEXT: store i1 false, ptr [[A]], align 1
// CHECK-O0-NEXT: [[TMP0:%.*]] = load i1, ptr [[A]], align 1
// CHECK-O0-NEXT: [[TMP1:%.*]] = call i1 @llvm.cttz.i1(i1 [[TMP0]], i1 false)
// CHECK-O0-NEXT: [[A:%.*]] = alloca i8, align 1
// CHECK-O0-NEXT: store i8 0, ptr [[A]], align 1
// CHECK-O0-NEXT: [[TMP0:%.*]] = load i8, ptr [[A]], align 1
// CHECK-O0-NEXT: [[TO_BITINT:%.*]] = trunc i8 [[TMP0]] to i1
// CHECK-O0-NEXT: [[TMP1:%.*]] = call i1 @llvm.cttz.i1(i1 [[TO_BITINT]], i1 false)
// CHECK-O0-NEXT: [[CAST:%.*]] = zext i1 [[TMP1]] to i32
// CHECK-O0-NEXT: ret i32 [[CAST]]
//
Expand All @@ -68,10 +71,11 @@ int test_ctzg_ubi1() {
// CHECK-O0-LABEL: define dso_local arm_aapcscc i32 @test_ctzg_ubi2(
// CHECK-O0-SAME: ) #[[ATTR0]] {
// CHECK-O0-NEXT: entry:
// CHECK-O0-NEXT: [[A:%.*]] = alloca i2, align 1
// CHECK-O0-NEXT: store i2 0, ptr [[A]], align 1
// CHECK-O0-NEXT: [[TMP0:%.*]] = load i2, ptr [[A]], align 1
// CHECK-O0-NEXT: [[TMP1:%.*]] = call i2 @llvm.cttz.i2(i2 [[TMP0]], i1 false)
// CHECK-O0-NEXT: [[A:%.*]] = alloca i8, align 1
// CHECK-O0-NEXT: store i8 0, ptr [[A]], align 1
// CHECK-O0-NEXT: [[TMP0:%.*]] = load i8, ptr [[A]], align 1
// CHECK-O0-NEXT: [[TO_BITINT:%.*]] = trunc i8 [[TMP0]] to i2
// CHECK-O0-NEXT: [[TMP1:%.*]] = call i2 @llvm.cttz.i2(i2 [[TO_BITINT]], i1 false)
// CHECK-O0-NEXT: [[CAST:%.*]] = zext i2 [[TMP1]] to i32
// CHECK-O0-NEXT: ret i32 [[CAST]]
//
Expand All @@ -88,10 +92,11 @@ int test_ctzg_ubi2() {
// CHECK-O0-LABEL: define dso_local arm_aapcscc i32 @test_clzg_ubi1(
// CHECK-O0-SAME: ) #[[ATTR0]] {
// CHECK-O0-NEXT: entry:
// CHECK-O0-NEXT: [[A:%.*]] = alloca i1, align 1
// CHECK-O0-NEXT: store i1 false, ptr [[A]], align 1
// CHECK-O0-NEXT: [[TMP0:%.*]] = load i1, ptr [[A]], align 1
// CHECK-O0-NEXT: [[TMP1:%.*]] = call i1 @llvm.ctlz.i1(i1 [[TMP0]], i1 false)
// CHECK-O0-NEXT: [[A:%.*]] = alloca i8, align 1
// CHECK-O0-NEXT: store i8 0, ptr [[A]], align 1
// CHECK-O0-NEXT: [[TMP0:%.*]] = load i8, ptr [[A]], align 1
// CHECK-O0-NEXT: [[TO_BITINT:%.*]] = trunc i8 [[TMP0]] to i1
// CHECK-O0-NEXT: [[TMP1:%.*]] = call i1 @llvm.ctlz.i1(i1 [[TO_BITINT]], i1 false)
// CHECK-O0-NEXT: [[CAST:%.*]] = zext i1 [[TMP1]] to i32
// CHECK-O0-NEXT: ret i32 [[CAST]]
//
Expand All @@ -108,10 +113,11 @@ int test_clzg_ubi1() {
// CHECK-O0-LABEL: define dso_local arm_aapcscc i32 @test_clzg_ubi2(
// CHECK-O0-SAME: ) #[[ATTR0]] {
// CHECK-O0-NEXT: entry:
// CHECK-O0-NEXT: [[A:%.*]] = alloca i2, align 1
// CHECK-O0-NEXT: store i2 0, ptr [[A]], align 1
// CHECK-O0-NEXT: [[TMP0:%.*]] = load i2, ptr [[A]], align 1
// CHECK-O0-NEXT: [[TMP1:%.*]] = call i2 @llvm.ctlz.i2(i2 [[TMP0]], i1 false)
// CHECK-O0-NEXT: [[A:%.*]] = alloca i8, align 1
// CHECK-O0-NEXT: store i8 0, ptr [[A]], align 1
// CHECK-O0-NEXT: [[TMP0:%.*]] = load i8, ptr [[A]], align 1
// CHECK-O0-NEXT: [[TO_BITINT:%.*]] = trunc i8 [[TMP0]] to i2
// CHECK-O0-NEXT: [[TMP1:%.*]] = call i2 @llvm.ctlz.i2(i2 [[TO_BITINT]], i1 false)
// CHECK-O0-NEXT: [[CAST:%.*]] = zext i2 [[TMP1]] to i32
// CHECK-O0-NEXT: ret i32 [[CAST]]
//
Expand Down
Loading