[CodeGen] Improve ExpandMemCmp for more efficient non-register aligned sizes handling #70469
Conversation
@llvm/pr-subscribers-backend-aarch64 @llvm/pr-subscribers-llvm-analysis

Author: Igor Kirillov (igogo-x86)

Changes

Reapplication of #69942 after fixing a bug.

Patch is 148.94 KiB, truncated to 20.00 KiB below; full version: https://github.com/llvm/llvm-project/pull/70469.diff

5 Files Affected:
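To illustrate the intent (a sketch of mine, not taken from the patch; %p and %q stand for the two memcmp operands and all value names are illustrative): with 6 listed in AllowedTailExpansions, a 6-byte memcmp whose result is used as a full three-way comparison (not just tested against zero) can be expanded in a single block roughly as follows, instead of a 4-byte block and a 2-byte block joined by a conditional branch:

  %lhs     = load i48, ptr %p, align 1
  %rhs     = load i48, ptr %q, align 1
  %lhs.ext = zext i48 %lhs to i64
  %rhs.ext = zext i48 %rhs to i64
  %lhs.be  = call i64 @llvm.bswap.i64(i64 %lhs.ext)   ; byte swap on little-endian targets
  %rhs.be  = call i64 @llvm.bswap.i64(i64 %rhs.ext)
  %gt      = icmp ugt i64 %lhs.be, %rhs.be
  %lt      = icmp ult i64 %lhs.be, %rhs.be
  %gt.i32  = zext i1 %gt to i32
  %lt.i32  = zext i1 %lt to i32
  %res     = sub i32 %gt.i32, %lt.i32                 ; negative, zero, or positive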
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index 5234ef8788d9e96..3ec80d99b392b2e 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -907,6 +907,17 @@ class TargetTransformInfo {
// be done with two 4-byte compares instead of 4+2+1-byte compares. This
// requires all loads in LoadSizes to be doable in an unaligned way.
bool AllowOverlappingLoads = false;
+
+ // Sometimes, the amount of data that needs to be compared is smaller than
+ // the standard register size, but it cannot be loaded with just one load
+ // instruction. For example, if the size of the memory comparison is 6
+ // bytes, we can handle it more efficiently by loading all 6 bytes in a
+ // single block and generating an 8-byte number, instead of generating two
+ // separate blocks with conditional jumps for 4 and 2 byte loads. This
+ // approach simplifies the process and produces the comparison result as
+ // normal. This array lists the allowed sizes of memcmp tails that can be
+ // merged into one block
+ SmallVector<unsigned, 4> AllowedTailExpansions;
};
MemCmpExpansionOptions enableMemCmpExpansion(bool OptSize,
bool IsZeroCmp) const;
diff --git a/llvm/lib/CodeGen/ExpandMemCmp.cpp b/llvm/lib/CodeGen/ExpandMemCmp.cpp
index 911ebd41afc5b91..28e258be226a695 100644
--- a/llvm/lib/CodeGen/ExpandMemCmp.cpp
+++ b/llvm/lib/CodeGen/ExpandMemCmp.cpp
@@ -117,8 +117,8 @@ class MemCmpExpansion {
Value *Lhs = nullptr;
Value *Rhs = nullptr;
};
- LoadPair getLoadPair(Type *LoadSizeType, bool NeedsBSwap, Type *CmpSizeType,
- unsigned OffsetBytes);
+ LoadPair getLoadPair(Type *LoadSizeType, Type *BSwapSizeType,
+ Type *CmpSizeType, unsigned OffsetBytes);
static LoadEntryVector
computeGreedyLoadSequence(uint64_t Size, llvm::ArrayRef<unsigned> LoadSizes,
@@ -128,6 +128,11 @@ class MemCmpExpansion {
unsigned MaxNumLoads,
unsigned &NumLoadsNonOneByte);
+ static void optimiseLoadSequence(
+ LoadEntryVector &LoadSequence,
+ const TargetTransformInfo::MemCmpExpansionOptions &Options,
+ bool IsUsedForZeroCmp);
+
public:
MemCmpExpansion(CallInst *CI, uint64_t Size,
const TargetTransformInfo::MemCmpExpansionOptions &Options,
@@ -210,6 +215,37 @@ MemCmpExpansion::computeOverlappingLoadSequence(uint64_t Size,
return LoadSequence;
}
+void MemCmpExpansion::optimiseLoadSequence(
+ LoadEntryVector &LoadSequence,
+ const TargetTransformInfo::MemCmpExpansionOptions &Options,
+ bool IsUsedForZeroCmp) {
+ // This part of code attempts to optimize the LoadSequence by merging allowed
+ // subsequences into single loads of allowed sizes from
+ // `MemCmpExpansionOptions::AllowedTailExpansions`. If it is for zero
+ // comparison or if no allowed tail expansions are specified, we exit early.
+ if (IsUsedForZeroCmp || Options.AllowedTailExpansions.empty())
+ return;
+
+ while (LoadSequence.size() >= 2) {
+ auto Last = LoadSequence[LoadSequence.size() - 1];
+ auto PreLast = LoadSequence[LoadSequence.size() - 2];
+
+ // Exit the loop if the two sequences are not contiguous
+ if (PreLast.Offset + PreLast.LoadSize != Last.Offset)
+ break;
+
+ auto LoadSize = Last.LoadSize + PreLast.LoadSize;
+ if (find(Options.AllowedTailExpansions, LoadSize) ==
+ Options.AllowedTailExpansions.end())
+ break;
+
+ // Remove the last two sequences and replace with the combined sequence
+ LoadSequence.pop_back();
+ LoadSequence.pop_back();
+ LoadSequence.emplace_back(PreLast.Offset, LoadSize);
+ }
+}
+
// Initialize the basic block structure required for expansion of memcmp call
// with given maximum load size and memcmp size parameter.
// This structure includes:
@@ -255,6 +291,7 @@ MemCmpExpansion::MemCmpExpansion(
}
}
assert(LoadSequence.size() <= Options.MaxNumLoads && "broken invariant");
+ optimiseLoadSequence(LoadSequence, Options, IsUsedForZeroCmp);
}
unsigned MemCmpExpansion::getNumBlocks() {
@@ -278,7 +315,7 @@ void MemCmpExpansion::createResultBlock() {
}
MemCmpExpansion::LoadPair MemCmpExpansion::getLoadPair(Type *LoadSizeType,
- bool NeedsBSwap,
+ Type *BSwapSizeType,
Type *CmpSizeType,
unsigned OffsetBytes) {
// Get the memory source at offset `OffsetBytes`.
@@ -307,16 +344,22 @@ MemCmpExpansion::LoadPair MemCmpExpansion::getLoadPair(Type *LoadSizeType,
if (!Rhs)
Rhs = Builder.CreateAlignedLoad(LoadSizeType, RhsSource, RhsAlign);
+ // Zero extend if Byte Swap intrinsic has different type
+ if (BSwapSizeType && LoadSizeType != BSwapSizeType) {
+ Lhs = Builder.CreateZExt(Lhs, BSwapSizeType);
+ Rhs = Builder.CreateZExt(Rhs, BSwapSizeType);
+ }
+
// Swap bytes if required.
- if (NeedsBSwap) {
- Function *Bswap = Intrinsic::getDeclaration(CI->getModule(),
- Intrinsic::bswap, LoadSizeType);
+ if (BSwapSizeType) {
+ Function *Bswap = Intrinsic::getDeclaration(
+ CI->getModule(), Intrinsic::bswap, BSwapSizeType);
Lhs = Builder.CreateCall(Bswap, Lhs);
Rhs = Builder.CreateCall(Bswap, Rhs);
}
// Zero extend if required.
- if (CmpSizeType != nullptr && CmpSizeType != LoadSizeType) {
+ if (CmpSizeType != nullptr && CmpSizeType != Lhs->getType()) {
Lhs = Builder.CreateZExt(Lhs, CmpSizeType);
Rhs = Builder.CreateZExt(Rhs, CmpSizeType);
}
@@ -332,7 +375,7 @@ void MemCmpExpansion::emitLoadCompareByteBlock(unsigned BlockIndex,
BasicBlock *BB = LoadCmpBlocks[BlockIndex];
Builder.SetInsertPoint(BB);
const LoadPair Loads =
- getLoadPair(Type::getInt8Ty(CI->getContext()), /*NeedsBSwap=*/false,
+ getLoadPair(Type::getInt8Ty(CI->getContext()), nullptr,
Type::getInt32Ty(CI->getContext()), OffsetBytes);
Value *Diff = Builder.CreateSub(Loads.Lhs, Loads.Rhs);
@@ -385,11 +428,12 @@ Value *MemCmpExpansion::getCompareLoadPairs(unsigned BlockIndex,
IntegerType *const MaxLoadType =
NumLoads == 1 ? nullptr
: IntegerType::get(CI->getContext(), MaxLoadSize * 8);
+
for (unsigned i = 0; i < NumLoads; ++i, ++LoadIndex) {
const LoadEntry &CurLoadEntry = LoadSequence[LoadIndex];
const LoadPair Loads = getLoadPair(
- IntegerType::get(CI->getContext(), CurLoadEntry.LoadSize * 8),
- /*NeedsBSwap=*/false, MaxLoadType, CurLoadEntry.Offset);
+ IntegerType::get(CI->getContext(), CurLoadEntry.LoadSize * 8), nullptr,
+ MaxLoadType, CurLoadEntry.Offset);
if (NumLoads != 1) {
// If we have multiple loads per block, we need to generate a composite
@@ -475,14 +519,20 @@ void MemCmpExpansion::emitLoadCompareBlock(unsigned BlockIndex) {
Type *LoadSizeType =
IntegerType::get(CI->getContext(), CurLoadEntry.LoadSize * 8);
- Type *MaxLoadType = IntegerType::get(CI->getContext(), MaxLoadSize * 8);
+ Type *BSwapSizeType =
+ DL.isLittleEndian()
+ ? IntegerType::get(CI->getContext(),
+ PowerOf2Ceil(CurLoadEntry.LoadSize * 8))
+ : nullptr;
+ Type *MaxLoadType = IntegerType::get(
+ CI->getContext(),
+ std::max(MaxLoadSize, (unsigned)PowerOf2Ceil(CurLoadEntry.LoadSize)) * 8);
assert(CurLoadEntry.LoadSize <= MaxLoadSize && "Unexpected load type");
Builder.SetInsertPoint(LoadCmpBlocks[BlockIndex]);
- const LoadPair Loads =
- getLoadPair(LoadSizeType, /*NeedsBSwap=*/DL.isLittleEndian(), MaxLoadType,
- CurLoadEntry.Offset);
+ const LoadPair Loads = getLoadPair(LoadSizeType, BSwapSizeType, MaxLoadType,
+ CurLoadEntry.Offset);
// Add the loaded values to the phi nodes for calculating memcmp result only
// if result is not used in a zero equality.
@@ -587,19 +637,24 @@ Value *MemCmpExpansion::getMemCmpEqZeroOneBlock() {
/// A memcmp expansion that only has one block of load and compare can bypass
/// the compare, branch, and phi IR that is required in the general case.
Value *MemCmpExpansion::getMemCmpOneBlock() {
- Type *LoadSizeType = IntegerType::get(CI->getContext(), Size * 8);
bool NeedsBSwap = DL.isLittleEndian() && Size != 1;
+ Type *LoadSizeType = IntegerType::get(CI->getContext(), Size * 8);
+ Type *BSwapSizeType =
+ NeedsBSwap ? IntegerType::get(CI->getContext(), PowerOf2Ceil(Size * 8))
+ : nullptr;
+ Type *MaxLoadType =
+ IntegerType::get(CI->getContext(),
+ std::max(MaxLoadSize, (unsigned)PowerOf2Ceil(Size)) * 8);
// The i8 and i16 cases don't need compares. We zext the loaded values and
// subtract them to get the suitable negative, zero, or positive i32 result.
- if (Size < 4) {
- const LoadPair Loads =
- getLoadPair(LoadSizeType, NeedsBSwap, Builder.getInt32Ty(),
- /*Offset*/ 0);
+ if (Size == 1 || Size == 2) {
+ const LoadPair Loads = getLoadPair(LoadSizeType, BSwapSizeType,
+ Builder.getInt32Ty(), /*Offset*/ 0);
return Builder.CreateSub(Loads.Lhs, Loads.Rhs);
}
- const LoadPair Loads = getLoadPair(LoadSizeType, NeedsBSwap, LoadSizeType,
+ const LoadPair Loads = getLoadPair(LoadSizeType, BSwapSizeType, MaxLoadType,
/*Offset*/ 0);
// The result of memcmp is negative, zero, or positive, so produce that by
// subtracting 2 extended compare bits: sub (ugt, ult).
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 5f2d09f0765aa38..1d9dcfc4e9f446c 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -2961,6 +2961,7 @@ AArch64TTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const {
// they may wake up the FP unit, which raises the power consumption. Perhaps
// they could be used with no holds barred (-O3).
Options.LoadSizes = {8, 4, 2, 1};
+ Options.AllowedTailExpansions = {3, 5, 6};
return Options;
}
diff --git a/llvm/test/CodeGen/AArch64/memcmp.ll b/llvm/test/CodeGen/AArch64/memcmp.ll
new file mode 100644
index 000000000000000..d13a416a28761ca
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/memcmp.ll
@@ -0,0 +1,3005 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
+; RUN: llc < %s -mtriple=aarch64-unknown-unknown | FileCheck %s
+
+@.str = private constant [513 x i8] c"01234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901\00", align 1
+
+declare dso_local i32 @memcmp(ptr, ptr, i64)
+
+define i32 @length0(ptr %X, ptr %Y) nounwind {
+; CHECK-LABEL: length0:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov w0, wzr
+; CHECK-NEXT: ret
+ %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 0) nounwind
+ ret i32 %m
+ }
+
+define i1 @length0_eq(ptr %X, ptr %Y) nounwind {
+; CHECK-LABEL: length0_eq:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov w0, #1 // =0x1
+; CHECK-NEXT: ret
+ %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 0) nounwind
+ %c = icmp eq i32 %m, 0
+ ret i1 %c
+}
+
+define i1 @length0_lt(ptr %X, ptr %Y) nounwind {
+; CHECK-LABEL: length0_lt:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov w0, wzr
+; CHECK-NEXT: ret
+ %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 0) nounwind
+ %c = icmp slt i32 %m, 0
+ ret i1 %c
+}
+
+define i32 @length2(ptr %X, ptr %Y) nounwind {
+; CHECK-LABEL: length2:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldrh w8, [x0]
+; CHECK-NEXT: ldrh w9, [x1]
+; CHECK-NEXT: rev w8, w8
+; CHECK-NEXT: rev w9, w9
+; CHECK-NEXT: lsr w8, w8, #16
+; CHECK-NEXT: sub w0, w8, w9, lsr #16
+; CHECK-NEXT: ret
+ %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 2) nounwind
+ ret i32 %m
+}
+
+define i32 @length2_const(ptr %X, ptr %Y) nounwind {
+; CHECK-LABEL: length2_const:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldrh w9, [x0]
+; CHECK-NEXT: mov w8, #-12594 // =0xffffcece
+; CHECK-NEXT: rev w9, w9
+; CHECK-NEXT: add w0, w8, w9, lsr #16
+; CHECK-NEXT: ret
+ %m = tail call i32 @memcmp(ptr %X, ptr getelementptr inbounds ([513 x i8], ptr @.str, i32 0, i32 1), i64 2) nounwind
+ ret i32 %m
+}
+
+define i1 @length2_gt_const(ptr %X, ptr %Y) nounwind {
+; CHECK-LABEL: length2_gt_const:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldrh w9, [x0]
+; CHECK-NEXT: mov w8, #-12594 // =0xffffcece
+; CHECK-NEXT: rev w9, w9
+; CHECK-NEXT: add w8, w8, w9, lsr #16
+; CHECK-NEXT: cmp w8, #0
+; CHECK-NEXT: cset w0, gt
+; CHECK-NEXT: ret
+ %m = tail call i32 @memcmp(ptr %X, ptr getelementptr inbounds ([513 x i8], ptr @.str, i32 0, i32 1), i64 2) nounwind
+ %c = icmp sgt i32 %m, 0
+ ret i1 %c
+}
+
+define i1 @length2_eq(ptr %X, ptr %Y) nounwind {
+; CHECK-LABEL: length2_eq:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldrh w8, [x0]
+; CHECK-NEXT: ldrh w9, [x1]
+; CHECK-NEXT: cmp w8, w9
+; CHECK-NEXT: cset w0, eq
+; CHECK-NEXT: ret
+ %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 2) nounwind
+ %c = icmp eq i32 %m, 0
+ ret i1 %c
+}
+
+define i1 @length2_lt(ptr %X, ptr %Y) nounwind {
+; CHECK-LABEL: length2_lt:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldrh w8, [x0]
+; CHECK-NEXT: ldrh w9, [x1]
+; CHECK-NEXT: rev w8, w8
+; CHECK-NEXT: rev w9, w9
+; CHECK-NEXT: lsr w8, w8, #16
+; CHECK-NEXT: sub w8, w8, w9, lsr #16
+; CHECK-NEXT: lsr w0, w8, #31
+; CHECK-NEXT: ret
+ %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 2) nounwind
+ %c = icmp slt i32 %m, 0
+ ret i1 %c
+}
+
+define i1 @length2_gt(ptr %X, ptr %Y) nounwind {
+; CHECK-LABEL: length2_gt:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldrh w8, [x0]
+; CHECK-NEXT: ldrh w9, [x1]
+; CHECK-NEXT: rev w8, w8
+; CHECK-NEXT: rev w9, w9
+; CHECK-NEXT: lsr w8, w8, #16
+; CHECK-NEXT: sub w8, w8, w9, lsr #16
+; CHECK-NEXT: cmp w8, #0
+; CHECK-NEXT: cset w0, gt
+; CHECK-NEXT: ret
+ %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 2) nounwind
+ %c = icmp sgt i32 %m, 0
+ ret i1 %c
+}
+
+define i1 @length2_eq_const(ptr %X) nounwind {
+; CHECK-LABEL: length2_eq_const:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldrh w8, [x0]
+; CHECK-NEXT: mov w9, #12849 // =0x3231
+; CHECK-NEXT: cmp w8, w9
+; CHECK-NEXT: cset w0, ne
+; CHECK-NEXT: ret
+ %m = tail call i32 @memcmp(ptr %X, ptr getelementptr inbounds ([513 x i8], ptr @.str, i32 0, i32 1), i64 2) nounwind
+ %c = icmp ne i32 %m, 0
+ ret i1 %c
+}
+
+define i1 @length2_eq_nobuiltin_attr(ptr %X, ptr %Y) nounwind {
+; CHECK-LABEL: length2_eq_nobuiltin_attr:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: mov w2, #2 // =0x2
+; CHECK-NEXT: bl memcmp
+; CHECK-NEXT: cmp w0, #0
+; CHECK-NEXT: cset w0, eq
+; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 2) nounwind nobuiltin
+ %c = icmp eq i32 %m, 0
+ ret i1 %c
+}
+
+define i32 @length3(ptr %X, ptr %Y) nounwind {
+; CHECK-LABEL: length3:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldrb w8, [x0, #2]
+; CHECK-NEXT: ldrh w9, [x0]
+; CHECK-NEXT: ldrb w10, [x1, #2]
+; CHECK-NEXT: ldrh w11, [x1]
+; CHECK-NEXT: orr w8, w9, w8, lsl #16
+; CHECK-NEXT: orr w9, w11, w10, lsl #16
+; CHECK-NEXT: rev w8, w8
+; CHECK-NEXT: rev w9, w9
+; CHECK-NEXT: cmp w8, w9
+; CHECK-NEXT: cset w8, hi
+; CHECK-NEXT: cset w9, lo
+; CHECK-NEXT: sub w0, w8, w9
+; CHECK-NEXT: ret
+ %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 3) nounwind
+ ret i32 %m
+}
+
+define i1 @length3_eq(ptr %X, ptr %Y) nounwind {
+; CHECK-LABEL: length3_eq:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldrh w8, [x0]
+; CHECK-NEXT: ldrh w9, [x1]
+; CHECK-NEXT: ldrb w10, [x0, #2]
+; CHECK-NEXT: ldrb w11, [x1, #2]
+; CHECK-NEXT: cmp w8, w9
+; CHECK-NEXT: ccmp w10, w11, #0, eq
+; CHECK-NEXT: cset w0, ne
+; CHECK-NEXT: ret
+ %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 3) nounwind
+ %c = icmp ne i32 %m, 0
+ ret i1 %c
+}
+
+define i32 @length4(ptr %X, ptr %Y) nounwind {
+; CHECK-LABEL: length4:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr w8, [x0]
+; CHECK-NEXT: ldr w9, [x1]
+; CHECK-NEXT: rev w8, w8
+; CHECK-NEXT: rev w9, w9
+; CHECK-NEXT: cmp w8, w9
+; CHECK-NEXT: cset w8, hi
+; CHECK-NEXT: cset w9, lo
+; CHECK-NEXT: sub w0, w8, w9
+; CHECK-NEXT: ret
+ %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 4) nounwind
+ ret i32 %m
+}
+
+define i1 @length4_eq(ptr %X, ptr %Y) nounwind {
+; CHECK-LABEL: length4_eq:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr w8, [x0]
+; CHECK-NEXT: ldr w9, [x1]
+; CHECK-NEXT: cmp w8, w9
+; CHECK-NEXT: cset w0, ne
+; CHECK-NEXT: ret
+ %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 4) nounwind
+ %c = icmp ne i32 %m, 0
+ ret i1 %c
+}
+
+define i1 @length4_lt(ptr %X, ptr %Y) nounwind {
+; CHECK-LABEL: length4_lt:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr w8, [x0]
+; CHECK-NEXT: ldr w9, [x1]
+; CHECK-NEXT: rev w8, w8
+; CHECK-NEXT: rev w9, w9
+; CHECK-NEXT: cmp w8, w9
+; CHECK-NEXT: cset w8, hi
+; CHECK-NEXT: cset w9, lo
+; CHECK-NEXT: sub w8, w8, w9
+; CHECK-NEXT: lsr w0, w8, #31
+; CHECK-NEXT: ret
+ %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 4) nounwind
+ %c = icmp slt i32 %m, 0
+ ret i1 %c
+}
+
+define i1 @length4_gt(ptr %X, ptr %Y) nounwind {
+; CHECK-LABEL: length4_gt:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr w8, [x0]
+; CHECK-NEXT: ldr w9, [x1]
+; CHECK-NEXT: rev w8, w8
+; CHECK-NEXT: rev w9, w9
+; CHECK-NEXT: cmp w8, w9
+; CHECK-NEXT: cset w8, hi
+; CHECK-NEXT: cset w9, lo
+; CHECK-NEXT: sub w8, w8, w9
+; CHECK-NEXT: cmp w8, #0
+; CHECK-NEXT: cset w0, gt
+; CHECK-NEXT: ret
+ %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 4) nounwind
+ %c = icmp sgt i32 %m, 0
+ ret i1 %c
+}
+
+define i1 @length4_eq_const(ptr %X) nounwind {
+; CHECK-LABEL: length4_eq_const:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr w8, [x0]
+; CHECK-NEXT: mov w9, #12849 // =0x3231
+; CHECK-NEXT: movk w9, #13363, lsl #16
+; CHECK-NEXT: cmp w8, w9
+; CHECK-NEXT: cset w0, eq
+; CHECK-NEXT: ret
+ %m = tail call i32 @memcmp(ptr %X, ptr getelementptr inbounds ([513 x i8], ptr @.str, i32 0, i32 1), i64 4) nounwind
+ %c = icmp eq i32 %m, 0
+ ret i1 %c
+}
+
+define i32 @length5(ptr %X, ptr %Y) nounwind {
+; CHECK-LABEL: length5:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldrb w8, [x0, #4]
+; CHECK-NEXT: ldr w9, [x0]
+; CHECK-NEXT: ldrb w10, [x1, #4]
+; CHECK-NEXT: ldr w11, [x1]
+; CHECK-NEXT: orr x8, x9, x8, lsl #32
+; CHECK-NEXT: orr x9, x11, x10, lsl #32
+; CHECK-NEXT: rev x8, x8
+; CHECK-NEXT: rev x9, x9
+; CHECK-NEXT: cmp x8, x9
+; CHECK-NEXT: cset w8, hi
+; CHECK-NEXT: cset w9, lo
+; CHECK-NEXT: sub w0, w8, w9
+; CHECK-NEXT: ret
+ %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 5) nounwind
+ ret i32 %m
+}
+
+define i1 @length5_eq(ptr %X, ptr %Y) nounwind {
+; CHECK-LABEL: length5_eq:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr w8, [x0]
+; CHECK-NEXT: ldr w9, [x1]
+; CHECK-NEXT: ldrb w10, [x0, #4]
+; CHECK-NEXT: ldrb w11, [x1, #4]
+; CHECK-NEXT: cmp w8, w9
+; CHECK-NEXT: ccmp w10, w11, #0, eq
+; CHECK-NEXT: cset w0, ne
+; CHECK-NEXT: ret
+ %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 5) nounwind
+ %c = icmp ne i32 %m, 0
+ ret i1 %c
+}
+
+define i1 @length5_lt(ptr %X, ptr %Y) nounwind {
+; CHECK-LABEL: length5_lt:
+; CHECK: // %bb.0:
+; CHECK-NE...
[truncated]
LGTM! The fix for the previous merge issue looks sensible to me.
[CodeGen] Improve ExpandMemCmp for more efficient non-register aligned sizes handling
* Enhanced the logic of the ExpandMemCmp pass to merge contiguous subsequences in LoadSequence, based on the sizes allowed in `AllowedTailExpansions`.
* This enhancement seeks to minimize the number of basic blocks and produce optimized code when using memcmp with non-register aligned sizes.
* Enable this feature for AArch64 with memcmp sizes modulo 8 equal to 3, 5, and 6.
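To make the merging concrete (a worked example of mine, not text from the commit): with LoadSizes = {8, 4, 2, 1} and AllowedTailExpansions = {3, 5, 6}, a 7-byte memcmp first gets the greedy load sequence

  (offset 0, size 4), (offset 4, size 2), (offset 6, size 1)

and optimiseLoadSequence folds the contiguous 2-byte and 1-byte tail into a single 3-byte entry:

  (offset 0, size 4), (offset 4, size 3)

One load/compare block and its conditional branch disappear; the remaining 4+3 pair is not merged further because 7 is not in AllowedTailExpansions.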
This optimization was introduced by #70469. Like AArch64, we allow tail expansions for 3 on RV32 and 3/5/6 on RV64. This can simplify the comparison and reduce the number of blocks.
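As with the AArch64 case sketched above, a 3-byte tail under this scheme needs only one load per operand: the i24 value is widened to i32 for the byte swap, and the ugt/ult compare and subtract follow as before (a hedged sketch, not taken from the RISC-V change; names are illustrative):

  %l     = load i24, ptr %p, align 1
  %r     = load i24, ptr %q, align 1
  %l.ext = zext i24 %l to i32
  %r.ext = zext i24 %r to i32
  %l.be  = call i32 @llvm.bswap.i32(i32 %l.ext)
  %r.be  = call i32 @llvm.bswap.i32(i32 %r.ext)

Restricting RV32 to the 3-byte tail presumably keeps the widened value within a single 32-bit register, whereas the 5- and 6-byte tails widen to i64.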