Skip to content

Commit deb429e

Browse files
committed
Revert "[CodeGen] Improve ExpandMemCmp for more efficient non-register aligned sizes handling (#69942)"
This reverts commit 9bcb30d.
1 parent 396b562 commit deb429e

File tree

5 files changed

+19
-3965
lines changed

5 files changed

+19
-3965
lines changed

llvm/include/llvm/Analysis/TargetTransformInfo.h

Lines changed: 0 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -907,17 +907,6 @@ class TargetTransformInfo {
907907
// be done with two 4-byte compares instead of 4+2+1-byte compares. This
908908
// requires all loads in LoadSizes to be doable in an unaligned way.
909909
bool AllowOverlappingLoads = false;
910-
911-
// Sometimes, the amount of data that needs to be compared is smaller than
912-
// the standard register size, but it cannot be loaded with just one load
913-
// instruction. For example, if the size of the memory comparison is 6
914-
// bytes, we can handle it more efficiently by loading all 6 bytes in a
915-
// single block and generating an 8-byte number, instead of generating two
916-
// separate blocks with conditional jumps for 4 and 2 byte loads. This
917-
// approach simplifies the process and produces the comparison result as
918-
// normal. This array lists the allowed sizes of memcmp tails that can be
919-
// merged into one block
920-
SmallVector<unsigned, 4> AllowedTailExpansions;
921910
};
922911
MemCmpExpansionOptions enableMemCmpExpansion(bool OptSize,
923912
bool IsZeroCmp) const;

llvm/lib/CodeGen/ExpandMemCmp.cpp

Lines changed: 19 additions & 74 deletions
Original file line numberDiff line numberDiff line change
@@ -117,8 +117,8 @@ class MemCmpExpansion {
117117
Value *Lhs = nullptr;
118118
Value *Rhs = nullptr;
119119
};
120-
LoadPair getLoadPair(Type *LoadSizeType, Type *BSwapSizeType,
121-
Type *CmpSizeType, unsigned OffsetBytes);
120+
LoadPair getLoadPair(Type *LoadSizeType, bool NeedsBSwap, Type *CmpSizeType,
121+
unsigned OffsetBytes);
122122

123123
static LoadEntryVector
124124
computeGreedyLoadSequence(uint64_t Size, llvm::ArrayRef<unsigned> LoadSizes,
@@ -128,11 +128,6 @@ class MemCmpExpansion {
128128
unsigned MaxNumLoads,
129129
unsigned &NumLoadsNonOneByte);
130130

131-
static void optimiseLoadSequence(
132-
LoadEntryVector &LoadSequence,
133-
const TargetTransformInfo::MemCmpExpansionOptions &Options,
134-
bool IsUsedForZeroCmp);
135-
136131
public:
137132
MemCmpExpansion(CallInst *CI, uint64_t Size,
138133
const TargetTransformInfo::MemCmpExpansionOptions &Options,
@@ -215,37 +210,6 @@ MemCmpExpansion::computeOverlappingLoadSequence(uint64_t Size,
215210
return LoadSequence;
216211
}
217212

218-
void MemCmpExpansion::optimiseLoadSequence(
219-
LoadEntryVector &LoadSequence,
220-
const TargetTransformInfo::MemCmpExpansionOptions &Options,
221-
bool IsUsedForZeroCmp) {
222-
// This part of code attempts to optimize the LoadSequence by merging allowed
223-
// subsequences into single loads of allowed sizes from
224-
// `MemCmpExpansionOptions::AllowedTailExpansions`. If it is for zero
225-
// comparison or if no allowed tail expansions are specified, we exit early.
226-
if (IsUsedForZeroCmp || Options.AllowedTailExpansions.empty())
227-
return;
228-
229-
while (LoadSequence.size() >= 2) {
230-
auto Last = LoadSequence[LoadSequence.size() - 1];
231-
auto PreLast = LoadSequence[LoadSequence.size() - 2];
232-
233-
// Exit the loop if the two sequences are not contiguous
234-
if (PreLast.Offset + PreLast.LoadSize != Last.Offset)
235-
break;
236-
237-
auto LoadSize = Last.LoadSize + PreLast.LoadSize;
238-
if (find(Options.AllowedTailExpansions, LoadSize) ==
239-
Options.AllowedTailExpansions.end())
240-
break;
241-
242-
// Remove the last two sequences and replace with the combined sequence
243-
LoadSequence.pop_back();
244-
LoadSequence.pop_back();
245-
LoadSequence.emplace_back(PreLast.Offset, LoadSize);
246-
}
247-
}
248-
249213
// Initialize the basic block structure required for expansion of memcmp call
250214
// with given maximum load size and memcmp size parameter.
251215
// This structure includes:
@@ -291,7 +255,6 @@ MemCmpExpansion::MemCmpExpansion(
291255
}
292256
}
293257
assert(LoadSequence.size() <= Options.MaxNumLoads && "broken invariant");
294-
optimiseLoadSequence(LoadSequence, Options, IsUsedForZeroCmp);
295258
}
296259

297260
unsigned MemCmpExpansion::getNumBlocks() {
@@ -315,7 +278,7 @@ void MemCmpExpansion::createResultBlock() {
315278
}
316279

317280
MemCmpExpansion::LoadPair MemCmpExpansion::getLoadPair(Type *LoadSizeType,
318-
Type *BSwapSizeType,
281+
bool NeedsBSwap,
319282
Type *CmpSizeType,
320283
unsigned OffsetBytes) {
321284
// Get the memory source at offset `OffsetBytes`.
@@ -344,22 +307,16 @@ MemCmpExpansion::LoadPair MemCmpExpansion::getLoadPair(Type *LoadSizeType,
344307
if (!Rhs)
345308
Rhs = Builder.CreateAlignedLoad(LoadSizeType, RhsSource, RhsAlign);
346309

347-
// Zero extend if Byte Swap intrinsic has different type
348-
if (BSwapSizeType && LoadSizeType != BSwapSizeType) {
349-
Lhs = Builder.CreateZExt(Lhs, BSwapSizeType);
350-
Rhs = Builder.CreateZExt(Rhs, BSwapSizeType);
351-
}
352-
353310
// Swap bytes if required.
354-
if (BSwapSizeType) {
355-
Function *Bswap = Intrinsic::getDeclaration(
356-
CI->getModule(), Intrinsic::bswap, BSwapSizeType);
311+
if (NeedsBSwap) {
312+
Function *Bswap = Intrinsic::getDeclaration(CI->getModule(),
313+
Intrinsic::bswap, LoadSizeType);
357314
Lhs = Builder.CreateCall(Bswap, Lhs);
358315
Rhs = Builder.CreateCall(Bswap, Rhs);
359316
}
360317

361318
// Zero extend if required.
362-
if (CmpSizeType != nullptr && CmpSizeType != Lhs->getType()) {
319+
if (CmpSizeType != nullptr && CmpSizeType != LoadSizeType) {
363320
Lhs = Builder.CreateZExt(Lhs, CmpSizeType);
364321
Rhs = Builder.CreateZExt(Rhs, CmpSizeType);
365322
}
@@ -375,7 +332,7 @@ void MemCmpExpansion::emitLoadCompareByteBlock(unsigned BlockIndex,
375332
BasicBlock *BB = LoadCmpBlocks[BlockIndex];
376333
Builder.SetInsertPoint(BB);
377334
const LoadPair Loads =
378-
getLoadPair(Type::getInt8Ty(CI->getContext()), nullptr,
335+
getLoadPair(Type::getInt8Ty(CI->getContext()), /*NeedsBSwap=*/false,
379336
Type::getInt32Ty(CI->getContext()), OffsetBytes);
380337
Value *Diff = Builder.CreateSub(Loads.Lhs, Loads.Rhs);
381338

@@ -428,12 +385,11 @@ Value *MemCmpExpansion::getCompareLoadPairs(unsigned BlockIndex,
428385
IntegerType *const MaxLoadType =
429386
NumLoads == 1 ? nullptr
430387
: IntegerType::get(CI->getContext(), MaxLoadSize * 8);
431-
432388
for (unsigned i = 0; i < NumLoads; ++i, ++LoadIndex) {
433389
const LoadEntry &CurLoadEntry = LoadSequence[LoadIndex];
434390
const LoadPair Loads = getLoadPair(
435-
IntegerType::get(CI->getContext(), CurLoadEntry.LoadSize * 8), nullptr,
436-
MaxLoadType, CurLoadEntry.Offset);
391+
IntegerType::get(CI->getContext(), CurLoadEntry.LoadSize * 8),
392+
/*NeedsBSwap=*/false, MaxLoadType, CurLoadEntry.Offset);
437393

438394
if (NumLoads != 1) {
439395
// If we have multiple loads per block, we need to generate a composite
@@ -519,20 +475,14 @@ void MemCmpExpansion::emitLoadCompareBlock(unsigned BlockIndex) {
519475

520476
Type *LoadSizeType =
521477
IntegerType::get(CI->getContext(), CurLoadEntry.LoadSize * 8);
522-
Type *BSwapSizeType =
523-
DL.isLittleEndian()
524-
? IntegerType::get(CI->getContext(),
525-
PowerOf2Ceil(CurLoadEntry.LoadSize * 8))
526-
: nullptr;
527-
Type *MaxLoadType = IntegerType::get(
528-
CI->getContext(),
529-
std::max(MaxLoadSize, (unsigned)PowerOf2Ceil(CurLoadEntry.LoadSize)) * 8);
478+
Type *MaxLoadType = IntegerType::get(CI->getContext(), MaxLoadSize * 8);
530479
assert(CurLoadEntry.LoadSize <= MaxLoadSize && "Unexpected load type");
531480

532481
Builder.SetInsertPoint(LoadCmpBlocks[BlockIndex]);
533482

534-
const LoadPair Loads = getLoadPair(LoadSizeType, BSwapSizeType, MaxLoadType,
535-
CurLoadEntry.Offset);
483+
const LoadPair Loads =
484+
getLoadPair(LoadSizeType, /*NeedsBSwap=*/DL.isLittleEndian(), MaxLoadType,
485+
CurLoadEntry.Offset);
536486

537487
// Add the loaded values to the phi nodes for calculating memcmp result only
538488
// if result is not used in a zero equality.
@@ -637,24 +587,19 @@ Value *MemCmpExpansion::getMemCmpEqZeroOneBlock() {
637587
/// A memcmp expansion that only has one block of load and compare can bypass
638588
/// the compare, branch, and phi IR that is required in the general case.
639589
Value *MemCmpExpansion::getMemCmpOneBlock() {
640-
bool NeedsBSwap = DL.isLittleEndian() && Size != 1;
641590
Type *LoadSizeType = IntegerType::get(CI->getContext(), Size * 8);
642-
Type *BSwapSizeType =
643-
NeedsBSwap ? IntegerType::get(CI->getContext(), PowerOf2Ceil(Size * 8))
644-
: nullptr;
645-
Type *MaxLoadType =
646-
IntegerType::get(CI->getContext(),
647-
std::max(MaxLoadSize, (unsigned)PowerOf2Ceil(Size)) * 8);
591+
bool NeedsBSwap = DL.isLittleEndian() && Size != 1;
648592

649593
// The i8 and i16 cases don't need compares. We zext the loaded values and
650594
// subtract them to get the suitable negative, zero, or positive i32 result.
651595
if (Size < 4) {
652-
const LoadPair Loads = getLoadPair(LoadSizeType, BSwapSizeType,
653-
Builder.getInt32Ty(), /*Offset*/ 0);
596+
const LoadPair Loads =
597+
getLoadPair(LoadSizeType, NeedsBSwap, Builder.getInt32Ty(),
598+
/*Offset*/ 0);
654599
return Builder.CreateSub(Loads.Lhs, Loads.Rhs);
655600
}
656601

657-
const LoadPair Loads = getLoadPair(LoadSizeType, BSwapSizeType, MaxLoadType,
602+
const LoadPair Loads = getLoadPair(LoadSizeType, NeedsBSwap, LoadSizeType,
658603
/*Offset*/ 0);
659604
// The result of memcmp is negative, zero, or positive, so produce that by
660605
// subtracting 2 extended compare bits: sub (ugt, ult).

llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2961,7 +2961,6 @@ AArch64TTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const {
29612961
// they may wake up the FP unit, which raises the power consumption. Perhaps
29622962
// they could be used with no holds barred (-O3).
29632963
Options.LoadSizes = {8, 4, 2, 1};
2964-
Options.AllowedTailExpansions = {3, 5, 6};
29652964
return Options;
29662965
}
29672966

0 commit comments

Comments
 (0)