-
Notifications
You must be signed in to change notification settings - Fork 14.3k
[GlobalISel] Optimize ULEB128 usage #90565
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
@llvm/pr-subscribers-llvm-support @llvm/pr-subscribers-llvm-globalisel Author: Pierre van Houtryve (Pierre-vh) Changes
The previous LEB128 decode function was about 45 instructions on X86, assuming it got inlined for our use case and the error checks got optimized out. This fast one is 23 instructions. Full diff: https://github.com/llvm/llvm-project/pull/90565.diff 5 Files Affected:
diff --git a/llvm/include/llvm/CodeGen/GlobalISel/GIMatchTableExecutor.h b/llvm/include/llvm/CodeGen/GlobalISel/GIMatchTableExecutor.h
index 8eddc6a6a531b4..1086b1e89f1340 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/GIMatchTableExecutor.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/GIMatchTableExecutor.h
@@ -168,7 +168,7 @@ enum {
/// operand.
/// - InsnID(ULEB128) - Instruction ID
/// - MMOIdx(ULEB128) - MMO index
- /// - NumAddrSpace(ULEB128) - Number of valid address spaces
+ /// - NumAddrSpace(1) - Number of valid address spaces
/// - AddrSpaceN(ULEB128) - An allowed space of the memory access
/// - AddrSpaceN+1 ...
GIM_CheckMemoryAddressSpace,
@@ -177,7 +177,7 @@ enum {
/// memory operand.
/// - InsnID(ULEB128) - Instruction ID
/// - MMOIdx(ULEB128) - MMO index
- /// - MinAlign(ULEB128) - Minimum acceptable alignment
+ /// - MinAlign(1) - Minimum acceptable alignment
GIM_CheckMemoryAlignment,
/// Check the size of the memory access for the given machine memory operand
@@ -713,6 +713,27 @@ class GIMatchTableExecutor {
memcpy(&Ret, MatchTable, sizeof(Ret));
return Ret;
}
+
+public:
+ // Faster ULEB128 decoder tailored for the Match Table Executor.
+ //
+ // - Arguments are fixed to avoid mid-function checks.
+ // - Unchecked execution, assumes no error.
+ // - Fast common case handling (1 byte values).
+ LLVM_ATTRIBUTE_ALWAYS_INLINE static uint64_t
+ fastDecodeULEB128(const uint8_t *__restrict MatchTable,
+ uint64_t &CurrentIdx) {
+ uint64_t Value = MatchTable[CurrentIdx] & 0x7f;
+ if (LLVM_UNLIKELY(MatchTable[CurrentIdx++] >= 128)) {
+ unsigned Shift = 7;
+ do {
+ uint64_t Slice = MatchTable[CurrentIdx] & 0x7f;
+ Value += Slice << Shift;
+ Shift += 7;
+ } while (MatchTable[CurrentIdx++] >= 128);
+ }
+ return Value;
+ }
};
} // end namespace llvm
diff --git a/llvm/include/llvm/CodeGen/GlobalISel/GIMatchTableExecutorImpl.h b/llvm/include/llvm/CodeGen/GlobalISel/GIMatchTableExecutorImpl.h
index dec2d97bb1fa7e..4d147bf20c26a5 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/GIMatchTableExecutorImpl.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/GIMatchTableExecutorImpl.h
@@ -100,10 +100,7 @@ bool GIMatchTableExecutor::executeMatchTable(
};
const auto readULEB = [&]() {
- unsigned N = 0;
- uint64_t Val = decodeULEB128(MatchTable + CurrentIdx, &N);
- CurrentIdx += N;
- return Val;
+ return fastDecodeULEB128(MatchTable, CurrentIdx);
};
// Convenience function to return a signed value. This avoids
@@ -476,7 +473,7 @@ bool GIMatchTableExecutor::executeMatchTable(
}
case GIM_CheckAtomicOrdering: {
uint64_t InsnID = readULEB();
- auto Ordering = (AtomicOrdering)readULEB();
+ auto Ordering = (AtomicOrdering)MatchTable[CurrentIdx++];
DEBUG_WITH_TYPE(TgtExecutor::getName(),
dbgs() << CurrentIdx << ": GIM_CheckAtomicOrdering(MIs["
<< InsnID << "], " << (uint64_t)Ordering << ")\n");
@@ -493,7 +490,7 @@ bool GIMatchTableExecutor::executeMatchTable(
}
case GIM_CheckAtomicOrderingOrStrongerThan: {
uint64_t InsnID = readULEB();
- auto Ordering = (AtomicOrdering)readULEB();
+ auto Ordering = (AtomicOrdering)MatchTable[CurrentIdx++];
DEBUG_WITH_TYPE(TgtExecutor::getName(),
dbgs() << CurrentIdx
<< ": GIM_CheckAtomicOrderingOrStrongerThan(MIs["
@@ -511,7 +508,7 @@ bool GIMatchTableExecutor::executeMatchTable(
}
case GIM_CheckAtomicOrderingWeakerThan: {
uint64_t InsnID = readULEB();
- auto Ordering = (AtomicOrdering)readULEB();
+ auto Ordering = (AtomicOrdering)MatchTable[CurrentIdx++];
DEBUG_WITH_TYPE(TgtExecutor::getName(),
dbgs() << CurrentIdx
<< ": GIM_CheckAtomicOrderingWeakerThan(MIs["
@@ -531,7 +528,7 @@ bool GIMatchTableExecutor::executeMatchTable(
uint64_t InsnID = readULEB();
uint64_t MMOIdx = readULEB();
// This accepts a list of possible address spaces.
- const uint64_t NumAddrSpace = readULEB();
+ const uint64_t NumAddrSpace = MatchTable[CurrentIdx++];
if (State.MIs[InsnID]->getNumMemOperands() <= MMOIdx) {
if (handleReject() == RejectAndGiveUp)
@@ -568,7 +565,7 @@ bool GIMatchTableExecutor::executeMatchTable(
case GIM_CheckMemoryAlignment: {
uint64_t InsnID = readULEB();
uint64_t MMOIdx = readULEB();
- uint64_t MinAlign = readULEB();
+ uint64_t MinAlign = MatchTable[CurrentIdx++];
assert(State.MIs[InsnID] != nullptr && "Used insn before defined");
diff --git a/llvm/unittests/CodeGen/GlobalISel/CMakeLists.txt b/llvm/unittests/CodeGen/GlobalISel/CMakeLists.txt
index 6ed2409f2ad755..111d7f4d2f6200 100644
--- a/llvm/unittests/CodeGen/GlobalISel/CMakeLists.txt
+++ b/llvm/unittests/CodeGen/GlobalISel/CMakeLists.txt
@@ -15,6 +15,7 @@ set(LLVM_LINK_COMPONENTS
add_llvm_unittest(GlobalISelTests
ConstantFoldingTest.cpp
CSETest.cpp
+ GIMatchTableExecutorTest.cpp
LegalizerTest.cpp
LegalizerHelperTest.cpp
LegalizerInfoTest.cpp
diff --git a/llvm/unittests/CodeGen/GlobalISel/GIMatchTableExecutorTest.cpp b/llvm/unittests/CodeGen/GlobalISel/GIMatchTableExecutorTest.cpp
new file mode 100644
index 00000000000000..5a811d79592880
--- /dev/null
+++ b/llvm/unittests/CodeGen/GlobalISel/GIMatchTableExecutorTest.cpp
@@ -0,0 +1,49 @@
+//===- GIMatchTableExecutorTest.cpp ---------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/GlobalISel/GIMatchTableExecutor.h"
+#include "gtest/gtest.h"
+
+using namespace llvm;
+
+TEST(GlobalISelLEB128Test, fastDecodeULEB128) {
+#define EXPECT_DECODE_ULEB128_EQ(EXPECTED, VALUE) \
+ do { \
+ uint64_t ActualSize = 0; \
+ uint64_t Actual = GIMatchTableExecutor::fastDecodeULEB128( \
+ reinterpret_cast<const uint8_t *>(VALUE), ActualSize); \
+ EXPECT_EQ(sizeof(VALUE) - 1, ActualSize); \
+ EXPECT_EQ(EXPECTED, Actual); \
+ } while (0)
+
+ EXPECT_DECODE_ULEB128_EQ(0u, "\x00");
+ EXPECT_DECODE_ULEB128_EQ(1u, "\x01");
+ EXPECT_DECODE_ULEB128_EQ(63u, "\x3f");
+ EXPECT_DECODE_ULEB128_EQ(64u, "\x40");
+ EXPECT_DECODE_ULEB128_EQ(0x7fu, "\x7f");
+ EXPECT_DECODE_ULEB128_EQ(0x80u, "\x80\x01");
+ EXPECT_DECODE_ULEB128_EQ(0x81u, "\x81\x01");
+ EXPECT_DECODE_ULEB128_EQ(0x90u, "\x90\x01");
+ EXPECT_DECODE_ULEB128_EQ(0xffu, "\xff\x01");
+ EXPECT_DECODE_ULEB128_EQ(0x100u, "\x80\x02");
+ EXPECT_DECODE_ULEB128_EQ(0x101u, "\x81\x02");
+ EXPECT_DECODE_ULEB128_EQ(4294975616ULL, "\x80\xc1\x80\x80\x10");
+
+ // Decode ULEB128 with extra padding bytes
+ EXPECT_DECODE_ULEB128_EQ(0u, "\x80\x00");
+ EXPECT_DECODE_ULEB128_EQ(0u, "\x80\x80\x00");
+ EXPECT_DECODE_ULEB128_EQ(0x7fu, "\xff\x00");
+ EXPECT_DECODE_ULEB128_EQ(0x7fu, "\xff\x80\x00");
+ EXPECT_DECODE_ULEB128_EQ(0x80u, "\x80\x81\x00");
+ EXPECT_DECODE_ULEB128_EQ(0x80u, "\x80\x81\x80\x00");
+ EXPECT_DECODE_ULEB128_EQ(0x80u, "\x80\x81\x80\x80\x80\x80\x80\x80\x80\x00");
+ EXPECT_DECODE_ULEB128_EQ(0x80000000'00000000ul,
+ "\x80\x80\x80\x80\x80\x80\x80\x80\x80\x01");
+
+#undef EXPECT_DECODE_ULEB128_EQ
+}
diff --git a/llvm/utils/TableGen/Common/GlobalISel/GlobalISelMatchTable.cpp b/llvm/utils/TableGen/Common/GlobalISel/GlobalISelMatchTable.cpp
index 8af219f34e18b6..cfce540a95d994 100644
--- a/llvm/utils/TableGen/Common/GlobalISel/GlobalISelMatchTable.cpp
+++ b/llvm/utils/TableGen/Common/GlobalISel/GlobalISelMatchTable.cpp
@@ -1565,13 +1565,14 @@ bool MemoryAddressSpacePredicateMatcher::isIdentical(
void MemoryAddressSpacePredicateMatcher::emitPredicateOpcodes(
MatchTable &Table, RuleMatcher &Rule) const {
+ assert(AddrSpaces.size() < 255);
Table << MatchTable::Opcode("GIM_CheckMemoryAddressSpace")
<< MatchTable::Comment("MI") << MatchTable::ULEB128Value(InsnVarID)
<< MatchTable::Comment("MMO")
<< MatchTable::ULEB128Value(MMOIdx)
// Encode number of address spaces to expect.
<< MatchTable::Comment("NumAddrSpace")
- << MatchTable::ULEB128Value(AddrSpaces.size());
+ << MatchTable::IntValue(1, AddrSpaces.size());
for (unsigned AS : AddrSpaces)
Table << MatchTable::Comment("AddrSpace") << MatchTable::ULEB128Value(AS);
@@ -1593,7 +1594,7 @@ void MemoryAlignmentPredicateMatcher::emitPredicateOpcodes(
Table << MatchTable::Opcode("GIM_CheckMemoryAlignment")
<< MatchTable::Comment("MI") << MatchTable::ULEB128Value(InsnVarID)
<< MatchTable::Comment("MMO") << MatchTable::ULEB128Value(MMOIdx)
- << MatchTable::Comment("MinAlign") << MatchTable::ULEB128Value(MinAlign)
+ << MatchTable::Comment("MinAlign") << MatchTable::IntValue(1, MinAlign)
<< MatchTable::LineBreak;
}
|
@aemerson In one test, I observed a small decrease in MatchTable time with this: from about 7.6% to 6.6% of a build.
// - Unchecked execution, assumes no error. | ||
// - Fast common case handling (1 byte values). | ||
LLVM_ATTRIBUTE_ALWAYS_INLINE static uint64_t | ||
fastDecodeULEB128(const uint8_t *__restrict MatchTable, |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Do we need to guard the non-standard __restrict in a macro? I don't see any prior art for this
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Is it enough to indicate the underlying table is actually const?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
__restrict seems to allow for optimizing away a whole branch. I assume it doesn't reload MatchTable as often because it knows CurrentIdx
cannot alias it
It probably needs a compiler macro though, I'll look into it
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
What if you remove the CurrentIndex reference, and return std::pair of the loaded value and new CurrentIdx?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Upon closer inspection the branch it avoids is just meaningless, in the fast path it just falls through so it's fine to remove restrict
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
If it makes an observable ISA change I think it's worth trying to keep
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It's very small: https://godbolt.org/z/f9341f1Ea
Though I sometimes wonder why we don't use restrict in more places. Seems like a useful thing in some cases.
I'd say let's do without it for now and start a discussion on Discourse? Using restrict would set a precedent and I think it's better to get community approval before using that in non-runtime code
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This isn't really worthy of an RFC. We just need another macro in Support/Compiler.h
Sure I'll try to do it some time this week. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM with nit
@@ -1593,7 +1594,7 @@ void MemoryAlignmentPredicateMatcher::emitPredicateOpcodes( | |||
Table << MatchTable::Opcode("GIM_CheckMemoryAlignment") | |||
<< MatchTable::Comment("MI") << MatchTable::ULEB128Value(InsnVarID) | |||
<< MatchTable::Comment("MMO") << MatchTable::ULEB128Value(MMOIdx) | |||
<< MatchTable::Comment("MinAlign") << MatchTable::ULEB128Value(MinAlign) | |||
<< MatchTable::Comment("MinAlign") << MatchTable::IntValue(1, MinAlign) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Maybe assert MinAlign < 256
somewhere? This is not log alignment, so it would break if some vector load instruction actually required 256-byte or greater alignment.
llvm/include/llvm/Support/Compiler.h
Outdated
@@ -278,6 +278,14 @@ | |||
#define LLVM_ATTRIBUTE_RETURNS_NONNULL | |||
#endif | |||
|
|||
/// LLVM_ATTRIBUTE_RESTRICT- Annotates a pointer to tell the compiler that |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
/// LLVM_ATTRIBUTE_RESTRICT- Annotates a pointer to tell the compiler that | |
/// LLVM_ATTRIBUTE_RESTRICT - Annotates a pointer to tell the compiler that |
Again ClamAV is missing from these results, but the other benchmarks show a very nice improvement! Thanks for doing this.
|
- Remove some cases where ULEB128 isn't needed - Add a fastDecodeULEB128 tailored for GlobalISel which does unchecked decoding optimized for the common case, which is 1-byte values. The previous LEB128 decode function was about 45 instructions on X86, assuming it got inlined for our use case and the error checks got optimized out. This "fast" one is 23 instructions.
This specific LEB128 decode function generates almost 2x fewer instructions than the generic one.