Skip to content

Commit 29397d9

Browse files
committed
[LoadStoreVectorizer] Postprocess and merge equivalence classes
This patch introduces a new method: void Vectorizer::mergeEquivalenceClasses(EquivalenceClassMap &EQClasses) const The method is called at the end of Vectorizer::collectEquivalenceClasses() and is needed to merge equivalence classes that differ only by their underlying objects (UB1 and UB2), where UB1 is 1-level-indirection underlying base for UB2. This situation arises due to the limited lookup depth used during the search for underlying bases with llvm::getUnderlyingObject(ptr). Using any fixed lookup depth can result in the creation of multiple equivalence classes that only differ by 1-level indirection bases. The new approach merges equivalence classes if they have adjacent bases (1-level indirection). If a series of equivalence classes forms a ladder of 1-step/level indirections, they are all merged into a single equivalence class. This provides more opportunities for the load-store vectorizer to generate better vectors. Signed-off-by: Klochkov, Vyacheslav N <[email protected]>
1 parent 5cb7305 commit 29397d9

File tree

2 files changed

+191
-0
lines changed

2 files changed

+191
-0
lines changed

llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp

Lines changed: 128 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -324,6 +324,11 @@ class Vectorizer {
324324
Instruction *ChainElem, Instruction *ChainBegin,
325325
const DenseMap<Instruction *, APInt /*OffsetFromLeader*/> &ChainOffsets);
326326

327+
/// Merges the equivalence classes if they have underlying objects that differ
328+
/// by one level of indirection (i.e., one is a getelementptr and the other is
329+
/// the base pointer in that getelementptr).
330+
void mergeEquivalenceClasses(EquivalenceClassMap &EQClasses) const;
331+
327332
/// Collects loads and stores grouped by "equivalence class", where:
328333
/// - all elements in an eq class are a load or all are a store,
329334
/// - they all load/store the same element size (it's OK to have e.g. i8 and
@@ -1305,6 +1310,128 @@ std::optional<APInt> Vectorizer::getConstantOffsetSelects(
13051310
return std::nullopt;
13061311
}
13071312

1313+
// Merges equivalence classes whose keys differ only in the underlying object
// when one underlying object is reachable from the other through a chain of
// 1-level indirections (as reported by llvm::getUnderlyingObject with a lookup
// depth of 1). The instructions of each merged-away class are merged into the
// class keyed by the ultimate target, and the merged-away entry is erased from
// EQClasses.
void Vectorizer::mergeEquivalenceClasses(EquivalenceClassMap &EQClasses) const {
  if (EQClasses.size() < 2) // There is nothing to merge.
    return;

  // The reduced key has all elements of the EqClassKey except the underlying
  // object. Check that EqClassKey has 4 elements and define the reduced key.
  static_assert(std::tuple_size_v<EqClassKey> == 4,
                "EqClassKey has changed - EqClassReducedKey needs changes too");
  using EqClassReducedKey =
      std::tuple<std::tuple_element_t<1, EqClassKey> /* AddrSpace */,
                 std::tuple_element_t<2, EqClassKey> /* Element size */,
                 std::tuple_element_t<3, EqClassKey> /* IsLoad; */>;
  using ECReducedKeyToUnderlyingObjectMap =
      MapVector<EqClassReducedKey,
                SmallPtrSet<std::tuple_element_t<0, EqClassKey>, 4>>;

  // Form a map from the reduced key (without the underlying object) to the
  // underlying objects: 1 reduced key to many underlying objects, to form
  // groups of potentially merge-able equivalence classes.
  ECReducedKeyToUnderlyingObjectMap RedKeyToUOMap;
  bool FoundPotentiallyOptimizableEC = false;
  for (const auto &EC : EQClasses) {
    const auto &Key = EC.first;
    EqClassReducedKey RedKey{std::get<1>(Key), std::get<2>(Key),
                             std::get<3>(Key)};
    // Look the set up once and reuse the reference for both the insertion and
    // the size check.
    auto &UObjectsForKey = RedKeyToUOMap[RedKey];
    UObjectsForKey.insert(std::get<0>(Key));
    if (UObjectsForKey.size() > 1)
      FoundPotentiallyOptimizableEC = true;
  }
  // No reduced key is shared by two or more underlying objects, so no pair of
  // classes can be merged.
  if (!FoundPotentiallyOptimizableEC)
    return;

  LLVM_DEBUG({
    dbgs() << "LSV: mergeEquivalenceClasses: before merging:\n";
    for (const auto &EC : EQClasses) {
      dbgs() << "  Key: ([" << std::get<0>(EC.first)
             << "]: " << *std::get<0>(EC.first) << ", " << std::get<1>(EC.first)
             << ", " << std::get<2>(EC.first) << ", "
             << static_cast<int>(std::get<3>(EC.first)) << ")\n";
      for (const auto &Inst : EC.second)
        dbgs() << "\tInst:\t" << *Inst << "\n";
    }
  });
  LLVM_DEBUG({
    dbgs() << "LSV: mergeEquivalenceClasses: RedKeyToUOMap:\n";
    for (const auto &RedKeyToUO : RedKeyToUOMap) {
      dbgs() << "  Reduced key: (" << std::get<0>(RedKeyToUO.first) << ", "
             << std::get<1>(RedKeyToUO.first) << ", "
             << static_cast<int>(std::get<2>(RedKeyToUO.first)) << ") --> "
             << RedKeyToUO.second.size() << " underlying objects:\n";
      for (auto UObject : RedKeyToUO.second)
        dbgs() << "    [" << UObject << "]: " << *UObject << "\n";
    }
  });

  using UObjectToUObjectMap = DenseMap<const Value *, const Value *>;

  // Compute the ultimate targets for a set of underlying objects: every
  // object is mapped to the end of its chain of 1-level indirections (or to
  // itself when it has no such indirection).
  auto GetUltimateTargets =
      [](const SmallPtrSetImpl<const Value *> &UObjects)
      -> UObjectToUObjectMap {
    // Step 1: record the direct (1-level) base of every object that has one.
    UObjectToUObjectMap IndirectionMap;
    for (const auto *UObject : UObjects) {
      const unsigned MaxLookupDepth = 1; // look for 1-level indirections only
      const auto *UltimateTarget =
          llvm::getUnderlyingObject(UObject, MaxLookupDepth);
      if (UltimateTarget != UObject)
        IndirectionMap[UObject] = UltimateTarget;
    }
    // Step 2: follow the recorded links until an object with no recorded base
    // is reached.
    UObjectToUObjectMap UltimateTargetsMap;
    for (const auto *UObject : UObjects) {
      const Value *Target = UObject;
      for (auto It = IndirectionMap.find(Target); It != IndirectionMap.end();
           It = IndirectionMap.find(Target))
        Target = It->second;
      UltimateTargetsMap[UObject] = Target;
    }
    return UltimateTargetsMap;
  };

  // For each item in RedKeyToUOMap, if it has more than one underlying object,
  // try to merge the equivalence classes.
  for (const auto &RedKeyToUO : RedKeyToUOMap) {
    // Bind by reference: the set is only read here, so no copy is needed.
    const auto &UObjects = RedKeyToUO.second;
    if (UObjects.size() < 2)
      continue;
    const auto &RedKey = RedKeyToUO.first;
    const auto UTMap = GetUltimateTargets(UObjects);
    // NOTE(review): UTMap is a DenseMap keyed by pointer, so the order in which
    // classes are visited here is presumably not fixed across runs; confirm
    // that the position of a newly created KeyTo entry does not matter.
    for (const auto &UT : UTMap) {
      const Value *UObject = UT.first;
      const Value *UltimateTarget = UT.second;
      if (UObject == UltimateTarget)
        continue;

      EqClassKey KeyFrom{UObject, std::get<0>(RedKey), std::get<1>(RedKey),
                         std::get<2>(RedKey)};
      EqClassKey KeyTo{UltimateTarget, std::get<0>(RedKey), std::get<1>(RedKey),
                       std::get<2>(RedKey)};
      // The entry for KeyFrom was collected from EQClasses above and is erased
      // only below, so it exists; the entry for KeyTo may have to be created.
      // Request KeyTo first so that a possible insertion happens before the
      // reference to the KeyFrom entry is taken.
      auto &VecTo = EQClasses[KeyTo];
      const auto &VecFrom = EQClasses[KeyFrom];
      SmallVector<Instruction *, 8> MergedVec;
      MergedVec.reserve(VecFrom.size() + VecTo.size());
      std::merge(VecFrom.begin(), VecFrom.end(), VecTo.begin(), VecTo.end(),
                 std::back_inserter(MergedVec),
                 [](Instruction *A, Instruction *B) {
                   return A && B && A->comesBefore(B);
                 });
      VecTo = std::move(MergedVec);
      EQClasses.erase(KeyFrom);
    }
  }
  LLVM_DEBUG({
    dbgs() << "LSV: mergeEquivalenceClasses: after merging:\n";
    for (const auto &EC : EQClasses) {
      dbgs() << "  Key: ([" << std::get<0>(EC.first)
             << "]: " << *std::get<0>(EC.first) << ", " << std::get<1>(EC.first)
             << ", " << std::get<2>(EC.first) << ", "
             << static_cast<int>(std::get<3>(EC.first)) << ")\n";
      for (const auto &Inst : EC.second)
        dbgs() << "\tInst:\t" << *Inst << "\n";
    }
  });
}
1434+
13081435
EquivalenceClassMap
13091436
Vectorizer::collectEquivalenceClasses(BasicBlock::iterator Begin,
13101437
BasicBlock::iterator End) {
@@ -1377,6 +1504,7 @@ Vectorizer::collectEquivalenceClasses(BasicBlock::iterator Begin,
13771504
.emplace_back(&I);
13781505
}
13791506

1507+
mergeEquivalenceClasses(Ret);
13801508
return Ret;
13811509
}
13821510

Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,63 @@
1+
; RUN: opt %s -mtriple=x86_64-unknown-linux-gnu -passes=load-store-vectorizer -mcpu=skx -S -o %t.out.ll
2+
; RUN: FileCheck -input-file=%t.out.ll %s
3+
4+
; This test verifies that the vectorizer can handle an extended sequence of
5+
; getelementptr instructions and generate longer vectors. With special handling,
6+
; some elements can still be vectorized even if they require looking up the
7+
; common underlying object deeper than 6 levels from the original pointer.
8+
9+
; The test below is the simplified version of actual performance oriented
10+
; workload; the offsets in getelementptr instructions are similar or same for
11+
; the test simplicity.
12+
13+
define void @v1_v2_v4_v1_to_v8_levels_6_7_8_8(i32 %arg0, ptr align 16 %arg1) {
14+
; CHECK-LABEL: @v1_v2_v4_v1_to_v8_levels_6_7_8_8
15+
; CHECK: store <8 x half>
16+
17+
%level1 = getelementptr inbounds i8, ptr %arg1, i32 917504
18+
%level2 = getelementptr i8, ptr %level1, i32 %arg0
19+
%level3 = getelementptr i8, ptr %level2, i32 32768
20+
%level4 = getelementptr inbounds i8, ptr %level3, i32 %arg0
21+
%level5 = getelementptr i8, ptr %level4, i32 %arg0
22+
23+
%a6 = getelementptr i8, ptr %level5, i32 %arg0
24+
%b7 = getelementptr i8, ptr %a6, i32 2
25+
%c8 = getelementptr i8, ptr %b7, i32 8
26+
%d8 = getelementptr inbounds i8, ptr %b7, i32 12
27+
28+
store half 0xH0000, ptr %a6, align 16
29+
store <4 x half> zeroinitializer, ptr %b7, align 2
30+
store <2 x half> zeroinitializer, ptr %c8, align 2
31+
store half 0xH0000, ptr %d8, align 2
32+
ret void
33+
}
34+
35+
define void @v1x8_levels_6_7_8_9_10_11_12_13(i32 %arg0, ptr align 16 %arg1) {
36+
; CHECK-LABEL: @v1x8_levels_6_7_8_9_10_11_12_13
37+
; CHECK: store <8 x half>
38+
39+
%level1 = getelementptr inbounds i8, ptr %arg1, i32 917504
40+
%level2 = getelementptr i8, ptr %level1, i32 %arg0
41+
%level3 = getelementptr i8, ptr %level2, i32 32768
42+
%level4 = getelementptr inbounds i8, ptr %level3, i32 %arg0
43+
%level5 = getelementptr i8, ptr %level4, i32 %arg0
44+
45+
%a6 = getelementptr i8, ptr %level5, i32 %arg0
46+
%b7 = getelementptr i8, ptr %a6, i32 2
47+
%c8 = getelementptr i8, ptr %b7, i32 2
48+
%d9 = getelementptr inbounds i8, ptr %c8, i32 2
49+
%e10 = getelementptr inbounds i8, ptr %d9, i32 2
50+
%f11 = getelementptr inbounds i8, ptr %e10, i32 2
51+
%g12 = getelementptr inbounds i8, ptr %f11, i32 2
52+
%h13 = getelementptr inbounds i8, ptr %g12, i32 2
53+
54+
store half 0xH0000, ptr %a6, align 16
55+
store half 0xH0000, ptr %b7, align 2
56+
store half 0xH0000, ptr %c8, align 2
57+
store half 0xH0000, ptr %d9, align 2
58+
store half 0xH0000, ptr %e10, align 8
59+
store half 0xH0000, ptr %f11, align 2
60+
store half 0xH0000, ptr %g12, align 2
61+
store half 0xH0000, ptr %h13, align 2
62+
ret void
63+
}

0 commit comments

Comments
 (0)