Skip to content

Commit d47138e

Browse files
SC llvm team
authored and committed
Merged main:d3921e467005 into amd-gfx:4a300c28880f
Local branch amd-gfx 4a300c2 Merged main:241c290ad73f into amd-gfx:77eb6cdceaa5 Remote branch main d3921e4 [OpenMP] Basic BumpAllocator for (AMD)GPUs (llvm#69806)
2 parents 4a300c2 + d3921e4 commit d47138e

File tree

21 files changed

+473
-88
lines changed

21 files changed

+473
-88
lines changed

llvm/include/llvm/Config/llvm-config.h.cmake

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@
1616

1717
/* Indicate that this is LLVM compiled from the amd-gfx branch. */
1818
#define LLVM_HAVE_BRANCH_AMD_GFX
19-
#define LLVM_MAIN_REVISION 478259
19+
#define LLVM_MAIN_REVISION 478262
2020

2121
/* Define if LLVM_ENABLE_DUMP is enabled */
2222
#cmakedefine LLVM_ENABLE_DUMP

llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h

Lines changed: 12 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@
3131
namespace llvm {
3232

3333
class LoopInfo;
34+
class DominatorTree;
3435
class LoopVectorizationLegality;
3536
class LoopVectorizationCostModel;
3637
class PredicatedScalarEvolution;
@@ -287,6 +288,9 @@ class LoopVectorizationPlanner {
287288
/// Loop Info analysis.
288289
LoopInfo *LI;
289290

291+
/// The dominator tree.
292+
DominatorTree *DT;
293+
290294
/// Target Library Info.
291295
const TargetLibraryInfo *TLI;
292296

@@ -317,16 +321,14 @@ class LoopVectorizationPlanner {
317321
VPBuilder Builder;
318322

319323
public:
320-
LoopVectorizationPlanner(Loop *L, LoopInfo *LI, const TargetLibraryInfo *TLI,
321-
const TargetTransformInfo &TTI,
322-
LoopVectorizationLegality *Legal,
323-
LoopVectorizationCostModel &CM,
324-
InterleavedAccessInfo &IAI,
325-
PredicatedScalarEvolution &PSE,
326-
const LoopVectorizeHints &Hints,
327-
OptimizationRemarkEmitter *ORE)
328-
: OrigLoop(L), LI(LI), TLI(TLI), TTI(TTI), Legal(Legal), CM(CM), IAI(IAI),
329-
PSE(PSE), Hints(Hints), ORE(ORE) {}
324+
LoopVectorizationPlanner(
325+
Loop *L, LoopInfo *LI, DominatorTree *DT, const TargetLibraryInfo *TLI,
326+
const TargetTransformInfo &TTI, LoopVectorizationLegality *Legal,
327+
LoopVectorizationCostModel &CM, InterleavedAccessInfo &IAI,
328+
PredicatedScalarEvolution &PSE, const LoopVectorizeHints &Hints,
329+
OptimizationRemarkEmitter *ORE)
330+
: OrigLoop(L), LI(LI), DT(DT), TLI(TLI), TTI(TTI), Legal(Legal), CM(CM),
331+
IAI(IAI), PSE(PSE), Hints(Hints), ORE(ORE) {}
330332

331333
/// Plan how to best vectorize, return the best VF and its cost, or
332334
/// std::nullopt if vectorization and interleaving should be avoided up front.

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 45 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -3617,40 +3617,10 @@ void InnerLoopVectorizer::fixCrossIterationPHIs(VPTransformState &State) {
36173617
VPBasicBlock *Header =
36183618
State.Plan->getVectorLoopRegion()->getEntryBasicBlock();
36193619

3620-
// Gather all VPReductionPHIRecipe and sort them so that Intermediate stores
3621-
// sank outside of the loop would keep the same order as they had in the
3622-
// original loop.
3623-
SmallVector<VPReductionPHIRecipe *> ReductionPHIList;
36243620
for (VPRecipeBase &R : Header->phis()) {
36253621
if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R))
3626-
ReductionPHIList.emplace_back(ReductionPhi);
3622+
fixReduction(ReductionPhi, State);
36273623
}
3628-
stable_sort(ReductionPHIList, [this](const VPReductionPHIRecipe *R1,
3629-
const VPReductionPHIRecipe *R2) {
3630-
auto *IS1 = R1->getRecurrenceDescriptor().IntermediateStore;
3631-
auto *IS2 = R2->getRecurrenceDescriptor().IntermediateStore;
3632-
3633-
// If neither of the recipes has an intermediate store, keep the order the
3634-
// same.
3635-
if (!IS1 && !IS2)
3636-
return false;
3637-
3638-
// If only one of the recipes has an intermediate store, then move it
3639-
// towards the beginning of the list.
3640-
if (IS1 && !IS2)
3641-
return true;
3642-
3643-
if (!IS1 && IS2)
3644-
return false;
3645-
3646-
// If both recipes have an intermediate store, then the recipe with the
3647-
// later store should be processed earlier. So it should go to the beginning
3648-
// of the list.
3649-
return DT->dominates(IS2, IS1);
3650-
});
3651-
3652-
for (VPReductionPHIRecipe *ReductionPhi : ReductionPHIList)
3653-
fixReduction(ReductionPhi, State);
36543624

36553625
for (VPRecipeBase &R : Header->phis()) {
36563626
if (auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R))
@@ -9041,9 +9011,48 @@ VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
90419011
void LoopVectorizationPlanner::adjustRecipesForReductions(
90429012
VPBasicBlock *LatchVPBB, VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder,
90439013
ElementCount MinVF) {
9014+
VPBasicBlock *Header = Plan->getVectorLoopRegion()->getEntryBasicBlock();
9015+
// Gather all VPReductionPHIRecipe and sort them so that Intermediate stores
9016+
// sank outside of the loop would keep the same order as they had in the
9017+
// original loop.
9018+
SmallVector<VPReductionPHIRecipe *> ReductionPHIList;
9019+
for (VPRecipeBase &R : Header->phis()) {
9020+
if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R))
9021+
ReductionPHIList.emplace_back(ReductionPhi);
9022+
}
9023+
bool HasIntermediateStore = false;
9024+
stable_sort(ReductionPHIList,
9025+
[this, &HasIntermediateStore](const VPReductionPHIRecipe *R1,
9026+
const VPReductionPHIRecipe *R2) {
9027+
auto *IS1 = R1->getRecurrenceDescriptor().IntermediateStore;
9028+
auto *IS2 = R2->getRecurrenceDescriptor().IntermediateStore;
9029+
HasIntermediateStore |= IS1 || IS2;
9030+
9031+
// If neither of the recipes has an intermediate store, keep the
9032+
// order the same.
9033+
if (!IS1 && !IS2)
9034+
return false;
9035+
9036+
// If only one of the recipes has an intermediate store, then
9037+
// move it towards the beginning of the list.
9038+
if (IS1 && !IS2)
9039+
return true;
9040+
9041+
if (!IS1 && IS2)
9042+
return false;
9043+
9044+
// If both recipes have an intermediate store, then the recipe
9045+
// with the later store should be processed earlier. So it
9046+
// should go to the beginning of the list.
9047+
return DT->dominates(IS2, IS1);
9048+
});
9049+
9050+
if (HasIntermediateStore && ReductionPHIList.size() > 1)
9051+
for (VPRecipeBase *R : ReductionPHIList)
9052+
R->moveBefore(*Header, Header->getFirstNonPhi());
9053+
90449054
SmallVector<VPReductionPHIRecipe *> InLoopReductionPhis;
9045-
for (VPRecipeBase &R :
9046-
Plan->getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
9055+
for (VPRecipeBase &R : Header->phis()) {
90479056
auto *PhiR = dyn_cast<VPReductionPHIRecipe>(&R);
90489057
if (!PhiR || !PhiR->isInLoop() || (MinVF.isScalar() && !PhiR->isOrdered()))
90499058
continue;
@@ -9682,7 +9691,8 @@ static bool processLoopInVPlanNativePath(
96829691
// Use the planner for outer loop vectorization.
96839692
// TODO: CM is not used at this point inside the planner. Turn CM into an
96849693
// optional argument if we don't need it in the future.
9685-
LoopVectorizationPlanner LVP(L, LI, TLI, *TTI, LVL, CM, IAI, PSE, Hints, ORE);
9694+
LoopVectorizationPlanner LVP(L, LI, DT, TLI, *TTI, LVL, CM, IAI, PSE, Hints,
9695+
ORE);
96869696

96879697
// Get user vectorization factor.
96889698
ElementCount UserVF = Hints.getWidth();
@@ -10024,7 +10034,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
1002410034
LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE,
1002510035
F, &Hints, IAI);
1002610036
// Use the planner for vectorization.
10027-
LoopVectorizationPlanner LVP(L, LI, TLI, *TTI, &LVL, CM, IAI, PSE, Hints,
10037+
LoopVectorizationPlanner LVP(L, LI, DT, TLI, *TTI, &LVL, CM, IAI, PSE, Hints,
1002810038
ORE);
1002910039

1003010040
// Get user vectorization factor and interleave count.

openmp/docs/design/Runtimes.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1465,3 +1465,4 @@ debugging features are supported.
14651465

14661466
* Enable debugging assertions in the device. ``0x01``
14671467
* Enable diagnosing common problems during offloading. ``0x4``
1468+
* Enable device malloc statistics (amdgpu only). ``0x8``

openmp/libomptarget/DeviceRTL/CMakeLists.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -83,6 +83,7 @@ endif()
8383
list(REMOVE_DUPLICATES LIBOMPTARGET_DEVICE_ARCHITECTURES)
8484

8585
set(include_files
86+
${include_directory}/Allocator.h
8687
${include_directory}/Configuration.h
8788
${include_directory}/Debug.h
8889
${include_directory}/Interface.h
@@ -95,6 +96,7 @@ set(include_files
9596
)
9697

9798
set(src_files
99+
${source_directory}/Allocator.cpp
98100
${source_directory}/Configuration.cpp
99101
${source_directory}/Debug.cpp
100102
${source_directory}/Kernel.cpp
Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
1+
//===-------- Allocator.h - OpenMP memory allocator interface ---- C++ -*-===//
2+
//
3+
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4+
// See https://llvm.org/LICENSE.txt for license information.
5+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
//
7+
//===----------------------------------------------------------------------===//
8+
//
9+
//
10+
//===----------------------------------------------------------------------===//
11+
12+
#ifndef OMPTARGET_ALLOCATOR_H
13+
#define OMPTARGET_ALLOCATOR_H
14+
15+
#include "Types.h"
16+
17+
// Forward declaration.
18+
struct KernelEnvironmentTy;
19+
20+
#pragma omp begin declare target device_type(nohost)
21+
22+
namespace ompx {
23+
24+
namespace allocator {
25+
26+
/// Alignment, in bytes, advertised for every pointer returned by alloc()
/// (it is the argument of the assume_aligned attribute on alloc() below).
static uint64_t constexpr ALIGNMENT = 16;
27+
28+
/// Initialize the allocator according to \p KernelEnvironment
29+
void init(bool IsSPMD, KernelEnvironmentTy &KernelEnvironment);
30+
31+
/// Allocate \p Size bytes.
///
/// The attributes describe the result to the compiler: its size is given by
/// the first argument (alloc_size), it is aligned to ALIGNMENT bytes
/// (assume_aligned), and it behaves like a malloc result, i.e. it does not
/// alias any other live pointer (malloc).
32+
[[gnu::alloc_size(1), gnu::assume_aligned(ALIGNMENT), gnu::malloc]] void *
33+
alloc(uint64_t Size);
34+
35+
/// Free the allocation pointed to by \p Ptr.
36+
void free(void *Ptr);
37+
38+
} // namespace allocator
39+
40+
} // namespace ompx
41+
42+
#pragma omp end declare target
43+
44+
#endif

openmp/libomptarget/DeviceRTL/include/LibC.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
extern "C" {
1818

1919
int memcmp(const void *lhs, const void *rhs, size_t count);
20+
void memset(void *dst, int C, size_t count);
2021

2122
int printf(const char *format, ...);
2223
}
Lines changed: 80 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,80 @@
1+
//===------ Allocator.cpp - OpenMP memory allocator -------------- C++ -*-===//
2+
//
3+
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4+
// See https://llvm.org/LICENSE.txt for license information.
5+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
//
7+
//===----------------------------------------------------------------------===//
8+
//
9+
//===----------------------------------------------------------------------===//
10+
11+
#include "Allocator.h"
12+
#include "Configuration.h"
13+
#include "Environment.h"
14+
#include "Mapping.h"
15+
#include "Synchronization.h"
16+
#include "Types.h"
17+
#include "Utils.h"
18+
19+
using namespace ompx;
20+
21+
#pragma omp begin declare target device_type(nohost)
22+
23+
/// Descriptor of the device memory pool that BumpAllocatorTy::alloc carves
/// allocations from; its `Ptr` and `Size` fields are read there. The symbol is
/// weak, has protected visibility, and is marked used/retain so it is kept
/// even when unreferenced.
/// NOTE(review): presumably this is filled in from the host side before
/// kernels run -- confirm against the plugin code.
[[gnu::used, gnu::retain, gnu::weak,
24+
gnu::visibility(
25+
"protected")]] DeviceMemoryPoolTy __omp_rtl_device_memory_pool;
26+
/// Allocation statistics (NumAllocations, AllocationTotal, AllocationMin,
/// AllocationMax). BumpAllocatorTy::alloc updates them only when the
/// AllocationTracker debug kind is enabled.
[[gnu::used, gnu::retain, gnu::weak,
27+
gnu::visibility("protected")]] DeviceMemoryPoolTrackingTy
28+
__omp_rtl_device_memory_pool_tracker;
29+
30+
/// Stateless bump allocator that uses the __omp_rtl_device_memory_pool
31+
/// directly.
32+
struct BumpAllocatorTy final {
33+
34+
/// Reserve \p Size bytes, rounded up to a multiple of allocator::ALIGNMENT,
/// by atomically adding the rounded size to __omp_rtl_device_memory_pool.Ptr.
/// The value produced by that atomic add is returned as the allocation's
/// address. Traps if the bounds check at the end fails.
void *alloc(uint64_t Size) {
35+
// Round the request up so consecutive allocations advance the pool pointer
// in ALIGNMENT-sized steps.
Size = utils::roundUp(Size, uint64_t(allocator::ALIGNMENT));
36+
37+
// Optional bookkeeping: count the allocation and track the total, minimum
// and maximum (rounded) sizes. Only done when the AllocationTracker debug
// kind is enabled.
if (config::isDebugMode(DeviceDebugKind::AllocationTracker)) {
38+
atomic::add(&__omp_rtl_device_memory_pool_tracker.NumAllocations, 1,
39+
atomic::seq_cst);
40+
atomic::add(&__omp_rtl_device_memory_pool_tracker.AllocationTotal, Size,
41+
atomic::seq_cst);
42+
atomic::min(&__omp_rtl_device_memory_pool_tracker.AllocationMin, Size,
43+
atomic::seq_cst);
44+
atomic::max(&__omp_rtl_device_memory_pool_tracker.AllocationMax, Size,
45+
atomic::seq_cst);
46+
}
47+
48+
// View the pool's `Ptr` field as a 64-bit integer so it can be bumped with
// an atomic add.
uint64_t *Data =
49+
reinterpret_cast<uint64_t *>(&__omp_rtl_device_memory_pool.Ptr);
// NOTE(review): `End` is computed from `Data`, i.e. the address of the `Ptr`
// field itself, plus the pool size -- not from the pool address stored in
// `Ptr`. `OldData` below is a value read out of `Ptr`, so the comparison
// appears to mix two unrelated address ranges. Because `Ptr` is advanced in
// place, the pool's start is not available here either. Confirm the intended
// end-of-pool computation.
50+
uint64_t End =
51+
reinterpret_cast<uint64_t>(Data) + __omp_rtl_device_memory_pool.Size;
52+
53+
// Bump the pool pointer; `OldData` is presumably the pre-add value (the
// start of this allocation) -- depends on atomic::add returning the old
// value. The pointer has already been advanced when the check below runs.
uint64_t OldData = atomic::add(Data, Size, atomic::seq_cst);
54+
if (OldData + Size > End)
55+
__builtin_trap();
56+
57+
return reinterpret_cast<void *>(OldData);
58+
}
59+
60+
/// No-op: the body is empty, so memory handed out by alloc() is not
/// reclaimed here.
void free(void *) {}
61+
};
62+
63+
BumpAllocatorTy BumpAllocator;
64+
65+
/// allocator namespace implementation
66+
///
67+
///{
68+
69+
/// Allocator initialization hook. The body is currently empty: neither
/// \p IsSPMD nor \p KernelEnvironment is used until there is more than one
/// allocator to choose from (see the TODO below).
void allocator::init(bool IsSPMD, KernelEnvironmentTy &KernelEnvironment) {
70+
// TODO: Check KernelEnvironment for an allocator choice as soon as we have
71+
// more than one.
72+
}
73+
74+
/// Forward the request for \p Size bytes to the file-level BumpAllocator
/// instance and return its result.
void *allocator::alloc(uint64_t Size) { return BumpAllocator.alloc(Size); }
75+
76+
/// Forward \p Ptr to BumpAllocator.free(), whose body is currently empty.
void allocator::free(void *Ptr) { BumpAllocator.free(Ptr); }
77+
78+
///}
79+
80+
#pragma omp end declare target

openmp/libomptarget/DeviceRTL/src/Kernel.cpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
//
1111
//===----------------------------------------------------------------------===//
1212

13+
#include "Allocator.h"
1314
#include "Debug.h"
1415
#include "Environment.h"
1516
#include "Interface.h"
@@ -30,6 +31,7 @@ static void inititializeRuntime(bool IsSPMD,
3031
synchronize::init(IsSPMD);
3132
mapping::init(IsSPMD);
3233
state::init(IsSPMD, KernelEnvironment);
34+
allocator::init(IsSPMD, KernelEnvironment);
3335
}
3436

3537
/// Simple generic state machine for worker threads.

openmp/libomptarget/DeviceRTL/src/LibC.cpp

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,12 @@ int memcmp(const void *lhs, const void *rhs, size_t count) {
4747
return 0;
4848
}
4949

50+
/// Store \p C (implicitly converted to char) into each of the first \p count
/// bytes starting at \p dst, one byte per loop iteration.
/// NOTE(review): ISO C `memset` returns `void *` (the destination pointer);
/// this definition returns void, matching its declaration in LibC.h. Changing
/// the return type would require updating that declaration as well -- confirm
/// no caller expects the standard signature.
void memset(void *dst, int C, size_t count) {
51+
// Byte-wise view of the destination buffer.
auto *dstc = reinterpret_cast<char *>(dst);
52+
for (size_t I = 0; I < count; ++I)
53+
dstc[I] = C;
54+
}
55+
5056
/// printf() calls are rewritten by CGGPUBuiltin to __llvm_omp_vprintf
5157
int32_t __llvm_omp_vprintf(const char *Format, void *Arguments, uint32_t Size) {
5258
return impl::omp_vprintf(Format, Arguments, Size);

0 commit comments

Comments
 (0)