Skip to content

Commit d5c8716

Browse files
committed
[Polly] Use VirtualUse to determine references.
VirtualUse ensures consistency over different sources of values with Polly. In particular, this enables its use for instructions moved between Statements. Before the patch, the code wrongly assumed that the BB's instructions are also the ScopStmt's instructions. References are determined for OpenMP outlining and GPGPU kernel extraction. GPGPU CodeGen had some problems. For one, it generated GPU kernel parameters for constants. Second, it emitted GPU-side invariant loads which have already been loaded by the host. This has been partially fixed; it still generates a store for the invariant load result, but uses the value that the host has already written. WARNING: I did not test the generated PollyACC code on an actual GPU. The improved consistency will be made use of in the next patch.
1 parent 1cea25e commit d5c8716

File tree

5 files changed

+82
-44
lines changed

5 files changed

+82
-44
lines changed

polly/include/polly/CodeGen/IslNodeBuilder.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -58,7 +58,7 @@ struct SubtreeReferences {
5858
/// SubtreeReferences structure.
5959
/// @param CreateScalarRefs Should the result include allocas of scalar
6060
/// references?
61-
void addReferencesFromStmt(const ScopStmt *Stmt, void *UserPtr,
61+
void addReferencesFromStmt(ScopStmt *Stmt, void *UserPtr,
6262
bool CreateScalarRefs = true);
6363

6464
class IslNodeBuilder {

polly/lib/CodeGen/IslNodeBuilder.cpp

Lines changed: 58 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@
2424
#include "polly/Support/ISLTools.h"
2525
#include "polly/Support/SCEVValidator.h"
2626
#include "polly/Support/ScopHelper.h"
27+
#include "polly/Support/VirtualInstruction.h"
2728
#include "llvm/ADT/APInt.h"
2829
#include "llvm/ADT/PostOrderIterator.h"
2930
#include "llvm/ADT/SetVector.h"
@@ -205,40 +206,68 @@ int IslNodeBuilder::getNumberOfIterations(isl::ast_node_for For) {
205206
return NumberIterations + 1;
206207
}
207208

208-
/// Extract the values and SCEVs needed to generate code for a block.
209-
static int findReferencesInBlock(struct SubtreeReferences &References,
210-
const ScopStmt *Stmt, BasicBlock *BB) {
211-
for (Instruction &Inst : *BB) {
212-
// Include invariant loads
213-
if (isa<LoadInst>(Inst))
214-
if (Value *InvariantLoad = References.GlobalMap.lookup(&Inst))
215-
References.Values.insert(InvariantLoad);
216-
217-
for (Value *SrcVal : Inst.operands()) {
218-
auto *Scope = References.LI.getLoopFor(BB);
219-
if (canSynthesize(SrcVal, References.S, &References.SE, Scope)) {
220-
References.SCEVs.insert(References.SE.getSCEVAtScope(SrcVal, Scope));
221-
continue;
222-
} else if (Value *NewVal = References.GlobalMap.lookup(SrcVal))
223-
References.Values.insert(NewVal);
209+
static void findReferencesByUse(Value *SrcVal, ScopStmt *UserStmt,
210+
Loop *UserScope, const ValueMapT &GlobalMap,
211+
SetVector<Value *> &Values,
212+
SetVector<const SCEV *> &SCEVs) {
213+
VirtualUse VUse = VirtualUse::create(UserStmt, UserScope, SrcVal, true);
214+
switch (VUse.getKind()) {
215+
case VirtualUse::Constant:
216+
// When accelerator-offloading, GlobalValue is a host address whose content
217+
// must still be transferred to the GPU.
218+
if (isa<GlobalValue>(SrcVal))
219+
Values.insert(SrcVal);
220+
break;
221+
222+
case VirtualUse::Synthesizable:
223+
SCEVs.insert(VUse.getScevExpr());
224+
return;
225+
226+
case VirtualUse::Block:
227+
case VirtualUse::ReadOnly:
228+
case VirtualUse::Hoisted:
229+
case VirtualUse::Intra:
230+
case VirtualUse::Inter:
231+
break;
232+
}
233+
234+
if (Value *NewVal = GlobalMap.lookup(SrcVal))
235+
Values.insert(NewVal);
236+
}
237+
238+
static void findReferencesInInst(Instruction *Inst, ScopStmt *UserStmt,
239+
Loop *UserScope, const ValueMapT &GlobalMap,
240+
SetVector<Value *> &Values,
241+
SetVector<const SCEV *> &SCEVs) {
242+
for (Use &U : Inst->operands())
243+
findReferencesByUse(U.get(), UserStmt, UserScope, GlobalMap, Values, SCEVs);
244+
}
245+
246+
static void findReferencesInStmt(ScopStmt *Stmt, SetVector<Value *> &Values,
247+
ValueMapT &GlobalMap,
248+
SetVector<const SCEV *> &SCEVs) {
249+
LoopInfo *LI = Stmt->getParent()->getLI();
250+
251+
BasicBlock *BB = Stmt->getBasicBlock();
252+
Loop *Scope = LI->getLoopFor(BB);
253+
for (Instruction *Inst : Stmt->getInstructions())
254+
findReferencesInInst(Inst, Stmt, Scope, GlobalMap, Values, SCEVs);
255+
256+
if (Stmt->isRegionStmt()) {
257+
for (BasicBlock *BB : Stmt->getRegion()->blocks()) {
258+
Loop *Scope = LI->getLoopFor(BB);
259+
for (Instruction &Inst : *BB)
260+
findReferencesInInst(&Inst, Stmt, Scope, GlobalMap, Values, SCEVs);
224261
}
225262
}
226-
return 0;
227263
}
228264

229-
void polly::addReferencesFromStmt(const ScopStmt *Stmt, void *UserPtr,
265+
void polly::addReferencesFromStmt(ScopStmt *Stmt, void *UserPtr,
230266
bool CreateScalarRefs) {
231267
auto &References = *static_cast<struct SubtreeReferences *>(UserPtr);
232268

233-
if (Stmt->isBlockStmt())
234-
findReferencesInBlock(References, Stmt, Stmt->getBasicBlock());
235-
else if (Stmt->isRegionStmt()) {
236-
for (BasicBlock *BB : Stmt->getRegion()->blocks())
237-
findReferencesInBlock(References, Stmt, BB);
238-
} else {
239-
assert(Stmt->isCopyStmt());
240-
// Copy Stmts have no instructions that we need to consider.
241-
}
269+
findReferencesInStmt(Stmt, References.Values, References.GlobalMap,
270+
References.SCEVs);
242271

243272
for (auto &Access : *Stmt) {
244273
if (References.ParamSpace) {
@@ -276,8 +305,8 @@ void polly::addReferencesFromStmt(const ScopStmt *Stmt, void *UserPtr,
276305
static void addReferencesFromStmtSet(isl::set Set,
277306
struct SubtreeReferences *UserPtr) {
278307
isl::id Id = Set.get_tuple_id();
279-
auto *Stmt = static_cast<const ScopStmt *>(Id.get_user());
280-
return addReferencesFromStmt(Stmt, UserPtr);
308+
auto *Stmt = static_cast<ScopStmt *>(Id.get_user());
309+
addReferencesFromStmt(Stmt, UserPtr);
281310
}
282311

283312
/// Extract the out-of-scop values and SCEVs referenced from a union set

polly/test/GPGPU/invariant-load-hoisting-read-in-kernel.ll

Lines changed: 18 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,24 @@
1+
; RUN: opt %loadPolly -polly-codegen-ppcg -polly-invariant-load-hoisting \
2+
; RUN: -S < %s | \
3+
; RUN: FileCheck -check-prefix=HOST-IR %s
4+
15
; RUN: opt %loadPolly -disable-output -polly-acc-dump-kernel-ir \
26
; RUN: -polly-codegen-ppcg -polly-scops \
3-
; RUN: -polly-invariant-load-hoisting < %s | FileCheck %s
7+
; RUN: -polly-invariant-load-hoisting < %s | FileCheck -check-prefix=KERNEL-IR %s
48

59
; REQUIRES: pollyacc
610

711
; Verify that invariant loads used in a kernel statement are correctly forwarded
812
; as a subtree value to the GPU kernel.
913

10-
; CHECK: define ptx_kernel void @FUNC_foo_SCOP_0_KERNEL_0({{.*}} float %polly.access.p.load)
11-
; CHECK: store float %polly.access.p.load, float* %indvar2f.phiops
14+
; HOST-IR: store float %polly.access.p.load, float* %invariant.preload.s2a, align 4
15+
16+
; KERNEL-IR: define ptx_kernel void @FUNC_foo_SCOP_0_KERNEL_2({{.*}}i8 addrspace(1)* %MemRef_indvar2f__phi{{.*}})
17+
; KERNEL-IR: %indvar2f.phiops.reload = load float, float* %indvar2f.phiops, align 4
18+
; KERNEL-IR: store float %indvar2f.phiops.reload, float addrspace(1)* %polly.access.MemRef_A, align 4
19+
20+
; FIXME: store float %indvar2f.phiops.reload, float* %indvar2f.phiops, align 4
21+
; For some reason, the above instruction is emitted, which stores back to the address it was just loaded from.
1222

1323
define void @foo(float* %A, float* %p) {
1424
entry:
@@ -21,15 +31,15 @@ loop:
2131
%ptr = getelementptr float, float* %A, i64 %indvar
2232
store float 42.0, float* %ptr
2333
%cmp = icmp sle i64 %indvar, 1024
24-
br i1 %cmp, label %loop, label %loop2
34+
br i1 %cmp, label %loop, label %anotherloop
2535

26-
loop2:
27-
%indvar2 = phi i64 [0, %loop], [%indvar2.next, %loop2]
28-
%indvar2f = phi float [%invariant, %loop], [%indvar2f, %loop2]
36+
anotherloop:
37+
%indvar2 = phi i64 [0, %loop], [%indvar2.next, %anotherloop]
38+
%indvar2f = phi float [%invariant, %loop], [%indvar2f, %anotherloop]
2939
%indvar2.next = add i64 %indvar2, 1
3040
store float %indvar2f, float* %A
3141
%cmp2 = icmp sle i64 %indvar2, 1024
32-
br i1 %cmp2, label %loop2, label %end
42+
br i1 %cmp2, label %anotherloop, label %end
3343

3444
end:
3545
ret void

polly/test/GPGPU/invariant-load-of-scalar.ll

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -38,8 +38,7 @@
3838
; kernel function.
3939
; KERNEL-IR: define ptx_kernel void @FUNC_checkPrivatization_SCOP_0_KERNEL_0
4040
; KERNEL-IR-SAME: (i8 addrspace(1)* %MemRef_A, i32 %tmp,
41-
; KERNEL-IR-SAME: i32 %tmp2, i32 %polly.access.begin.load,
42-
; KERNEL-IR-SAME: i32 %polly.access.end.load)
41+
; KERNEL-IR-SAME: i32 %tmp2, i32 %polly.access.begin.load)
4342

4443

4544
; void checkScalarPointerOffload(int A[], int *begin, int *end) {

polly/test/GPGPU/phi-nodes-in-kernel.ll

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -52,9 +52,9 @@ target triple = "x86_64-unknown-linux-gnu"
5252
; IR: [[REGC:%.+]] = bitcast i32* %{{[0-9]+}} to i8*
5353
; IR-NEXT: call void @polly_copyFromDeviceToHost(i8* %p_dev_array_MemRef_c, i8* [[REGC]], i64 196)
5454

55-
; KERNEL-IR: define ptx_kernel void @FUNC_kernel_dynprog_SCOP_0_KERNEL_0(i8 addrspace(1)* %MemRef_c, i32 %0) #0 {
56-
; KERNEL-IR: %polly.access.MemRef_c = getelementptr i32, i32 addrspace(1)* %polly.access.cast.MemRef_c, i64 %10
57-
; KERNEL-IR-NEXT: store i32 %0, i32 addrspace(1)* %polly.access.MemRef_c, align 4
55+
; KERNEL-IR: define ptx_kernel void @FUNC_kernel_dynprog_SCOP_0_KERNEL_0(i8 addrspace(1)* %MemRef_c) #0 {
56+
; KERNEL-IR: %polly.access.MemRef_c = getelementptr i32, i32 addrspace(1)* %polly.access.cast.MemRef_c, i64 %9
57+
; KERNEL-IR-NEXT: store i32 422, i32 addrspace(1)* %polly.access.MemRef_c, align 4
5858

5959
define void @kernel_dynprog([50 x i32]* %c) {
6060
entry:
@@ -75,7 +75,7 @@ for.cond15.for.cond12.loopexit_crit_edge: ; preds = %for.body17
7575
for.body17: ; preds = %for.body17, %for.cond1.preheader
7676
%indvars.iv71 = phi i64 [ 1, %for.cond1.preheader ], [ %indvars.iv.next72, %for.body17 ]
7777
%arrayidx69 = getelementptr inbounds [50 x i32], [50 x i32]* %c, i64 0, i64 %indvars.iv71
78-
store i32 undef, i32* %arrayidx69, align 4
78+
store i32 422, i32* %arrayidx69, align 4
7979
%indvars.iv.next72 = add nuw nsw i64 %indvars.iv71, 1
8080
%lftr.wideiv74 = trunc i64 %indvars.iv.next72 to i32
8181
%exitcond75 = icmp ne i32 %lftr.wideiv74, 50

0 commit comments

Comments
 (0)