Commit 16295d5
InstCombine: Broaden copy-constant-to-alloca optimization
Consider any constant memory type, not just global constants. AMDGPU kernel parameters are effectively global constants, but they appear in the IR either as reads from an intrinsic-derived pointer or as a function argument, rather than as global variables.
Parent: a881dc1
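For context, a rough sketch of what the source-pointer check changes from and to. This is not code from the commit; `isConstantGlobalSource` and `isConstantMemorySource` are illustrative names, though `AAResults::pointsToConstantMemory` and the other calls are real LLVM APIs:

#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/IR/GlobalVariable.h"
#include "llvm/IR/Value.h"

using namespace llvm;

// Before (simplified): only a constant GlobalVariable, possibly behind
// pointer casts/GEPs, was accepted as the copy source.
static bool isConstantGlobalSource(Value *V) {
  if (auto *GV = dyn_cast<GlobalVariable>(V->stripPointerCasts()))
    return GV->isConstant();
  return false;
}

// After: ask alias analysis whether the pointer can only refer to constant
// memory. With the AMDGPU AA registered, this also holds for addrspace(4)
// kernel-argument pointers, which are not global variables at all.
static bool isConstantMemorySource(AliasAnalysis &AA, Value *V) {
  return AA.pointsToConstantMemory(V);
}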

2 files changed (+107, -29 lines)

llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp

Lines changed: 15 additions & 29 deletions
@@ -32,22 +32,6 @@ using namespace PatternMatch;
 STATISTIC(NumDeadStore, "Number of dead stores eliminated");
 STATISTIC(NumGlobalCopies, "Number of allocas copied from constant global");
 
-/// pointsToConstantGlobal - Return true if V (possibly indirectly) points to
-/// some part of a constant global variable. This intentionally only accepts
-/// constant expressions because we can't rewrite arbitrary instructions.
-static bool pointsToConstantGlobal(Value *V) {
-  if (GlobalVariable *GV = dyn_cast<GlobalVariable>(V))
-    return GV->isConstant();
-
-  if (ConstantExpr *CE = dyn_cast<ConstantExpr>(V)) {
-    if (CE->getOpcode() == Instruction::BitCast ||
-        CE->getOpcode() == Instruction::AddrSpaceCast ||
-        CE->getOpcode() == Instruction::GetElementPtr)
-      return pointsToConstantGlobal(CE->getOperand(0));
-  }
-  return false;
-}
-
 /// isOnlyCopiedFromConstantGlobal - Recursively walk the uses of a (derived)
 /// pointer to an alloca. Ignore any reads of the pointer, return false if we
 /// see any stores or other unknown uses. If we see pointer arithmetic, keep
@@ -56,7 +40,8 @@ static bool pointsToConstantGlobal(Value *V) {
 /// the alloca, and if the source pointer is a pointer to a constant global, we
 /// can optimize this.
 static bool
-isOnlyCopiedFromConstantGlobal(Value *V, MemTransferInst *&TheCopy,
+isOnlyCopiedFromConstantMemory(AliasAnalysis *AA,
+                               Value *V, MemTransferInst *&TheCopy,
                                SmallVectorImpl<Instruction *> &ToDelete) {
   // We track lifetime intrinsics as we encounter them. If we decide to go
   // ahead and replace the value with the global, this lets the caller quickly
@@ -145,7 +130,7 @@ isOnlyCopiedFromConstantGlobal(Value *V, MemTransferInst *&TheCopy,
       if (U.getOperandNo() != 0) return false;
 
       // If the source of the memcpy/move is not a constant global, reject it.
-      if (!pointsToConstantGlobal(MI->getSource()))
+      if (!AA->pointsToConstantMemory(MI->getSource()))
        return false;
 
       // Otherwise, the transform is safe. Remember the copy instruction.
@@ -159,10 +144,11 @@ isOnlyCopiedFromConstantGlobal(Value *V, MemTransferInst *&TheCopy,
 /// modified by a copy from a constant global. If we can prove this, we can
 /// replace any uses of the alloca with uses of the global directly.
 static MemTransferInst *
-isOnlyCopiedFromConstantGlobal(AllocaInst *AI,
+isOnlyCopiedFromConstantMemory(AliasAnalysis *AA,
+                               AllocaInst *AI,
                                SmallVectorImpl<Instruction *> &ToDelete) {
   MemTransferInst *TheCopy = nullptr;
-  if (isOnlyCopiedFromConstantGlobal(AI, TheCopy, ToDelete))
+  if (isOnlyCopiedFromConstantMemory(AA, AI, TheCopy, ToDelete))
     return TheCopy;
   return nullptr;
 }
@@ -391,13 +377,13 @@ Instruction *InstCombiner::visitAllocaInst(AllocaInst &AI) {
 
   if (AI.getAlignment()) {
     // Check to see if this allocation is only modified by a memcpy/memmove from
-    // a constant global whose alignment is equal to or exceeds that of the
-    // allocation. If this is the case, we can change all users to use
-    // the constant global instead. This is commonly produced by the CFE by
-    // constructs like "void foo() { int A[] = {1,2,3,4,5,6,7,8,9...}; }" if 'A'
-    // is only subsequently read.
+    // a constant whose alignment is equal to or exceeds that of the allocation.
+    // If this is the case, we can change all users to use the constant global
+    // instead. This is commonly produced by the CFE by constructs like "void
+    // foo() { int A[] = {1,2,3,4,5,6,7,8,9...}; }" if 'A' is only subsequently
+    // read.
     SmallVector<Instruction *, 4> ToDelete;
-    if (MemTransferInst *Copy = isOnlyCopiedFromConstantGlobal(&AI, ToDelete)) {
+    if (MemTransferInst *Copy = isOnlyCopiedFromConstantMemory(AA, &AI, ToDelete)) {
       MaybeAlign AllocaAlign = AI.getAlign();
       Align SourceAlign = getOrEnforceKnownAlignment(
           Copy->getSource(), AllocaAlign, DL, &AI, &AC, &DT);
@@ -407,12 +393,12 @@ Instruction *InstCombiner::visitAllocaInst(AllocaInst &AI) {
       LLVM_DEBUG(dbgs() << "  memcpy = " << *Copy << '\n');
       for (unsigned i = 0, e = ToDelete.size(); i != e; ++i)
         eraseInstFromFunction(*ToDelete[i]);
-      Constant *TheSrc = cast<Constant>(Copy->getSource());
+      Value *TheSrc = Copy->getSource();
       auto *SrcTy = TheSrc->getType();
       auto *DestTy = PointerType::get(AI.getType()->getPointerElementType(),
                                       SrcTy->getPointerAddressSpace());
-      Constant *Cast =
-          ConstantExpr::getPointerBitCastOrAddrSpaceCast(TheSrc, DestTy);
+      Value *Cast =
+          Builder.CreatePointerBitCastOrAddrSpaceCast(TheSrc, DestTy);
       if (AI.getType()->getPointerAddressSpace() ==
           SrcTy->getPointerAddressSpace()) {
         Instruction *NewI = replaceInstUsesWith(AI, Cast);
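A side note on the last hunk: the source type widens from Constant to Value because a ConstantExpr cast requires a Constant operand, which held when the source was always a global, but an intrinsic-derived pointer or function argument is a plain Value. A minimal sketch of the distinction (`castCopySource` is an illustrative name, not part of the commit):

#include "llvm/IR/IRBuilder.h"

using namespace llvm;

// Cast the copy source to the destination pointer type in the source's
// address space. With the default ConstantFolder, the builder still folds
// this to a ConstantExpr when TheSrc happens to be a Constant, so the old
// behavior for globals is preserved; for non-constant sources it emits a
// cast instruction instead.
static Value *castCopySource(IRBuilder<> &Builder, Value *TheSrc,
                             Type *DestTy) {
  return Builder.CreatePointerBitCastOrAddrSpaceCast(TheSrc, DestTy);
}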
Lines changed: 92 additions & 0 deletions
@@ -0,0 +1,92 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -S -amdgpu-aa-wrapper -amdgpu-aa -instcombine -o - %s | FileCheck %s
+
+; Make sure the optimization from memcpy-from-global.ll happens, but
+; the constant source is not a global variable.
+
+target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5"
+
+; Simple memcpy to alloca from constant address space argument.
+define i8 @memcpy_constant_arg_ptr_to_alloca([32 x i8] addrspace(4)* noalias readonly align 4 dereferenceable(32) %arg, i32 %idx) {
+; CHECK-LABEL: @memcpy_constant_arg_ptr_to_alloca(
+; CHECK-NEXT:    [[TMP1:%.*]] = sext i32 [[IDX:%.*]] to i64
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr [32 x i8], [32 x i8] addrspace(4)* [[ARG:%.*]], i64 0, i64 [[TMP1]]
+; CHECK-NEXT:    [[LOAD:%.*]] = load i8, i8 addrspace(4)* [[GEP]], align 1
+; CHECK-NEXT:    ret i8 [[LOAD]]
+;
+  %alloca = alloca [32 x i8], align 4, addrspace(5)
+  %alloca.cast = bitcast [32 x i8] addrspace(5)* %alloca to i8 addrspace(5)*
+  %arg.cast = bitcast [32 x i8] addrspace(4)* %arg to i8 addrspace(4)*
+  call void @llvm.memcpy.p5i8.p4i8.i64(i8 addrspace(5)* %alloca.cast, i8 addrspace(4)* %arg.cast, i64 32, i1 false)
+  %gep = getelementptr inbounds [32 x i8], [32 x i8] addrspace(5)* %alloca, i32 0, i32 %idx
+  %load = load i8, i8 addrspace(5)* %gep
+  ret i8 %load
+}
+
+; Simple memcpy to alloca from constant address space intrinsic call
+define amdgpu_kernel void @memcpy_constant_intrinsic_ptr_to_alloca(i8 addrspace(1)* %out, i32 %idx) {
+; CHECK-LABEL: @memcpy_constant_intrinsic_ptr_to_alloca(
+; CHECK-NEXT:    [[KERNARG_SEGMENT_PTR:%.*]] = call align 16 dereferenceable(32) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
+; CHECK-NEXT:    [[TMP1:%.*]] = sext i32 [[IDX:%.*]] to i64
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr i8, i8 addrspace(4)* [[KERNARG_SEGMENT_PTR]], i64 [[TMP1]]
+; CHECK-NEXT:    [[LOAD:%.*]] = load i8, i8 addrspace(4)* [[GEP]], align 1
+; CHECK-NEXT:    store i8 [[LOAD]], i8 addrspace(1)* [[OUT:%.*]], align 1
+; CHECK-NEXT:    ret void
+;
+  %alloca = alloca [32 x i8], align 4, addrspace(5)
+  %alloca.cast = bitcast [32 x i8] addrspace(5)* %alloca to i8 addrspace(5)*
+  %kernarg.segment.ptr = call dereferenceable(32) align 16 i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
+  call void @llvm.memcpy.p5i8.p4i8.i64(i8 addrspace(5)* %alloca.cast, i8 addrspace(4)* %kernarg.segment.ptr, i64 32, i1 false)
+  %gep = getelementptr inbounds [32 x i8], [32 x i8] addrspace(5)* %alloca, i32 0, i32 %idx
+  %load = load i8, i8 addrspace(5)* %gep
+  store i8 %load, i8 addrspace(1)* %out
+  ret void
+}
+
+; Alloca is written through a flat pointer
+define i8 @memcpy_constant_arg_ptr_to_alloca_addrspacecast_to_flat([32 x i8] addrspace(4)* noalias readonly align 4 dereferenceable(32) %arg, i32 %idx) {
+; CHECK-LABEL: @memcpy_constant_arg_ptr_to_alloca_addrspacecast_to_flat(
+; CHECK-NEXT:    [[TMP1:%.*]] = sext i32 [[IDX:%.*]] to i64
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr [32 x i8], [32 x i8] addrspace(4)* [[ARG:%.*]], i64 0, i64 [[TMP1]]
+; CHECK-NEXT:    [[LOAD:%.*]] = load i8, i8 addrspace(4)* [[GEP]], align 1
+; CHECK-NEXT:    ret i8 [[LOAD]]
+;
+  %alloca = alloca [32 x i8], align 4, addrspace(5)
+  %alloca.cast = bitcast [32 x i8] addrspace(5)* %alloca to i8 addrspace(5)*
+  %alloca.cast.asc = addrspacecast i8 addrspace(5)* %alloca.cast to i8*
+  %arg.cast = bitcast [32 x i8] addrspace(4)* %arg to i8 addrspace(4)*
+  call void @llvm.memcpy.p0i8.p4i8.i64(i8* %alloca.cast.asc, i8 addrspace(4)* %arg.cast, i64 32, i1 false)
+  %gep = getelementptr inbounds [32 x i8], [32 x i8] addrspace(5)* %alloca, i32 0, i32 %idx
+  %load = load i8, i8 addrspace(5)* %gep
+  ret i8 %load
+}
+
+; Alloca is only addressed through flat pointer.
+define i8 @memcpy_constant_arg_ptr_to_alloca_addrspacecast_to_flat2([32 x i8] addrspace(4)* noalias readonly align 4 dereferenceable(32) %arg, i32 %idx) {
+; CHECK-LABEL: @memcpy_constant_arg_ptr_to_alloca_addrspacecast_to_flat2(
+; CHECK-NEXT:    [[ALLOCA:%.*]] = alloca [32 x i8], align 4, addrspace(5)
+; CHECK-NEXT:    [[ALLOCA_CAST1:%.*]] = getelementptr inbounds [32 x i8], [32 x i8] addrspace(5)* [[ALLOCA]], i32 0, i32 0
+; CHECK-NEXT:    [[ALLOCA_CAST:%.*]] = addrspacecast i8 addrspace(5)* [[ALLOCA_CAST1]] to i8*
+; CHECK-NEXT:    [[ARG_CAST:%.*]] = getelementptr inbounds [32 x i8], [32 x i8] addrspace(4)* [[ARG:%.*]], i64 0, i64 0
+; CHECK-NEXT:    call void @llvm.memcpy.p0i8.p4i8.i64(i8* nonnull align 1 dereferenceable(32) [[ALLOCA_CAST]], i8 addrspace(4)* align 4 dereferenceable(32) [[ARG_CAST]], i64 32, i1 false)
+; CHECK-NEXT:    [[GEP2:%.*]] = getelementptr inbounds [32 x i8], [32 x i8] addrspace(5)* [[ALLOCA]], i32 0, i32 [[IDX:%.*]]
+; CHECK-NEXT:    [[GEP:%.*]] = addrspacecast i8 addrspace(5)* [[GEP2]] to i8*
+; CHECK-NEXT:    [[LOAD:%.*]] = load i8, i8* [[GEP]], align 1
+; CHECK-NEXT:    ret i8 [[LOAD]]
+;
+  %alloca = alloca [32 x i8], align 4, addrspace(5)
+  %alloca.cast.asc = addrspacecast [32 x i8] addrspace(5)* %alloca to [32 x i8]*
+  %alloca.cast = bitcast [32 x i8]* %alloca.cast.asc to i8*
+  %arg.cast = bitcast [32 x i8] addrspace(4)* %arg to i8 addrspace(4)*
+  call void @llvm.memcpy.p0i8.p4i8.i64(i8* %alloca.cast, i8 addrspace(4)* %arg.cast, i64 32, i1 false)
+  %gep = getelementptr inbounds [32 x i8], [32 x i8]* %alloca.cast.asc, i32 0, i32 %idx
+  %load = load i8, i8* %gep
+  ret i8 %load
+}
+
+declare void @llvm.memcpy.p5i8.p4i8.i64(i8 addrspace(5)* nocapture, i8 addrspace(4)* nocapture, i64, i1) #0
+declare void @llvm.memcpy.p0i8.p4i8.i64(i8* nocapture, i8 addrspace(4)* nocapture, i64, i1) #0
+declare i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() #1
+
+attributes #0 = { argmemonly nounwind willreturn }
+attributes #1 = { nounwind readnone speculatable }
