Skip to content
This repository was archived by the owner on Feb 5, 2019. It is now read-only.

Commit 7653e62

Browse files
committed
Merging r275869:
------------------------------------------------------------------------
r275869 | arsenm | 2016-07-18 11:34:53 -0700 (Mon, 18 Jul 2016) | 7 lines

AMDGPU: Remove dead check in AMDGPUPromoteAlloca

This is currently only called with GEP users. A direct alloca would only happen with current typed pointers for arrays which are a perverse case.

Also fix crashes on 0 x and 1 x arrays.
------------------------------------------------------------------------

git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/release_39@277077 91177308-0d34-0410-b5e6-96231b3b80d8
1 parent 7e5531e commit 7653e62

File tree

3 files changed

+130
-43
lines changed

3 files changed

+130
-43
lines changed

lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp

Lines changed: 10 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -348,9 +348,6 @@ static VectorType *arrayTypeToVecType(Type *ArrayTy) {
348348
static Value *
349349
calculateVectorIndex(Value *Ptr,
350350
const std::map<GetElementPtrInst *, Value *> &GEPIdx) {
351-
if (isa<AllocaInst>(Ptr))
352-
return Constant::getNullValue(Type::getInt32Ty(Ptr->getContext()));
353-
354351
GetElementPtrInst *GEP = cast<GetElementPtrInst>(Ptr);
355352

356353
auto I = GEPIdx.find(GEP);
@@ -360,11 +357,11 @@ calculateVectorIndex(Value *Ptr,
360357
static Value* GEPToVectorIndex(GetElementPtrInst *GEP) {
361358
// FIXME we only support simple cases
362359
if (GEP->getNumOperands() != 3)
363-
return NULL;
360+
return nullptr;
364361

365362
ConstantInt *I0 = dyn_cast<ConstantInt>(GEP->getOperand(1));
366363
if (!I0 || !I0->isZero())
367-
return NULL;
364+
return nullptr;
368365

369366
return GEP->getOperand(2);
370367
}
@@ -398,7 +395,8 @@ static bool tryPromoteAllocaToVector(AllocaInst *Alloca) {
398395
// are just being conservative for now.
399396
if (!AllocaTy ||
400397
AllocaTy->getElementType()->isVectorTy() ||
401-
AllocaTy->getNumElements() > 4) {
398+
AllocaTy->getNumElements() > 4 ||
399+
AllocaTy->getNumElements() < 2) {
402400
DEBUG(dbgs() << " Cannot convert type to vector\n");
403401
return false;
404402
}
@@ -443,19 +441,23 @@ static bool tryPromoteAllocaToVector(AllocaInst *Alloca) {
443441
IRBuilder<> Builder(Inst);
444442
switch (Inst->getOpcode()) {
445443
case Instruction::Load: {
444+
Type *VecPtrTy = VectorTy->getPointerTo(AMDGPUAS::PRIVATE_ADDRESS);
446445
Value *Ptr = Inst->getOperand(0);
447446
Value *Index = calculateVectorIndex(Ptr, GEPVectorIdx);
448-
Value *BitCast = Builder.CreateBitCast(Alloca, VectorTy->getPointerTo(0));
447+
448+
Value *BitCast = Builder.CreateBitCast(Alloca, VecPtrTy);
449449
Value *VecValue = Builder.CreateLoad(BitCast);
450450
Value *ExtractElement = Builder.CreateExtractElement(VecValue, Index);
451451
Inst->replaceAllUsesWith(ExtractElement);
452452
Inst->eraseFromParent();
453453
break;
454454
}
455455
case Instruction::Store: {
456+
Type *VecPtrTy = VectorTy->getPointerTo(AMDGPUAS::PRIVATE_ADDRESS);
457+
456458
Value *Ptr = Inst->getOperand(1);
457459
Value *Index = calculateVectorIndex(Ptr, GEPVectorIdx);
458-
Value *BitCast = Builder.CreateBitCast(Alloca, VectorTy->getPointerTo(0));
460+
Value *BitCast = Builder.CreateBitCast(Alloca, VecPtrTy);
459461
Value *VecValue = Builder.CreateLoad(BitCast);
460462
Value *NewVecValue = Builder.CreateInsertElement(VecValue,
461463
Inst->getOperand(0),
@@ -469,7 +471,6 @@ static bool tryPromoteAllocaToVector(AllocaInst *Alloca) {
469471
break;
470472

471473
default:
472-
Inst->dump();
473474
llvm_unreachable("Inconsistency in instructions promotable to vector");
474475
}
475476
}

test/CodeGen/AMDGPU/amdgpu.private-memory.ll

Lines changed: 29 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -417,12 +417,6 @@ entry:
417417
ret void
418418
}
419419

420-
; HSAOPT: !0 = !{}
421-
; HSAOPT: !1 = !{i32 0, i32 2048}
422-
423-
; NOHSAOPT: !0 = !{i32 0, i32 2048}
424-
425-
426420
; FUNC-LABEL: v16i32_stack:
427421

428422
; R600: MOVA_INT
@@ -527,4 +521,33 @@ define void @v2float_stack(<2 x float> addrspace(1)* %out, i32 %a) {
527521
ret void
528522
}
529523

524+
; OPT-LABEL: @direct_alloca_read_0xi32(
525+
; OPT: store [0 x i32] undef, [0 x i32] addrspace(3)*
526+
; OPT: load [0 x i32], [0 x i32] addrspace(3)*
527+
define void @direct_alloca_read_0xi32([0 x i32] addrspace(1)* %out, i32 %index) {
528+
entry:
529+
%tmp = alloca [0 x i32]
530+
store [0 x i32] [], [0 x i32]* %tmp
531+
%load = load [0 x i32], [0 x i32]* %tmp
532+
store [0 x i32] %load, [0 x i32] addrspace(1)* %out
533+
ret void
534+
}
535+
536+
; OPT-LABEL: @direct_alloca_read_1xi32(
537+
; OPT: store [1 x i32] zeroinitializer, [1 x i32] addrspace(3)*
538+
; OPT: load [1 x i32], [1 x i32] addrspace(3)*
539+
define void @direct_alloca_read_1xi32([1 x i32] addrspace(1)* %out, i32 %index) {
540+
entry:
541+
%tmp = alloca [1 x i32]
542+
store [1 x i32] [i32 0], [1 x i32]* %tmp
543+
%load = load [1 x i32], [1 x i32]* %tmp
544+
store [1 x i32] %load, [1 x i32] addrspace(1)* %out
545+
ret void
546+
}
547+
530548
attributes #0 = { nounwind "amdgpu-max-waves-per-eu"="2" }
549+
550+
; HSAOPT: !0 = !{}
551+
; HSAOPT: !1 = !{i32 0, i32 2048}
552+
553+
; NOHSAOPT: !0 = !{i32 0, i32 2048}

test/CodeGen/AMDGPU/vector-alloca.ll

Lines changed: 91 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,11 @@
33
; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=SI-ALLOCA -check-prefix=SI -check-prefix=FUNC %s
44
; RUN: llc -march=amdgcn -mcpu=tonga -mattr=+promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=SI-PROMOTE -check-prefix=SI -check-prefix=FUNC %s
55
; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck --check-prefix=EG -check-prefix=FUNC %s
6+
; RUN: opt -S -mtriple=amdgcn-- -amdgpu-promote-alloca -sroa -instcombine < %s | FileCheck -check-prefix=OPT %s
7+
8+
; OPT-LABEL: @vector_read(
9+
; OPT: %0 = extractelement <4 x i32> <i32 0, i32 1, i32 2, i32 3>, i32 %index
10+
; OPT: store i32 %0, i32 addrspace(1)* %out, align 4
611

712
; FUNC-LABEL: {{^}}vector_read:
813
; EG: MOV
@@ -12,21 +17,26 @@
1217
; EG: MOVA_INT
1318
define void @vector_read(i32 addrspace(1)* %out, i32 %index) {
1419
entry:
15-
%0 = alloca [4 x i32]
16-
%x = getelementptr [4 x i32], [4 x i32]* %0, i32 0, i32 0
17-
%y = getelementptr [4 x i32], [4 x i32]* %0, i32 0, i32 1
18-
%z = getelementptr [4 x i32], [4 x i32]* %0, i32 0, i32 2
19-
%w = getelementptr [4 x i32], [4 x i32]* %0, i32 0, i32 3
20+
%tmp = alloca [4 x i32]
21+
%x = getelementptr [4 x i32], [4 x i32]* %tmp, i32 0, i32 0
22+
%y = getelementptr [4 x i32], [4 x i32]* %tmp, i32 0, i32 1
23+
%z = getelementptr [4 x i32], [4 x i32]* %tmp, i32 0, i32 2
24+
%w = getelementptr [4 x i32], [4 x i32]* %tmp, i32 0, i32 3
2025
store i32 0, i32* %x
2126
store i32 1, i32* %y
2227
store i32 2, i32* %z
2328
store i32 3, i32* %w
24-
%1 = getelementptr [4 x i32], [4 x i32]* %0, i32 0, i32 %index
25-
%2 = load i32, i32* %1
26-
store i32 %2, i32 addrspace(1)* %out
29+
%tmp1 = getelementptr [4 x i32], [4 x i32]* %tmp, i32 0, i32 %index
30+
%tmp2 = load i32, i32* %tmp1
31+
store i32 %tmp2, i32 addrspace(1)* %out
2732
ret void
2833
}
2934

35+
; OPT-LABEL: @vector_write(
36+
; OPT: %0 = insertelement <4 x i32> zeroinitializer, i32 1, i32 %w_index
37+
; OPT: %1 = extractelement <4 x i32> %0, i32 %r_index
38+
; OPT: store i32 %1, i32 addrspace(1)* %out, align 4
39+
3040
; FUNC-LABEL: {{^}}vector_write:
3141
; EG: MOV
3242
; EG: MOV
@@ -36,42 +46,95 @@ entry:
3646
; EG: MOVA_INT
3747
define void @vector_write(i32 addrspace(1)* %out, i32 %w_index, i32 %r_index) {
3848
entry:
39-
%0 = alloca [4 x i32]
40-
%x = getelementptr [4 x i32], [4 x i32]* %0, i32 0, i32 0
41-
%y = getelementptr [4 x i32], [4 x i32]* %0, i32 0, i32 1
42-
%z = getelementptr [4 x i32], [4 x i32]* %0, i32 0, i32 2
43-
%w = getelementptr [4 x i32], [4 x i32]* %0, i32 0, i32 3
49+
%tmp = alloca [4 x i32]
50+
%x = getelementptr [4 x i32], [4 x i32]* %tmp, i32 0, i32 0
51+
%y = getelementptr [4 x i32], [4 x i32]* %tmp, i32 0, i32 1
52+
%z = getelementptr [4 x i32], [4 x i32]* %tmp, i32 0, i32 2
53+
%w = getelementptr [4 x i32], [4 x i32]* %tmp, i32 0, i32 3
4454
store i32 0, i32* %x
4555
store i32 0, i32* %y
4656
store i32 0, i32* %z
4757
store i32 0, i32* %w
48-
%1 = getelementptr [4 x i32], [4 x i32]* %0, i32 0, i32 %w_index
49-
store i32 1, i32* %1
50-
%2 = getelementptr [4 x i32], [4 x i32]* %0, i32 0, i32 %r_index
51-
%3 = load i32, i32* %2
52-
store i32 %3, i32 addrspace(1)* %out
58+
%tmp1 = getelementptr [4 x i32], [4 x i32]* %tmp, i32 0, i32 %w_index
59+
store i32 1, i32* %tmp1
60+
%tmp2 = getelementptr [4 x i32], [4 x i32]* %tmp, i32 0, i32 %r_index
61+
%tmp3 = load i32, i32* %tmp2
62+
store i32 %tmp3, i32 addrspace(1)* %out
5363
ret void
5464
}
5565

5666
; This test should be optimized to:
5767
; store i32 0, i32 addrspace(1)* %out
68+
69+
; OPT-LABEL: @bitcast_gep(
70+
; OPT-LABEL: store i32 0, i32 addrspace(1)* %out, align 4
71+
5872
; FUNC-LABEL: {{^}}bitcast_gep:
5973
; EG: STORE_RAW
6074
define void @bitcast_gep(i32 addrspace(1)* %out, i32 %w_index, i32 %r_index) {
6175
entry:
62-
%0 = alloca [4 x i32]
63-
%x = getelementptr [4 x i32], [4 x i32]* %0, i32 0, i32 0
64-
%y = getelementptr [4 x i32], [4 x i32]* %0, i32 0, i32 1
65-
%z = getelementptr [4 x i32], [4 x i32]* %0, i32 0, i32 2
66-
%w = getelementptr [4 x i32], [4 x i32]* %0, i32 0, i32 3
76+
%tmp = alloca [4 x i32]
77+
%x = getelementptr [4 x i32], [4 x i32]* %tmp, i32 0, i32 0
78+
%y = getelementptr [4 x i32], [4 x i32]* %tmp, i32 0, i32 1
79+
%z = getelementptr [4 x i32], [4 x i32]* %tmp, i32 0, i32 2
80+
%w = getelementptr [4 x i32], [4 x i32]* %tmp, i32 0, i32 3
6781
store i32 0, i32* %x
6882
store i32 0, i32* %y
6983
store i32 0, i32* %z
7084
store i32 0, i32* %w
71-
%1 = getelementptr [4 x i32], [4 x i32]* %0, i32 0, i32 1
72-
%2 = bitcast i32* %1 to [4 x i32]*
73-
%3 = getelementptr [4 x i32], [4 x i32]* %2, i32 0, i32 0
74-
%4 = load i32, i32* %3
75-
store i32 %4, i32 addrspace(1)* %out
85+
%tmp1 = getelementptr [4 x i32], [4 x i32]* %tmp, i32 0, i32 1
86+
%tmp2 = bitcast i32* %tmp1 to [4 x i32]*
87+
%tmp3 = getelementptr [4 x i32], [4 x i32]* %tmp2, i32 0, i32 0
88+
%tmp4 = load i32, i32* %tmp3
89+
store i32 %tmp4, i32 addrspace(1)* %out
90+
ret void
91+
}
92+
93+
; OPT-LABEL: @vector_read_bitcast_gep(
94+
; OPT: %0 = extractelement <4 x i32> <i32 1065353216, i32 1, i32 2, i32 3>, i32 %index
95+
; OPT: store i32 %0, i32 addrspace(1)* %out, align 4
96+
define void @vector_read_bitcast_gep(i32 addrspace(1)* %out, i32 %index) {
97+
entry:
98+
%tmp = alloca [4 x i32]
99+
%x = getelementptr inbounds [4 x i32], [4 x i32]* %tmp, i32 0, i32 0
100+
%y = getelementptr inbounds [4 x i32], [4 x i32]* %tmp, i32 0, i32 1
101+
%z = getelementptr inbounds [4 x i32], [4 x i32]* %tmp, i32 0, i32 2
102+
%w = getelementptr inbounds [4 x i32], [4 x i32]* %tmp, i32 0, i32 3
103+
%bc = bitcast i32* %x to float*
104+
store float 1.0, float* %bc
105+
store i32 1, i32* %y
106+
store i32 2, i32* %z
107+
store i32 3, i32* %w
108+
%tmp1 = getelementptr inbounds [4 x i32], [4 x i32]* %tmp, i32 0, i32 %index
109+
%tmp2 = load i32, i32* %tmp1
110+
store i32 %tmp2, i32 addrspace(1)* %out
111+
ret void
112+
}
113+
114+
; FIXME: Should be able to promote this. Instcombine should fold the
115+
; cast in the hasOneUse case so it might not matter in practice
116+
117+
; OPT-LABEL: @vector_read_bitcast_alloca(
118+
; OPT: alloca [4 x float]
119+
; OPT: store float
120+
; OPT: store float
121+
; OPT: store float
122+
; OPT: store float
123+
; OPT: load float
124+
define void @vector_read_bitcast_alloca(float addrspace(1)* %out, i32 %index) {
125+
entry:
126+
%tmp = alloca [4 x i32]
127+
%tmp.bc = bitcast [4 x i32]* %tmp to [4 x float]*
128+
%x = getelementptr inbounds [4 x float], [4 x float]* %tmp.bc, i32 0, i32 0
129+
%y = getelementptr inbounds [4 x float], [4 x float]* %tmp.bc, i32 0, i32 1
130+
%z = getelementptr inbounds [4 x float], [4 x float]* %tmp.bc, i32 0, i32 2
131+
%w = getelementptr inbounds [4 x float], [4 x float]* %tmp.bc, i32 0, i32 3
132+
store float 0.0, float* %x
133+
store float 1.0, float* %y
134+
store float 2.0, float* %z
135+
store float 4.0, float* %w
136+
%tmp1 = getelementptr inbounds [4 x float], [4 x float]* %tmp.bc, i32 0, i32 %index
137+
%tmp2 = load float, float* %tmp1
138+
store float %tmp2, float addrspace(1)* %out
76139
ret void
77140
}

0 commit comments

Comments
 (0)