Commit b6491cc

AMDGPU: Implement hook for InferAddressSpaces
For now, just port some of the existing NVPTX tests along with tests from an old HSAIL optimization pass that did approximately the same thing. Don't enable the pass yet until more testing is done.

llvm-svn: 293580
1 parent 850657a
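
The pass is implemented here but left disabled. As a hedged sketch only (not part of this commit; the free-function form and its name are illustrative assumptions), enabling it later amounts to adding the existing createInferAddressSpacesPass() factory from llvm/Transforms/Scalar.h to the target's IR pass pipeline:

    // Hypothetical sketch, not from this commit: how the pass could be added to
    // an IR pass pipeline once it has had more testing.
    #include "llvm/IR/LegacyPassManager.h"
    #include "llvm/Transforms/Scalar.h"

    static void addAMDGPUEarlyIRPasses(llvm::legacy::PassManagerBase &PM) {
      // Rewrite flat (generic) pointers into specific address spaces early, so
      // that later IR passes and instruction selection see the narrowed
      // addressing.
      PM.add(llvm::createInferAddressSpacesPass());
    }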

File tree

6 files changed: +468, -5 lines

llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h

Lines changed: 4 additions & 4 deletions
@@ -312,6 +312,10 @@ class AMDGPUSubtarget : public AMDGPUGenSubtargetInfo {
     return EnableXNACK;
   }

+  bool hasFlatAddressSpace() const {
+    return FlatAddressSpace;
+  }
+
   bool isMesaKernel(const MachineFunction &MF) const {
     return isMesa3DOS() && !AMDGPU::isShader(MF.getFunction()->getCallingConv());
   }
@@ -554,10 +558,6 @@ class SISubtarget final : public AMDGPUSubtarget {
     return 16;
   }

-  bool hasFlatAddressSpace() const {
-    return FlatAddressSpace;
-  }
-
   bool hasSMemRealTime() const {
     return HasSMemRealTime;
   }

llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h

Lines changed: 11 additions & 1 deletion
@@ -32,6 +32,7 @@ class AMDGPUTTIImpl final : public BasicTTIImplBase<AMDGPUTTIImpl> {

   const AMDGPUSubtarget *ST;
   const AMDGPUTargetLowering *TLI;
+  bool IsGraphicsShader;

   const AMDGPUSubtarget *getST() const { return ST; }
   const AMDGPUTargetLowering *getTLI() const { return TLI; }
@@ -62,7 +63,8 @@ class AMDGPUTTIImpl final : public BasicTTIImplBase<AMDGPUTTIImpl> {
   explicit AMDGPUTTIImpl(const AMDGPUTargetMachine *TM, const Function &F)
     : BaseT(TM, F.getParent()->getDataLayout()),
       ST(TM->getSubtargetImpl(F)),
-      TLI(ST->getTargetLowering()) {}
+      TLI(ST->getTargetLowering()),
+      IsGraphicsShader(AMDGPU::isShader(F.getCallingConv())) {}

   bool hasBranchDivergence() { return true; }

@@ -91,6 +93,14 @@ class AMDGPUTTIImpl final : public BasicTTIImplBase<AMDGPUTTIImpl> {
   int getVectorInstrCost(unsigned Opcode, Type *ValTy, unsigned Index);
   bool isSourceOfDivergence(const Value *V) const;

+  unsigned getFlatAddressSpace() const {
+    // Don't bother running InferAddressSpaces pass on graphics shaders which
+    // don't use flat addressing.
+    if (IsGraphicsShader)
+      return -1;
+    return ST->hasFlatAddressSpace() ? AMDGPUAS::FLAT_ADDRESS : -1;
+  }
+
   unsigned getVectorSplitCost() { return 0; }
 };
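
The TargetTransformInfo wrapper side is not part of this diff. As a rough illustration (the helper function below is hypothetical), a caller queries the hook through the TTI analysis, and the -1 returned above for graphics shaders or for subtargets without flat support reads as: no flat address space, so nothing to infer.

    // Illustration only; the helper name is made up. Querying the new hook
    // through the TargetTransformInfo wrapper.
    #include "llvm/Analysis/TargetTransformInfo.h"

    static bool hasUsableFlatAddressSpace(const llvm::TargetTransformInfo &TTI) {
      // AMDGPUTTIImpl::getFlatAddressSpace() above returns AMDGPUAS::FLAT_ADDRESS
      // when flat addressing is usable, and -1 (an invalid address space) for
      // graphics shaders or subtargets without flat support.
      return TTI.getFlatAddressSpace() != unsigned(-1);
    }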

Lines changed: 131 additions & 0 deletions (new test file)
@@ -0,0 +1,131 @@
; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -infer-address-spaces %s | FileCheck %s

; Trivial optimization of generic addressing

; CHECK-LABEL: @load_global_from_flat(
; CHECK-NEXT: %tmp0 = addrspacecast float addrspace(4)* %generic_scalar to float addrspace(1)*
; CHECK-NEXT: %tmp1 = load float, float addrspace(1)* %tmp0
; CHECK-NEXT: ret float %tmp1
define float @load_global_from_flat(float addrspace(4)* %generic_scalar) #0 {
  %tmp0 = addrspacecast float addrspace(4)* %generic_scalar to float addrspace(1)*
  %tmp1 = load float, float addrspace(1)* %tmp0
  ret float %tmp1
}

; CHECK-LABEL: @load_constant_from_flat(
; CHECK-NEXT: %tmp0 = addrspacecast float addrspace(4)* %generic_scalar to float addrspace(2)*
; CHECK-NEXT: %tmp1 = load float, float addrspace(2)* %tmp0
; CHECK-NEXT: ret float %tmp1
define float @load_constant_from_flat(float addrspace(4)* %generic_scalar) #0 {
  %tmp0 = addrspacecast float addrspace(4)* %generic_scalar to float addrspace(2)*
  %tmp1 = load float, float addrspace(2)* %tmp0
  ret float %tmp1
}

; CHECK-LABEL: @load_group_from_flat(
; CHECK-NEXT: %tmp0 = addrspacecast float addrspace(4)* %generic_scalar to float addrspace(3)*
; CHECK-NEXT: %tmp1 = load float, float addrspace(3)* %tmp0
; CHECK-NEXT: ret float %tmp1
define float @load_group_from_flat(float addrspace(4)* %generic_scalar) #0 {
  %tmp0 = addrspacecast float addrspace(4)* %generic_scalar to float addrspace(3)*
  %tmp1 = load float, float addrspace(3)* %tmp0
  ret float %tmp1
}

; CHECK-LABEL: @load_private_from_flat(
; CHECK-NEXT: %tmp0 = addrspacecast float addrspace(4)* %generic_scalar to float*
; CHECK-NEXT: %tmp1 = load float, float* %tmp0
; CHECK-NEXT: ret float %tmp1
define float @load_private_from_flat(float addrspace(4)* %generic_scalar) #0 {
  %tmp0 = addrspacecast float addrspace(4)* %generic_scalar to float*
  %tmp1 = load float, float* %tmp0
  ret float %tmp1
}

; CHECK-LABEL: @store_global_from_flat(
; CHECK-NEXT: %tmp0 = addrspacecast float addrspace(4)* %generic_scalar to float addrspace(1)*
; CHECK-NEXT: store float 0.000000e+00, float addrspace(1)* %tmp0
define void @store_global_from_flat(float addrspace(4)* %generic_scalar) #0 {
  %tmp0 = addrspacecast float addrspace(4)* %generic_scalar to float addrspace(1)*
  store float 0.0, float addrspace(1)* %tmp0
  ret void
}

; CHECK-LABEL: @store_group_from_flat(
; CHECK-NEXT: %tmp0 = addrspacecast float addrspace(4)* %generic_scalar to float addrspace(3)*
; CHECK-NEXT: store float 0.000000e+00, float addrspace(3)* %tmp0
define void @store_group_from_flat(float addrspace(4)* %generic_scalar) #0 {
  %tmp0 = addrspacecast float addrspace(4)* %generic_scalar to float addrspace(3)*
  store float 0.0, float addrspace(3)* %tmp0
  ret void
}

; CHECK-LABEL: @store_private_from_flat(
; CHECK-NEXT: %tmp0 = addrspacecast float addrspace(4)* %generic_scalar to float*
; CHECK-NEXT: store float 0.000000e+00, float* %tmp0
define void @store_private_from_flat(float addrspace(4)* %generic_scalar) #0 {
  %tmp0 = addrspacecast float addrspace(4)* %generic_scalar to float*
  store float 0.0, float* %tmp0
  ret void
}

; optimized to global load/store.
; CHECK-LABEL: @load_store_global(
; CHECK-NEXT: %val = load i32, i32 addrspace(1)* %input, align 4
; CHECK-NEXT: store i32 %val, i32 addrspace(1)* %output, align 4
; CHECK-NEXT: ret void
define void @load_store_global(i32 addrspace(1)* nocapture %input, i32 addrspace(1)* nocapture %output) #0 {
  %tmp0 = addrspacecast i32 addrspace(1)* %input to i32 addrspace(4)*
  %tmp1 = addrspacecast i32 addrspace(1)* %output to i32 addrspace(4)*
  %val = load i32, i32 addrspace(4)* %tmp0, align 4
  store i32 %val, i32 addrspace(4)* %tmp1, align 4
  ret void
}

; Optimized to group load/store.
; CHECK-LABEL: @load_store_group(
; CHECK-NEXT: %val = load i32, i32 addrspace(3)* %input, align 4
; CHECK-NEXT: store i32 %val, i32 addrspace(3)* %output, align 4
; CHECK-NEXT: ret void
define void @load_store_group(i32 addrspace(3)* nocapture %input, i32 addrspace(3)* nocapture %output) #0 {
  %tmp0 = addrspacecast i32 addrspace(3)* %input to i32 addrspace(4)*
  %tmp1 = addrspacecast i32 addrspace(3)* %output to i32 addrspace(4)*
  %val = load i32, i32 addrspace(4)* %tmp0, align 4
  store i32 %val, i32 addrspace(4)* %tmp1, align 4
  ret void
}

; Optimized to private load/store.
; CHECK-LABEL: @load_store_private(
; CHECK-NEXT: %val = load i32, i32* %input, align 4
; CHECK-NEXT: store i32 %val, i32* %output, align 4
; CHECK-NEXT: ret void
define void @load_store_private(i32* nocapture %input, i32* nocapture %output) #0 {
  %tmp0 = addrspacecast i32* %input to i32 addrspace(4)*
  %tmp1 = addrspacecast i32* %output to i32 addrspace(4)*
  %val = load i32, i32 addrspace(4)* %tmp0, align 4
  store i32 %val, i32 addrspace(4)* %tmp1, align 4
  ret void
}

; No optimization. flat load/store.
; CHECK-LABEL: @load_store_flat(
; CHECK-NEXT: %val = load i32, i32 addrspace(4)* %input, align 4
; CHECK-NEXT: store i32 %val, i32 addrspace(4)* %output, align 4
; CHECK-NEXT: ret void
define void @load_store_flat(i32 addrspace(4)* nocapture %input, i32 addrspace(4)* nocapture %output) #0 {
  %val = load i32, i32 addrspace(4)* %input, align 4
  store i32 %val, i32 addrspace(4)* %output, align 4
  ret void
}

; CHECK-LABEL: @store_addrspacecast_ptr_value(
; CHECK: %cast = addrspacecast i32 addrspace(1)* %input to i32 addrspace(4)*
; CHECK-NEXT: store i32 addrspace(4)* %cast, i32 addrspace(4)* addrspace(1)* %output, align 4
define void @store_addrspacecast_ptr_value(i32 addrspace(1)* nocapture %input, i32 addrspace(4)* addrspace(1)* nocapture %output) #0 {
  %cast = addrspacecast i32 addrspace(1)* %input to i32 addrspace(4)*
  store i32 addrspace(4)* %cast, i32 addrspace(4)* addrspace(1)* %output, align 4
  ret void
}

attributes #0 = { nounwind }
Lines changed: 176 additions & 0 deletions (new test file)
@@ -0,0 +1,176 @@
; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -infer-address-spaces %s | FileCheck %s
; Ports of most of test/CodeGen/NVPTX/access-non-generic.ll

@scalar = internal addrspace(3) global float 0.0, align 4
@array = internal addrspace(3) global [10 x float] zeroinitializer, align 4

; CHECK-LABEL: @load_store_lds_f32(
; CHECK: %tmp = load float, float addrspace(3)* @scalar, align 4
; CHECK: call void @use(float %tmp)
; CHECK: store float %v, float addrspace(3)* @scalar, align 4
; CHECK: call void @llvm.amdgcn.s.barrier()
; CHECK: %tmp2 = load float, float addrspace(3)* @scalar, align 4
; CHECK: call void @use(float %tmp2)
; CHECK: store float %v, float addrspace(3)* @scalar, align 4
; CHECK: call void @llvm.amdgcn.s.barrier()
; CHECK: %tmp3 = load float, float addrspace(3)* getelementptr inbounds ([10 x float], [10 x float] addrspace(3)* @array, i32 0, i32 5), align 4
; CHECK: call void @use(float %tmp3)
; CHECK: store float %v, float addrspace(3)* getelementptr inbounds ([10 x float], [10 x float] addrspace(3)* @array, i32 0, i32 5), align 4
; CHECK: call void @llvm.amdgcn.s.barrier()
; CHECK: %tmp4 = getelementptr inbounds [10 x float], [10 x float] addrspace(3)* @array, i32 0, i32 5
; CHECK: %tmp5 = load float, float addrspace(3)* %tmp4, align 4
; CHECK: call void @use(float %tmp5)
; CHECK: store float %v, float addrspace(3)* %tmp4, align 4
; CHECK: call void @llvm.amdgcn.s.barrier()
; CHECK: %tmp7 = getelementptr inbounds [10 x float], [10 x float] addrspace(3)* @array, i32 0, i32 %i
; CHECK: %tmp8 = load float, float addrspace(3)* %tmp7, align 4
; CHECK: call void @use(float %tmp8)
; CHECK: store float %v, float addrspace(3)* %tmp7, align 4
; CHECK: call void @llvm.amdgcn.s.barrier()
; CHECK: ret void
define void @load_store_lds_f32(i32 %i, float %v) #0 {
bb:
  %tmp = load float, float addrspace(4)* addrspacecast (float addrspace(3)* @scalar to float addrspace(4)*), align 4
  call void @use(float %tmp)
  store float %v, float addrspace(4)* addrspacecast (float addrspace(3)* @scalar to float addrspace(4)*), align 4
  call void @llvm.amdgcn.s.barrier()
  %tmp1 = addrspacecast float addrspace(3)* @scalar to float addrspace(4)*
  %tmp2 = load float, float addrspace(4)* %tmp1, align 4
  call void @use(float %tmp2)
  store float %v, float addrspace(4)* %tmp1, align 4
  call void @llvm.amdgcn.s.barrier()
  %tmp3 = load float, float addrspace(4)* getelementptr inbounds ([10 x float], [10 x float] addrspace(4)* addrspacecast ([10 x float] addrspace(3)* @array to [10 x float] addrspace(4)*), i32 0, i32 5), align 4
  call void @use(float %tmp3)
  store float %v, float addrspace(4)* getelementptr inbounds ([10 x float], [10 x float] addrspace(4)* addrspacecast ([10 x float] addrspace(3)* @array to [10 x float] addrspace(4)*), i32 0, i32 5), align 4
  call void @llvm.amdgcn.s.barrier()
  %tmp4 = getelementptr inbounds [10 x float], [10 x float] addrspace(4)* addrspacecast ([10 x float] addrspace(3)* @array to [10 x float] addrspace(4)*), i32 0, i32 5
  %tmp5 = load float, float addrspace(4)* %tmp4, align 4
  call void @use(float %tmp5)
  store float %v, float addrspace(4)* %tmp4, align 4
  call void @llvm.amdgcn.s.barrier()
  %tmp6 = addrspacecast [10 x float] addrspace(3)* @array to [10 x float] addrspace(4)*
  %tmp7 = getelementptr inbounds [10 x float], [10 x float] addrspace(4)* %tmp6, i32 0, i32 %i
  %tmp8 = load float, float addrspace(4)* %tmp7, align 4
  call void @use(float %tmp8)
  store float %v, float addrspace(4)* %tmp7, align 4
  call void @llvm.amdgcn.s.barrier()
  ret void
}

; CHECK-LABEL: @constexpr_load_int_from_float_lds(
; CHECK: %tmp = load i32, i32 addrspace(3)* bitcast (float addrspace(3)* @scalar to i32 addrspace(3)*), align 4
define i32 @constexpr_load_int_from_float_lds() #0 {
bb:
  %tmp = load i32, i32 addrspace(4)* addrspacecast (i32 addrspace(3)* bitcast (float addrspace(3)* @scalar to i32 addrspace(3)*) to i32 addrspace(4)*), align 4
  ret i32 %tmp
}

; CHECK-LABEL: @load_int_from_global_float(
; CHECK: %tmp1 = getelementptr float, float addrspace(1)* %input, i32 %i
; CHECK: %tmp2 = getelementptr float, float addrspace(1)* %tmp1, i32 %j
; CHECK: %tmp3 = bitcast float addrspace(1)* %tmp2 to i32 addrspace(1)*
; CHECK: %tmp4 = load i32, i32 addrspace(1)* %tmp3
; CHECK: ret i32 %tmp4
define i32 @load_int_from_global_float(float addrspace(1)* %input, i32 %i, i32 %j) #0 {
bb:
  %tmp = addrspacecast float addrspace(1)* %input to float addrspace(4)*
  %tmp1 = getelementptr float, float addrspace(4)* %tmp, i32 %i
  %tmp2 = getelementptr float, float addrspace(4)* %tmp1, i32 %j
  %tmp3 = bitcast float addrspace(4)* %tmp2 to i32 addrspace(4)*
  %tmp4 = load i32, i32 addrspace(4)* %tmp3
  ret i32 %tmp4
}

; CHECK-LABEL: @nested_const_expr(
; CHECK: store i32 1, i32 addrspace(3)* bitcast (float addrspace(3)* getelementptr inbounds ([10 x float], [10 x float] addrspace(3)* @array, i64 0, i64 1) to i32 addrspace(3)*), align 4
define void @nested_const_expr() #0 {
  store i32 1, i32 addrspace(4)* bitcast (float addrspace(4)* getelementptr ([10 x float], [10 x float] addrspace(4)* addrspacecast ([10 x float] addrspace(3)* @array to [10 x float] addrspace(4)*), i64 0, i64 1) to i32 addrspace(4)*), align 4
  ret void
}

; CHECK-LABEL: @rauw(
; CHECK: %addr = getelementptr float, float addrspace(1)* %input, i64 10
; CHECK-NEXT: %v = load float, float addrspace(1)* %addr
; CHECK-NEXT: store float %v, float addrspace(1)* %addr
; CHECK-NEXT: ret void
define void @rauw(float addrspace(1)* %input) #0 {
bb:
  %generic_input = addrspacecast float addrspace(1)* %input to float addrspace(4)*
  %addr = getelementptr float, float addrspace(4)* %generic_input, i64 10
  %v = load float, float addrspace(4)* %addr
  store float %v, float addrspace(4)* %addr
  ret void
}

; FIXME: Should be able to eliminate the cast inside the loop
; CHECK-LABEL: @loop(

; CHECK: %p = bitcast [10 x float] addrspace(3)* @array to float addrspace(3)*
; CHECK: %0 = addrspacecast float addrspace(3)* %p to float addrspace(4)*
; CHECK: %end = getelementptr float, float addrspace(4)* %0, i64 10
; CHECK: br label %loop

; CHECK: loop: ; preds = %loop, %entry
; CHECK: %i = phi float addrspace(3)* [ %p, %entry ], [ %i2, %loop ]
; CHECK: %v = load float, float addrspace(3)* %i
; CHECK: call void @use(float %v)
; CHECK: %i2 = getelementptr float, float addrspace(3)* %i, i64 1
; CHECK: %1 = addrspacecast float addrspace(3)* %i2 to float addrspace(4)*
; CHECK: %exit_cond = icmp eq float addrspace(4)* %1, %end
; CHECK: br i1 %exit_cond, label %exit, label %loop
define void @loop() #0 {
entry:
  %p = addrspacecast [10 x float] addrspace(3)* @array to float addrspace(4)*
  %end = getelementptr float, float addrspace(4)* %p, i64 10
  br label %loop

loop: ; preds = %loop, %entry
  %i = phi float addrspace(4)* [ %p, %entry ], [ %i2, %loop ]
  %v = load float, float addrspace(4)* %i
  call void @use(float %v)
  %i2 = getelementptr float, float addrspace(4)* %i, i64 1
  %exit_cond = icmp eq float addrspace(4)* %i2, %end
  br i1 %exit_cond, label %exit, label %loop

exit: ; preds = %loop
  ret void
}

@generic_end = external addrspace(1) global float addrspace(4)*

; CHECK-LABEL: @loop_with_generic_bound(
; CHECK: %p = bitcast [10 x float] addrspace(3)* @array to float addrspace(3)*
; CHECK: %end = load float addrspace(4)*, float addrspace(4)* addrspace(1)* @generic_end
; CHECK: br label %loop

; CHECK: loop:
; CHECK: %i = phi float addrspace(3)* [ %p, %entry ], [ %i2, %loop ]
; CHECK: %v = load float, float addrspace(3)* %i
; CHECK: call void @use(float %v)
; CHECK: %i2 = getelementptr float, float addrspace(3)* %i, i64 1
; CHECK: %0 = addrspacecast float addrspace(3)* %i2 to float addrspace(4)*
; CHECK: %exit_cond = icmp eq float addrspace(4)* %0, %end
; CHECK: br i1 %exit_cond, label %exit, label %loop
define void @loop_with_generic_bound() #0 {
entry:
  %p = addrspacecast [10 x float] addrspace(3)* @array to float addrspace(4)*
  %end = load float addrspace(4)*, float addrspace(4)* addrspace(1)* @generic_end
  br label %loop

loop: ; preds = %loop, %entry
  %i = phi float addrspace(4)* [ %p, %entry ], [ %i2, %loop ]
  %v = load float, float addrspace(4)* %i
  call void @use(float %v)
  %i2 = getelementptr float, float addrspace(4)* %i, i64 1
  %exit_cond = icmp eq float addrspace(4)* %i2, %end
  br i1 %exit_cond, label %exit, label %loop

exit: ; preds = %loop
  ret void
}

declare void @llvm.amdgcn.s.barrier() #1
declare void @use(float) #0

attributes #0 = { nounwind }
attributes #1 = { convergent nounwind }
