Skip to content

Commit b3924cb

Browse files
[AMDGPU] Set Convergent property for image.(getlod/sample*) intrinsics which use WQM (#122908)
This change adds the IntrConvergent property to the image.getlod intrinsic and to several image.sample intrinsics. All image.sample intrinsics except the LOD (_L), Level 0 (_LZ), and Derivative (_D) variants will be marked as Convergent.
1 parent 04b002b commit b3924cb

File tree

2 files changed

+126
-12
lines changed

2 files changed

+126
-12
lines changed

llvm/include/llvm/IR/IntrinsicsAMDGPU.td

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -876,6 +876,8 @@ class AMDGPUSampleVariant<string ucmod, string lcmod, list<AMDGPUArg> extra_addr
876876
// Name of the {lod} or {clamp} argument that is appended to the coordinates,
877877
// if any.
878878
string LodOrClamp = "";
879+
880+
bit UsesWQM = false;
879881
}
880882

881883
// AMDGPUSampleVariants: all variants supported by IMAGE_SAMPLE
@@ -905,8 +907,9 @@ defset list<AMDGPUSampleVariant> AMDGPUSampleVariants = {
905907
}
906908

907909
defset list<AMDGPUSampleVariant> AMDGPUSampleVariantsNoGradients = {
910+
let UsesWQM = true in
908911
defm AMDGPUSample : AMDGPUSampleHelper_Clamp<"", "", []>;
909-
let Bias = true in
912+
let Bias = true, UsesWQM = true in
910913
defm AMDGPUSample : AMDGPUSampleHelper_Clamp<
911914
"_B", "_b", [AMDGPUArg<llvm_anyfloat_ty, "bias">]>;
912915
let LodOrClamp = "lod" in
@@ -1172,7 +1175,8 @@ defset list<AMDGPUImageDimIntrinsic> AMDGPUImageDimIntrinsics = {
11721175
foreach dim = AMDGPUDims.NoMsaa in {
11731176
def !strconcat(NAME, "_", dim.Name) : AMDGPUImageDimIntrinsic<
11741177
AMDGPUDimSampleProfile<opmod, dim, sample>,
1175-
!if(NoMem, [IntrNoMem], [IntrReadMem]),
1178+
!listconcat(!if(NoMem, [IntrNoMem], [IntrReadMem]),
1179+
!if(sample.UsesWQM, [IntrConvergent], [])),
11761180
!if(NoMem, [], [SDNPMemOperand])>;
11771181
}
11781182
}
@@ -1188,7 +1192,8 @@ defset list<AMDGPUImageDimIntrinsic> AMDGPUImageDimIntrinsics = {
11881192
foreach dim = AMDGPUDims.NoMsaa in {
11891193
def !strconcat(NAME, "_", dim.Name, "_nortn") : AMDGPUImageDimIntrinsic<
11901194
AMDGPUDimSampleNoReturnProfile<opmod, dim, sample>,
1191-
[IntrWillReturn], [SDNPMemOperand]>;
1195+
!listconcat([IntrWillReturn], !if(sample.UsesWQM, [IntrConvergent], [])),
1196+
[SDNPMemOperand]>;
11921197
}
11931198
}
11941199
foreach sample = AMDGPUSampleVariants in {

llvm/test/CodeGen/AMDGPU/sink-image-sample.ll

Lines changed: 118 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,34 +1,143 @@
11
; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN %s
22
; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN %s
33

4-
; Test that image.sample instruction is sunk across the branch and not left in the first block. Since the kill may terminate the shader there might be no need for sampling the image.
4+
; Test that the image.sample LOD (_L), Level 0 (_LZ), and Derivative (_D) instructions are sunk across the branch and not left in the first block. Since the kill may terminate the shader, there might be no need to sample the image.
55

66
; GCN-LABEL: {{^}}sinking_img_sample:
7-
; GCN-NOT: image_sample
7+
; GCN-NOT: image_sample_l v
8+
; GCN-NOT: image_sample_lz v
9+
; GCN-NOT: image_sample_c_lz v
10+
; GCN-NOT: image_sample_c_l v
11+
; GCN-NOT: image_sample_d v
12+
; GCN-NOT: image_sample_c_d v
13+
; GCN-NOT: image_sample_d_cl v
14+
; GCN-NOT: image_sample_c_d_cl v
815
; GCN: branch
9-
; GCN: image_sample
16+
; GCN: image_sample_l v
17+
; GCN: image_sample_lz v
18+
; GCN: image_sample_c_lz v
19+
; GCN: image_sample_c_l v
20+
; GCN: image_sample_d v
21+
; GCN: image_sample_c_d v
22+
; GCN: image_sample_d_cl v
23+
; GCN: image_sample_c_d_cl v
1024
; GCN: exp null
1125

12-
define amdgpu_ps float @sinking_img_sample() {
26+
define amdgpu_ps float @sinking_img_sample(i1 %cond) {
1327
main_body:
14-
%i = call <3 x float> @llvm.amdgcn.image.sample.2d.v3f32.f32(i32 7, float undef, float undef, <8 x i32> undef, <4 x i32> undef, i1 false, i32 0, i32 0)
15-
br i1 undef, label %endif1, label %if1
28+
%i1 = call <3 x float> @llvm.amdgcn.image.sample.l.2d.v3f32.f32(i32 7, float poison, float poison, float poison, <8 x i32> poison, <4 x i32> poison, i1 false, i32 0, i32 0)
29+
%i2 = call <3 x float> @llvm.amdgcn.image.sample.lz.2d.v3f32.f32(i32 7, float poison, float poison, <8 x i32> poison, <4 x i32> poison, i1 false, i32 0, i32 0)
30+
%i3 = call <3 x float> @llvm.amdgcn.image.sample.c.lz.2d.v3f32.f32(i32 7, float poison, float poison, float poison, <8 x i32> poison, <4 x i32> poison, i1 false, i32 0, i32 0)
31+
%i4 = call <3 x float> @llvm.amdgcn.image.sample.c.l.2d.v3f32.f32(i32 7, float poison, float poison, float poison, float poison, <8 x i32> poison, <4 x i32> poison, i1 false, i32 0, i32 0)
32+
%i5 = call <3 x float> @llvm.amdgcn.image.sample.d.2d.v3f32.f32(i32 7, float poison, float poison, float poison, float poison, float poison, float poison, <8 x i32> poison, <4 x i32> poison, i1 false, i32 0, i32 0)
33+
%i6 = call <3 x float> @llvm.amdgcn.image.sample.c.d.2d.v3f32.f32(i32 7, float poison, float poison, float poison, float poison, float poison, float poison, float poison, <8 x i32> poison, <4 x i32> poison, i1 false, i32 0, i32 0)
34+
%i7 = call <3 x float> @llvm.amdgcn.image.sample.d.cl.2d.v3f32.f32(i32 7, float poison, float poison, float poison, float poison, float poison, float poison, float poison, <8 x i32> poison, <4 x i32> poison, i1 false, i32 0, i32 0)
35+
%i8 = call <3 x float> @llvm.amdgcn.image.sample.c.d.cl.2d.v3f32.f32(i32 7, float poison, float poison, float poison, float poison, float poison, float poison, float poison, float poison, <8 x i32> poison, <4 x i32> poison, i1 false, i32 0, i32 0)
36+
br i1 %cond, label %endif1, label %if1
1637

1738
if1: ; preds = %main_body
1839
call void @llvm.amdgcn.kill(i1 false) #4
1940
br label %exit
2041

2142
endif1: ; preds = %main_body
22-
%i22 = extractelement <3 x float> %i, i32 2
43+
%i22 = extractelement <3 x float> %i1, i32 1
2344
%i23 = call nsz arcp contract float @llvm.fma.f32(float %i22, float 0.000000e+00, float 0.000000e+00) #1
45+
%i24 = extractelement <3 x float> %i2, i32 1
46+
%i25 = call nsz arcp contract float @llvm.fma.f32(float %i23, float %i24, float 0.000000e+00) #1
47+
%i26 = extractelement <3 x float> %i3, i32 1
48+
%i27 = call nsz arcp contract float @llvm.fma.f32(float %i25, float %i26, float 0.000000e+00) #1
49+
%i28 = extractelement <3 x float> %i4, i32 1
50+
%i29 = call nsz arcp contract float @llvm.fma.f32(float %i27, float %i28, float 0.000000e+00) #1
51+
%i30 = extractelement <3 x float> %i5, i32 1
52+
%i31 = call nsz arcp contract float @llvm.fma.f32(float %i29, float %i30, float 0.000000e+00) #1
53+
%i32 = extractelement <3 x float> %i6, i32 1
54+
%i33 = call nsz arcp contract float @llvm.fma.f32(float %i31, float %i32, float 0.000000e+00) #1
55+
%i34 = extractelement <3 x float> %i7, i32 1
56+
%i35 = call nsz arcp contract float @llvm.fma.f32(float %i33, float %i34, float 0.000000e+00) #1
57+
%i36 = extractelement <3 x float> %i8, i32 1
58+
%i37 = call nsz arcp contract float @llvm.fma.f32(float %i35, float %i36, float 0.000000e+00) #1
2459
br label %exit
2560

2661
exit: ; preds = %endif1, %if1
27-
%i24 = phi float [ undef, %if1 ], [ %i23, %endif1 ]
28-
ret float %i24
62+
%i38 = phi float [ poison, %if1 ], [ %i37, %endif1 ]
63+
ret float %i38
2964
}
65+
66+
67+
; Test that image.sample instructions which use WQM are marked as Convergent and will be left in the first block.
68+
69+
; GCN-LABEL: {{^}}no_sinking_img_sample:
70+
; GCN: image_sample v
71+
; GCN: image_sample_c v
72+
; GCN: image_sample_cl v
73+
; GCN: image_sample_c_cl v
74+
; GCN: image_sample_b v
75+
; GCN: image_sample_c_b v
76+
; GCN: image_sample_b_cl v
77+
; GCN: branch
78+
; GCN-NOT: image_sample v
79+
; GCN-NOT: image_sample_c v
80+
; GCN-NOT: image_sample_cl v
81+
; GCN-NOT: image_sample_c_cl v
82+
; GCN-NOT: image_sample_b v
83+
; GCN-NOT: image_sample_c_b v
84+
; GCN-NOT: image_sample_b_cl v
85+
; GCN: exp null
86+
87+
define amdgpu_ps float @no_sinking_img_sample(i1 %cond) {
88+
main_body:
89+
%i1 = call <3 x float> @llvm.amdgcn.image.sample.2d.v3f32.f32(i32 7, float poison, float poison, <8 x i32> poison, <4 x i32> poison, i1 false, i32 0, i32 0)
90+
%i2 = call <3 x float> @llvm.amdgcn.image.sample.c.2d.v3f32.f32(i32 7, float poison, float poison, float poison, <8 x i32> poison, <4 x i32> poison, i1 false, i32 0, i32 0)
91+
%i3 = call <3 x float> @llvm.amdgcn.image.sample.cl.2d.v3f32.f32(i32 7, float poison, float poison, float poison, <8 x i32> poison, <4 x i32> poison, i1 false, i32 0, i32 0)
92+
%i4 = call <3 x float> @llvm.amdgcn.image.sample.c.cl.2d.v3f32.f32(i32 7, float poison, float poison, float poison, float poison, <8 x i32> poison, <4 x i32> poison, i1 false, i32 0, i32 0)
93+
%i5 = call <3 x float> @llvm.amdgcn.image.sample.b.2d.v3f32.f32(i32 7, float poison, float poison, float poison, <8 x i32> poison, <4 x i32> poison, i1 false, i32 0, i32 0)
94+
%i6 = call <3 x float> @llvm.amdgcn.image.sample.c.b.2d.v3f32.f32(i32 7, float poison, float poison, float poison, float poison, <8 x i32> poison, <4 x i32> poison, i1 false, i32 0, i32 0)
95+
%i7 = call <3 x float> @llvm.amdgcn.image.sample.b.cl.2d.v3f32.f32.f32(i32 7, float poison, float poison, float poison, float poison, <8 x i32> poison, <4 x i32> poison, i1 false, i32 0, i32 0)
96+
br i1 %cond, label %endif1, label %if1
97+
98+
if1: ; preds = %main_body
99+
call void @llvm.amdgcn.kill(i1 false) #4
100+
br label %exit
101+
102+
endif1: ; preds = %main_body
103+
%i22 = extractelement <3 x float> %i1, i32 2
104+
%i23 = call nsz arcp contract float @llvm.fma.f32(float %i22, float 0.000000e+00, float 0.000000e+00) #1
105+
%i24 = extractelement <3 x float> %i2, i32 2
106+
%i25 = call nsz arcp contract float @llvm.fma.f32(float %i23, float %i24, float 0.000000e+00) #1
107+
%i26 = extractelement <3 x float> %i3, i32 2
108+
%i27 = call nsz arcp contract float @llvm.fma.f32(float %i25, float %i26, float 0.000000e+00) #1
109+
%i28 = extractelement <3 x float> %i4, i32 2
110+
%i29 = call nsz arcp contract float @llvm.fma.f32(float %i27, float %i28, float 0.000000e+00) #1
111+
%i30 = extractelement <3 x float> %i5, i32 2
112+
%i31 = call nsz arcp contract float @llvm.fma.f32(float %i29, float %i30, float 0.000000e+00) #1
113+
%i32 = extractelement <3 x float> %i6, i32 2
114+
%i33 = call nsz arcp contract float @llvm.fma.f32(float %i31, float %i32, float 0.000000e+00) #1
115+
%i34 = extractelement <3 x float> %i7, i32 2
116+
%i35 = call nsz arcp contract float @llvm.fma.f32(float %i33, float %i34, float 0.000000e+00) #1
117+
br label %exit
118+
119+
exit: ; preds = %endif1, %if1
120+
%i36 = phi float [ poison, %if1 ], [ %i35, %endif1 ]
121+
ret float %i36
122+
}
123+
30124
; Function Attrs: nounwind readonly willreturn
31125
declare <3 x float> @llvm.amdgcn.image.sample.2d.v3f32.f32(i32 immarg, float, float, <8 x i32>, <4 x i32>, i1 immarg, i32 immarg, i32 immarg) #3
126+
declare <3 x float> @llvm.amdgcn.image.sample.c.2d.v3f32.f32(i32 immarg, float, float, float, <8 x i32>, <4 x i32>, i1 immarg, i32 immarg, i32 immarg) #3
127+
declare <3 x float> @llvm.amdgcn.image.sample.cl.2d.v3f32.f32(i32 immarg, float, float, float, <8 x i32>, <4 x i32>, i1 immarg, i32 immarg, i32 immarg) #3
128+
declare <3 x float> @llvm.amdgcn.image.sample.c.cl.2d.v3f32.f32(i32 immarg, float, float, float, float, <8 x i32>, <4 x i32>, i1 immarg, i32 immarg, i32 immarg) #3
129+
declare <3 x float> @llvm.amdgcn.image.sample.b.2d.v3f32.f32(i32 immarg, float, float, float, <8 x i32>, <4 x i32>, i1 immarg, i32 immarg, i32 immarg) #3
130+
declare <3 x float> @llvm.amdgcn.image.sample.c.b.2d.v3f32.f32(i32 immarg, float, float, float, float, <8 x i32>, <4 x i32>, i1 immarg, i32 immarg, i32 immarg) #3
131+
declare <3 x float> @llvm.amdgcn.image.sample.b.cl.2d.v3f32.f32.f32(i32 immarg, float, float, float, float, <8 x i32>, <4 x i32>, i1 immarg, i32 immarg, i32 immarg) #3
132+
declare <3 x float> @llvm.amdgcn.image.sample.c.b.cl.2d.v3f32.f32(i32 immarg, float, float, float, float, float, <8 x i32>, <4 x i32>, i1 immarg, i32 immarg, i32 immarg) #3
133+
declare <3 x float> @llvm.amdgcn.image.sample.l.2d.v3f32.f32(i32 immarg, float, float, float, <8 x i32>, <4 x i32>, i1 immarg, i32 immarg, i32 immarg) #3
134+
declare <3 x float> @llvm.amdgcn.image.sample.lz.2d.v3f32.f32(i32 immarg, float, float, <8 x i32>, <4 x i32>, i1 immarg, i32 immarg, i32 immarg) #3
135+
declare <3 x float> @llvm.amdgcn.image.sample.c.lz.2d.v3f32.f32(i32 immarg, float, float, float, <8 x i32>, <4 x i32>, i1 immarg, i32 immarg, i32 immarg) #3
136+
declare <3 x float> @llvm.amdgcn.image.sample.c.l.2d.v3f32.f32(i32 immarg, float, float, float, float, <8 x i32>, <4 x i32>, i1 immarg, i32 immarg, i32 immarg) #3
137+
declare <3 x float> @llvm.amdgcn.image.sample.d.2d.v3f32.f32.f32(i32 immarg, float, float, float, float, float, float, <8 x i32>, <4 x i32>, i1 immarg, i32 immarg, i32 immarg) #3
138+
declare <3 x float> @llvm.amdgcn.image.sample.c.d.2d.v3f32.f32.f32(i32 immarg, float, float, float, float, float, float, float, <8 x i32>, <4 x i32>, i1 immarg, i32 immarg, i32 immarg) #3
139+
declare <3 x float> @llvm.amdgcn.image.sample.d.cl.2d.v3f32.f32.f32(i32 immarg, float, float, float, float, float, float, float, <8 x i32>, <4 x i32>, i1 immarg, i32 immarg, i32 immarg) #3
140+
declare <3 x float> @llvm.amdgcn.image.sample.c.d.cl.2d.v3f32.f32.f32(i32 immarg, float, float, float, float, float, float, float, float, <8 x i32>, <4 x i32>, i1 immarg, i32 immarg, i32 immarg) #3
32141

33142
; Function Attrs: nofree nosync nounwind readnone speculatable willreturn
34143
declare float @llvm.fma.f32(float, float, float) #2

0 commit comments

Comments
 (0)