Skip to content

Commit 1a300d6

Browse files
authored
AMDGPU: Add SourceOfDivergence for int_amdgcn_global_load_tr (#79218)
1 parent dc410f9 commit 1a300d6

File tree

2 files changed

+82
-0
lines changed

2 files changed

+82
-0
lines changed

llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -414,6 +414,7 @@ def : SourceOfDivergence<int_amdgcn_wmma_f16_16x16x16_f16>;
414414
def : SourceOfDivergence<int_amdgcn_wmma_bf16_16x16x16_bf16>;
415415
def : SourceOfDivergence<int_amdgcn_wmma_i32_16x16x16_iu8>;
416416
def : SourceOfDivergence<int_amdgcn_wmma_i32_16x16x16_iu4>;
417+
// Register int_amdgcn_global_load_tr as a source of divergence, alongside the
// WMMA intrinsics listed above.
def : SourceOfDivergence<int_amdgcn_global_load_tr>;
417418

418419
// The dummy boolean output is divergent from the IR's perspective,
419420
// but the mask results are uniform. These produce a divergent and

llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll

Lines changed: 81 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -109,6 +109,78 @@ bb:
109109
ret void
110110
}
111111

112+
; Uniformity test for the <2 x i32> overload of llvm.amdgcn.global.load.tr.
; The pointer operand is a parameter of the amdgpu_kernel function advanced by
; a constant GEP (index 4 over i64 elements); the loaded value is stored to
; %out. The directive below expects the call to be printed as divergent.
; NOTE(review): presumably the operand itself is uniform, so the divergence is
; attributed to the intrinsic rather than to its input -- confirm.
; CHECK: DIVERGENT: %tmp0 = call <2 x i32> @llvm.amdgcn.global.load.tr.v2i32(ptr addrspace(1) %gep)
113+
define amdgpu_kernel void @global_load_tr_b64_v2i32(ptr addrspace(1) %addr, ptr addrspace(1) %out) {
114+
bb:
115+
%gep = getelementptr i64, ptr addrspace(1) %addr, i32 4
116+
%tmp0 = call <2 x i32> @llvm.amdgcn.global.load.tr.v2i32(ptr addrspace(1) %gep)
117+
store <2 x i32> %tmp0, ptr addrspace(1) %out, align 8
118+
ret void
119+
}
120+
121+
; Uniformity test for the <8 x i16> overload of llvm.amdgcn.global.load.tr.
; The pointer operand is a parameter of the amdgpu_kernel function advanced by
; a constant GEP (index 4 over i64 elements); the loaded value is stored to
; %out with 16-byte alignment. The directive below expects the call to be
; printed as divergent.
; CHECK: DIVERGENT: %tmp0 = call <8 x i16> @llvm.amdgcn.global.load.tr.v8i16(ptr addrspace(1) %gep)
122+
define amdgpu_kernel void @global_load_tr_b128_v8i16(ptr addrspace(1) %addr, ptr addrspace(1) %out) {
123+
bb:
124+
%gep = getelementptr i64, ptr addrspace(1) %addr, i32 4
125+
%tmp0 = call <8 x i16> @llvm.amdgcn.global.load.tr.v8i16(ptr addrspace(1) %gep)
126+
store <8 x i16> %tmp0, ptr addrspace(1) %out, align 16
127+
ret void
128+
}
129+
130+
; Uniformity test for the <8 x half> overload of llvm.amdgcn.global.load.tr.
; The pointer operand is a parameter of the amdgpu_kernel function advanced by
; a constant GEP (index 4 over i64 elements); the loaded value is stored to
; %out with 16-byte alignment. The directive below expects the call to be
; printed as divergent.
; CHECK: DIVERGENT: %tmp0 = call <8 x half> @llvm.amdgcn.global.load.tr.v8f16(ptr addrspace(1) %gep)
131+
define amdgpu_kernel void @global_load_tr_b128_v8f16(ptr addrspace(1) %addr, ptr addrspace(1) %out) {
132+
bb:
133+
%gep = getelementptr i64, ptr addrspace(1) %addr, i32 4
134+
%tmp0 = call <8 x half> @llvm.amdgcn.global.load.tr.v8f16(ptr addrspace(1) %gep)
135+
store <8 x half> %tmp0, ptr addrspace(1) %out, align 16
136+
ret void
137+
}
138+
139+
; Uniformity test for the <8 x bfloat> overload of llvm.amdgcn.global.load.tr.
; The pointer operand is a parameter of the amdgpu_kernel function advanced by
; a constant GEP (index 4 over i64 elements); the loaded value is stored to
; %out with 16-byte alignment. The directive below expects the call to be
; printed as divergent.
; CHECK: DIVERGENT: %tmp0 = call <8 x bfloat> @llvm.amdgcn.global.load.tr.v8bf16(ptr addrspace(1) %gep)
140+
define amdgpu_kernel void @global_load_tr_b128_v8bf16(ptr addrspace(1) %addr, ptr addrspace(1) %out) {
141+
bb:
142+
%gep = getelementptr i64, ptr addrspace(1) %addr, i32 4
143+
%tmp0 = call <8 x bfloat> @llvm.amdgcn.global.load.tr.v8bf16(ptr addrspace(1) %gep)
144+
store <8 x bfloat> %tmp0, ptr addrspace(1) %out, align 16
145+
ret void
146+
}
147+
148+
; Uniformity test for the scalar i32 overload of llvm.amdgcn.global.load.tr.
; The pointer operand is a parameter of the amdgpu_kernel function advanced by
; a constant GEP (index 4 over i64 elements); the loaded value is stored to
; %out with 4-byte alignment. The directive below expects the call to be
; printed as divergent.
; CHECK: DIVERGENT: %tmp0 = call i32 @llvm.amdgcn.global.load.tr.i32(ptr addrspace(1) %gep)
149+
define amdgpu_kernel void @global_load_tr_b64_i32(ptr addrspace(1) %addr, ptr addrspace(1) %out) {
150+
bb:
151+
%gep = getelementptr i64, ptr addrspace(1) %addr, i32 4
152+
%tmp0 = call i32 @llvm.amdgcn.global.load.tr.i32(ptr addrspace(1) %gep)
153+
store i32 %tmp0, ptr addrspace(1) %out, align 4
154+
ret void
155+
}
156+
157+
; Uniformity test for the <4 x i16> overload of llvm.amdgcn.global.load.tr.
; The pointer operand is a parameter of the amdgpu_kernel function advanced by
; a constant GEP (index 4 over i64 elements); the loaded value is stored to
; %out with 8-byte alignment. The directive below expects the call to be
; printed as divergent.
; NOTE(review): the function name ends in a trailing underscore, unlike the
; sibling global_load_tr_* tests; it looks accidental -- consider renaming to
; @global_load_tr_b128_v4i16 if nothing depends on the current spelling.
; CHECK: DIVERGENT: %tmp0 = call <4 x i16> @llvm.amdgcn.global.load.tr.v4i16(ptr addrspace(1) %gep)
158+
define amdgpu_kernel void @global_load_tr_b128_v4i16_(ptr addrspace(1) %addr, ptr addrspace(1) %out) {
159+
bb:
160+
%gep = getelementptr i64, ptr addrspace(1) %addr, i32 4
161+
%tmp0 = call <4 x i16> @llvm.amdgcn.global.load.tr.v4i16(ptr addrspace(1) %gep)
162+
store <4 x i16> %tmp0, ptr addrspace(1) %out, align 8
163+
ret void
164+
}
165+
166+
; Uniformity test for the <4 x half> overload of llvm.amdgcn.global.load.tr.
; The pointer operand is a parameter of the amdgpu_kernel function advanced by
; a constant GEP (index 4 over i64 elements); the loaded value is stored to
; %out with 8-byte alignment. The directive below expects the call to be
; printed as divergent.
; CHECK: DIVERGENT: %tmp0 = call <4 x half> @llvm.amdgcn.global.load.tr.v4f16(ptr addrspace(1) %gep)
167+
define amdgpu_kernel void @global_load_tr_b128_v4f16(ptr addrspace(1) %addr, ptr addrspace(1) %out) {
168+
bb:
169+
%gep = getelementptr i64, ptr addrspace(1) %addr, i32 4
170+
%tmp0 = call <4 x half> @llvm.amdgcn.global.load.tr.v4f16(ptr addrspace(1) %gep)
171+
store <4 x half> %tmp0, ptr addrspace(1) %out, align 8
172+
ret void
173+
}
174+
175+
; Uniformity test for the <4 x bfloat> overload of llvm.amdgcn.global.load.tr.
; The pointer operand is a parameter of the amdgpu_kernel function advanced by
; a constant GEP (index 4 over i64 elements); the loaded value is stored to
; %out with 8-byte alignment. The directive below expects the call to be
; printed as divergent.
; CHECK: DIVERGENT: %tmp0 = call <4 x bfloat> @llvm.amdgcn.global.load.tr.v4bf16(ptr addrspace(1) %gep)
176+
define amdgpu_kernel void @global_load_tr_b128_v4bf16(ptr addrspace(1) %addr, ptr addrspace(1) %out) {
177+
bb:
178+
%gep = getelementptr i64, ptr addrspace(1) %addr, i32 4
179+
%tmp0 = call <4 x bfloat> @llvm.amdgcn.global.load.tr.v4bf16(ptr addrspace(1) %gep)
180+
store <4 x bfloat> %tmp0, ptr addrspace(1) %out, align 8
181+
ret void
182+
}
183+
112184
declare i32 @llvm.amdgcn.ds.swizzle(i32, i32) #1
113185
declare i32 @llvm.amdgcn.permlane16(i32, i32, i32, i32, i1, i1) #1
114186
declare i32 @llvm.amdgcn.permlanex16(i32, i32, i32, i32, i1, i1) #1
@@ -125,5 +197,14 @@ declare <16 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v16i16(<16 x i16>, <16 x
125197
declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32(i1 immarg, <4 x i32>, i1 immarg, <4 x i32> , <8 x i32>, i1 immarg) #1
126198
declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32(i1 immarg, <2 x i32>, i1 immarg, <2 x i32> , <8 x i32>, i1 immarg) #1
127199

200+
; Declarations of the llvm.amdgcn.global.load.tr overloads called by the
; global_load_tr_* functions in this file. Each takes a single addrspace(1)
; pointer and differs only in its return type. Unlike the neighbouring
; intrinsic declarations, no attribute group (e.g. #1) is attached here.
declare <2 x i32> @llvm.amdgcn.global.load.tr.v2i32(ptr addrspace(1))
201+
declare <8 x i16> @llvm.amdgcn.global.load.tr.v8i16(ptr addrspace(1))
202+
declare <8 x half> @llvm.amdgcn.global.load.tr.v8f16(ptr addrspace(1))
203+
declare <8 x bfloat> @llvm.amdgcn.global.load.tr.v8bf16(ptr addrspace(1))
204+
declare i32 @llvm.amdgcn.global.load.tr.i32(ptr addrspace(1))
205+
declare <4 x i16> @llvm.amdgcn.global.load.tr.v4i16(ptr addrspace(1))
206+
declare <4 x half> @llvm.amdgcn.global.load.tr.v4f16(ptr addrspace(1))
207+
declare <4 x bfloat> @llvm.amdgcn.global.load.tr.v4bf16(ptr addrspace(1))
208+
128209
attributes #0 = { nounwind convergent }
129210
attributes #1 = { nounwind readnone convergent }

0 commit comments

Comments
 (0)