  ret void
}

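+ ; The llvm.amdgcn.global.load.tr.* intrinsics perform a lane-transposed load,
+ ; so each lane receives different data; the DIVERGENT check lines below assert
+ ; that UniformityAnalysis reports their results accordingly.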
+ ; CHECK: DIVERGENT: %tmp0 = call <2 x i32> @llvm.amdgcn.global.load.tr.v2i32(ptr addrspace(1) %gep)
+ define amdgpu_kernel void @global_load_tr_b64_v2i32(ptr addrspace(1) %addr, ptr addrspace(1) %out) {
+ bb:
+   %gep = getelementptr i64, ptr addrspace(1) %addr, i32 4
+   %tmp0 = call <2 x i32> @llvm.amdgcn.global.load.tr.v2i32(ptr addrspace(1) %gep)
+   store <2 x i32> %tmp0, ptr addrspace(1) %out, align 8
+   ret void
+ }
+
+ ; CHECK: DIVERGENT: %tmp0 = call <8 x i16> @llvm.amdgcn.global.load.tr.v8i16(ptr addrspace(1) %gep)
+ define amdgpu_kernel void @global_load_tr_b128_v8i16(ptr addrspace(1) %addr, ptr addrspace(1) %out) {
+ bb:
+   %gep = getelementptr i64, ptr addrspace(1) %addr, i32 4
+   %tmp0 = call <8 x i16> @llvm.amdgcn.global.load.tr.v8i16(ptr addrspace(1) %gep)
+   store <8 x i16> %tmp0, ptr addrspace(1) %out, align 16
+   ret void
+ }
+
+ ; CHECK: DIVERGENT: %tmp0 = call <8 x half> @llvm.amdgcn.global.load.tr.v8f16(ptr addrspace(1) %gep)
+ define amdgpu_kernel void @global_load_tr_b128_v8f16(ptr addrspace(1) %addr, ptr addrspace(1) %out) {
+ bb:
+   %gep = getelementptr i64, ptr addrspace(1) %addr, i32 4
+   %tmp0 = call <8 x half> @llvm.amdgcn.global.load.tr.v8f16(ptr addrspace(1) %gep)
+   store <8 x half> %tmp0, ptr addrspace(1) %out, align 16
+   ret void
+ }
+
+ ; CHECK: DIVERGENT: %tmp0 = call <8 x bfloat> @llvm.amdgcn.global.load.tr.v8bf16(ptr addrspace(1) %gep)
+ define amdgpu_kernel void @global_load_tr_b128_v8bf16(ptr addrspace(1) %addr, ptr addrspace(1) %out) {
+ bb:
+   %gep = getelementptr i64, ptr addrspace(1) %addr, i32 4
+   %tmp0 = call <8 x bfloat> @llvm.amdgcn.global.load.tr.v8bf16(ptr addrspace(1) %gep)
+   store <8 x bfloat> %tmp0, ptr addrspace(1) %out, align 16
+   ret void
+ }
+
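+ ; The i32 and <4 x ...> variants below are presumably the wave64 forms of the
+ ; same b64/b128 transposed loads; their results are likewise divergent.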
+ ; CHECK: DIVERGENT: %tmp0 = call i32 @llvm.amdgcn.global.load.tr.i32(ptr addrspace(1) %gep)
+ define amdgpu_kernel void @global_load_tr_b64_i32(ptr addrspace(1) %addr, ptr addrspace(1) %out) {
+ bb:
+   %gep = getelementptr i64, ptr addrspace(1) %addr, i32 4
+   %tmp0 = call i32 @llvm.amdgcn.global.load.tr.i32(ptr addrspace(1) %gep)
+   store i32 %tmp0, ptr addrspace(1) %out, align 4
+   ret void
+ }
+
+ ; CHECK: DIVERGENT: %tmp0 = call <4 x i16> @llvm.amdgcn.global.load.tr.v4i16(ptr addrspace(1) %gep)
+ define amdgpu_kernel void @global_load_tr_b128_v4i16(ptr addrspace(1) %addr, ptr addrspace(1) %out) {
+ bb:
+   %gep = getelementptr i64, ptr addrspace(1) %addr, i32 4
+   %tmp0 = call <4 x i16> @llvm.amdgcn.global.load.tr.v4i16(ptr addrspace(1) %gep)
+   store <4 x i16> %tmp0, ptr addrspace(1) %out, align 8
+   ret void
+ }
+
+ ; CHECK: DIVERGENT: %tmp0 = call <4 x half> @llvm.amdgcn.global.load.tr.v4f16(ptr addrspace(1) %gep)
+ define amdgpu_kernel void @global_load_tr_b128_v4f16(ptr addrspace(1) %addr, ptr addrspace(1) %out) {
+ bb:
+   %gep = getelementptr i64, ptr addrspace(1) %addr, i32 4
+   %tmp0 = call <4 x half> @llvm.amdgcn.global.load.tr.v4f16(ptr addrspace(1) %gep)
+   store <4 x half> %tmp0, ptr addrspace(1) %out, align 8
+   ret void
+ }
+
+ ; CHECK: DIVERGENT: %tmp0 = call <4 x bfloat> @llvm.amdgcn.global.load.tr.v4bf16(ptr addrspace(1) %gep)
+ define amdgpu_kernel void @global_load_tr_b128_v4bf16(ptr addrspace(1) %addr, ptr addrspace(1) %out) {
+ bb:
+   %gep = getelementptr i64, ptr addrspace(1) %addr, i32 4
+   %tmp0 = call <4 x bfloat> @llvm.amdgcn.global.load.tr.v4bf16(ptr addrspace(1) %gep)
+   store <4 x bfloat> %tmp0, ptr addrspace(1) %out, align 8
+   ret void
+ }
+
declare i32 @llvm.amdgcn.ds.swizzle(i32, i32) #1
declare i32 @llvm.amdgcn.permlane16(i32, i32, i32, i32, i1, i1) #1
declare i32 @llvm.amdgcn.permlanex16(i32, i32, i32, i32, i1, i1) #1
@@ -125,5 +197,14 @@ declare <16 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v16i16(<16 x i16>, <16 x
declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32(i1 immarg, <4 x i32>, i1 immarg, <4 x i32>, <8 x i32>, i1 immarg) #1
declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32(i1 immarg, <2 x i32>, i1 immarg, <2 x i32>, <8 x i32>, i1 immarg) #1

+ declare <2 x i32> @llvm.amdgcn.global.load.tr.v2i32(ptr addrspace(1))
+ declare <8 x i16> @llvm.amdgcn.global.load.tr.v8i16(ptr addrspace(1))
+ declare <8 x half> @llvm.amdgcn.global.load.tr.v8f16(ptr addrspace(1))
+ declare <8 x bfloat> @llvm.amdgcn.global.load.tr.v8bf16(ptr addrspace(1))
+ declare i32 @llvm.amdgcn.global.load.tr.i32(ptr addrspace(1))
+ declare <4 x i16> @llvm.amdgcn.global.load.tr.v4i16(ptr addrspace(1))
+ declare <4 x half> @llvm.amdgcn.global.load.tr.v4f16(ptr addrspace(1))
+ declare <4 x bfloat> @llvm.amdgcn.global.load.tr.v4bf16(ptr addrspace(1))
+
attributes #0 = { nounwind convergent }
attributes #1 = { nounwind readnone convergent }