- ; RUN: llc -mtriple=amdgcn -mcpu=gfx942 --stop-after=si-fix-sgpr-copies < %s | FileCheck %s
+ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+ ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 < %s | FileCheck %s
; iglp.opt should not be flagged as clobbering the memory operand for the global_load, and we should be able to
; lower into the scalar version (i.e. should not need to lower into vector version with waterfall loop)
- ; CHECK-NOT: WATERFALL
- define amdgpu_kernel void @_attn_forward_fp8e5_128x32x64_BW128(ptr addrspace(1) %in, ptr addrspace(3) %out) {
+ define amdgpu_kernel void @func(ptr addrspace(1) %in, ptr addrspace(3) %out) {
+ ; CHECK-LABEL: func:
+ ; CHECK: ; %bb.0: ; %.lr.ph
+ ; CHECK-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
+ ; CHECK-NEXT: s_mov_b64 s[8:9], 0
+ ; CHECK-NEXT: s_mov_b64 s[10:11], 0
+ ; CHECK-NEXT: s_mov_b32 s3, 32
+ ; CHECK-NEXT: s_mov_b32 s2, 0
+ ; CHECK-NEXT: s_mov_b64 s[12:13], 0
+ ; CHECK-NEXT: .LBB0_1: ; %loop
+ ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
+ ; CHECK-NEXT: s_mov_b64 s[0:1], s[10:11]
+ ; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+ ; CHECK-NEXT: s_add_u32 s10, s6, s12
+ ; CHECK-NEXT: s_addc_u32 s11, s7, s13
+ ; CHECK-NEXT: s_load_dwordx2 s[12:13], s[8:9], 0x0
+ ; CHECK-NEXT: s_add_i32 s3, s3, -1
+ ; CHECK-NEXT: s_cmp_lg_u32 s3, 0
+ ; CHECK-NEXT: ; iglp_opt mask(0x00000000)
+ ; CHECK-NEXT: s_cbranch_scc1 .LBB0_1
+ ; CHECK-NEXT: ; %bb.2: ; %end
+ ; CHECK-NEXT: s_and_b32 s1, s1, 0xffff
+ ; CHECK-NEXT: s_mov_b32 s3, s2
+ ; CHECK-NEXT: buffer_load_dwordx2 v[0:1], off, s[0:3], 0
+ ; CHECK-NEXT: s_load_dword s0, s[4:5], 0x8
+ ; CHECK-NEXT: s_waitcnt vmcnt(0)
+ ; CHECK-NEXT: v_mov_b32_e32 v1, 0
+ ; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+ ; CHECK-NEXT: v_mov_b32_e32 v2, s0
+ ; CHECK-NEXT: v_and_b32_e32 v0, 0xff, v0
+ ; CHECK-NEXT: ds_write_b64 v2, v[0:1]
+ ; CHECK-NEXT: s_endpgm
.lr.ph:
- br label %1
+ br label %loop
- 1: ; preds = %1, %.lr.ph
- %addr = phi ptr addrspace(1) [ null, %.lr.ph ], [ %gep, %1 ]
- %offset = phi i64 [ 0, %.lr.ph ], [ %nextOff, %1 ]
- %inc = phi i32 [ 0, %.lr.ph ], [ %incCond, %1 ]
+ loop: ; preds = %loop, %.lr.ph
+ %addr = phi ptr addrspace(1) [ null, %.lr.ph ], [ %gep, %loop ]
+ %offset = phi i64 [ 0, %.lr.ph ], [ %nextOff, %loop ]
+ %inc = phi i32 [ 0, %.lr.ph ], [ %incCond, %loop ]
%rsrc = tail call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p1(ptr addrspace(1) %addr, i16 0, i32 0, i32 0)
%load = tail call <2 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v2i32(ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0)
%load.bc = bitcast <2 x i32> %load to <8 x i8>
@@ -25,15 +56,13 @@ define amdgpu_kernel void @_attn_forward_fp8e5_128x32x64_BW128(ptr addrspace(1)
%nextOff = extractelement <1 x i64> %unmaskedload49, i64 0
%incCond = add i32 %inc, 1
%cond = icmp eq i32 %incCond, 32
- br i1 %cond, label %2, label %1
+ br i1 %cond, label %end, label %loop
- 2:
+ end:
store <4 x half> %shuff, ptr addrspace(3) %out, align 8
ret void
}
- ; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
declare ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p1(ptr addrspace(1) readnone, i16, i32, i32) #0
- ; Function Attrs: nocallback nofree nosync nounwind willreturn memory(argmem: read)
declare <2 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v2i32(ptr addrspace(8) nocapture readonly, i32, i32, i32 immarg) #1