|
 3 |  3 | ; Interleave loads and stores to fit into 9 VGPR limit.
 4 |  4 | ; This requires to avoid load/store clustering.
 5 |  5 |
   |  6 | +; Reschedule the second scheduling region without clustering while
   |  7 | +; the first region is skipped.
   |  8 | +
 6 |  9 | ; GCN: global_load_dwordx4
 7 | 10 | ; GCN: global_store_dwordx4
 8 | 11 | ; GCN: global_load_dwordx4

12 | 15 | ; GCN: NumVgprs: {{[0-9]$}}
13 | 16 | ; GCN: ScratchSize: 0{{$}}
14 | 17 |
15 |    | -define amdgpu_kernel void @load_store_max_9vgprs(<4 x i32> addrspace(1)* nocapture noalias readonly %arg, <4 x i32> addrspace(1)* nocapture noalias %arg1) #1 {
   | 18 | +define amdgpu_kernel void @load_store_max_9vgprs(<4 x i32> addrspace(1)* nocapture noalias readonly %arg, <4 x i32> addrspace(1)* nocapture noalias %arg1, i1 %cnd) #1 {
16 | 19 | bb:
17 | 20 |   %id = call i32 @llvm.amdgcn.workitem.id.x()
18 | 21 |   %base = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %arg, i32 %id
   | 22 | +  br i1 %cnd, label %bb1, label %bb2
   | 23 | +
   | 24 | +bb1:
19 | 25 |   %tmp = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %base, i32 1
20 | 26 |   %tmp2 = load <4 x i32>, <4 x i32> addrspace(1)* %tmp, align 4
21 | 27 |   %tmp3 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %base, i32 3

27 | 33 |   store <4 x i32> %tmp4, <4 x i32> addrspace(1)* %tmp7, align 4
28 | 34 |   %tmp8 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %arg1, i64 5
29 | 35 |   store <4 x i32> %tmp6, <4 x i32> addrspace(1)* %tmp8, align 4
   | 36 | +  br label %bb2
   | 37 | +
   | 38 | +bb2:
30 | 39 |   ret void
31 | 40 | }
32 | 41 |
|
|