|
1 | 1 | ; RUN: llc -march=amdgcn -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
|
2 | 2 |
|
3 | 3 | ; GCN-LABEL: {{^}}test_mfma_loop_zeroinit:
|
4 |
| -; GCN-COUNT32: v_accvgpr_write_b32 |
| 4 | + |
| 5 | +; Check that we do not use 32 temp vgprs, but rotate 3 vgprs only. |
| 6 | +; 3 vgprs are needed to avoid wait states between writes. |
| 7 | + |
| 8 | +; FIXME: We should not be using any temporary registers at all. |
| 9 | +; At the moment we initialize an sgpr, then copy it via vgprs. |
| 10 | + |
| 11 | +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2:v[0-9]+]] |
| 12 | +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP3:v[0-9]+]] |
| 13 | + |
| 14 | +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1:v[0-9]+]] |
| 15 | +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]] |
| 16 | +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP3]] |
| 17 | + |
| 18 | +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1]] |
| 19 | +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]] |
| 20 | +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP3]] |
| 21 | + |
| 22 | +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1]] |
| 23 | +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]] |
| 24 | +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP3]] |
| 25 | + |
| 26 | +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1]] |
| 27 | +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]] |
| 28 | +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP3]] |
| 29 | + |
| 30 | +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1]] |
| 31 | +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]] |
| 32 | +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP3]] |
| 33 | + |
| 34 | +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1]] |
| 35 | +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]] |
| 36 | +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP3]] |
| 37 | + |
| 38 | +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1]] |
| 39 | +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]] |
| 40 | +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP3]] |
| 41 | + |
| 42 | +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1]] |
| 43 | +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]] |
| 44 | +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP3]] |
| 45 | + |
| 46 | +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1]] |
| 47 | +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]] |
| 48 | +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP3]] |
| 49 | + |
| 50 | +; Check that we do not copy agprs to vgprs and back inside the loop. |
| 51 | + |
5 | 52 | ; GCN: [[LOOP:BB[0-9_]+]]:
|
6 | 53 | ; GCN-NOT: v_accvgpr
|
7 | 54 | ; GCN: v_mfma_f32_32x32x1f32
|
8 | 55 | ; GCN-NOT: v_accvgpr
|
9 | 56 | ; GCN: s_cbranch_scc1 [[LOOP]]
|
| 57 | + |
| 58 | +; Final result should be read only once after the loop. |
| 59 | + |
10 | 60 | ; GCN-COUNT-32: v_accvgpr_read_b32
|
| 61 | + |
11 | 62 | define amdgpu_kernel void @test_mfma_loop_zeroinit(<32 x float> addrspace(1)* %arg) {
|
12 | 63 | entry:
|
13 | 64 | br label %for.cond.preheader
|
|
0 commit comments