|
1 | 1 | ; RUN: llc -march=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GFX90A %s
|
| 2 | +; RUN: llc -march=amdgcn -mcpu=gfx940 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GFX940 %s |
2 | 3 |
|
3 | 4 | declare <32 x float> @llvm.amdgcn.mfma.f32.32x32x4bf16.1k(<4 x i16>, <4 x i16>, <32 x float>, i32, i32, i32)
|
4 | 5 | declare <16 x float> @llvm.amdgcn.mfma.f32.16x16x4bf16.1k(<4 x i16>, <4 x i16>, <16 x float>, i32, i32, i32)
|
@@ -44,9 +45,13 @@ declare i32 @llvm.amdgcn.workitem.id.x()
|
44 | 45 | ; GFX90A-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
|
45 | 46 | ; GFX90A-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
|
46 | 47 | ; GFX90A-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
|
| 48 | +; GFX940-DAG: v_mov_b32_e32 v[[TWO:[0-9]+]], 2 |
| 49 | +; GFX940-DAG: v_mov_b32_e32 v[[ONE:[0-9]+]], 1 |
| 50 | +; GFX940-COUNT-32: v_accvgpr_write_b32 a{{[0-9]+}}, s{{[0-9]+}} |
47 | 51 | ; GFX90A-DAG: v_mov_b32_e32 v[[TWO:[0-9]+]], 2
|
48 | 52 | ; GFX90A-DAG: v_mov_b32_e32 v[[ONE:[0-9]+]], 1
|
49 | 53 | ; GFX90A: v_mfma_f32_32x32x4bf16_1k a[{{[0-9]+:[0-9]+}}], v[[[ONE]]:{{[0-9]+}}], v[[[TWO]]:{{[0-9]+}}], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
|
| 54 | +; GFX940: v_mfma_f32_32x32x4_2b_bf16 a[{{[0-9]+:[0-9]+}}], v{{\[}}[[ONE]]:{{[0-9]+}}], v{{\[}}[[TWO]]:{{[0-9]+}}], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3 |
50 | 55 | ; GCN-NOT: v_accvgpr_read_b32
|
51 | 56 | ; GCN-COUNT-8: global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}]
|
52 | 57 | define amdgpu_kernel void @test_mfma_f32_32x32x4bf16_1k(<32 x float> addrspace(1)* %arg) #0 {
|
|
64 | 69 | ; GCN-DAG: v_mov_b32_e32 v[[TWO:[0-9]+]], 2
|
65 | 70 | ; GCN-DAG: v_mov_b32_e32 v[[ONE:[0-9]+]], 1
|
66 | 71 | ; GFX90A-COUNT-16: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
|
| 72 | +; GFX940-COUNT-16: v_accvgpr_write_b32 a{{[0-9]+}}, s{{[0-9]+}} |
67 | 73 | ; GFX90A: v_mfma_f32_16x16x4bf16_1k a[{{[0-9]+:[0-9]+}}], v[[[ONE]]:{{[0-9]+}}], v[[[TWO]]:{{[0-9]+}}], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
|
| 74 | +; GFX940: v_mfma_f32_16x16x4_4b_bf16 a[{{[0-9]+:[0-9]+}}], v{{\[}}[[ONE]]:{{[0-9]+}}], v{{\[}}[[TWO]]:{{[0-9]+}}], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3 |
68 | 75 | ; GCN-NOT: v_accvgpr_read_b32
|
69 | 76 | ; GCN-COUNT-4: global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}]
|
70 | 77 | define amdgpu_kernel void @test_mfma_f32_16x16x4bf16_1k(<16 x float> addrspace(1)* %arg) #0 {
|
|
82 | 89 | ; GCN-DAG: v_mov_b32_e32 v[[TWO:[0-9]+]], 2
|
83 | 90 | ; GCN-DAG: v_mov_b32_e32 v[[ONE:[0-9]+]], 1
|
84 | 91 | ; GFX90A-COUNT-4: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
|
| 92 | +; GFX940-COUNT-4: v_accvgpr_write_b32 a{{[0-9]+}}, s{{[0-9]+}} |
85 | 93 | ; GFX90A: v_mfma_f32_4x4x4bf16_1k [[RES:a\[[0-9]+:[0-9]+\]]], v[[[ONE]]:{{[0-9]+}}], v[[[TWO]]:{{[0-9]+}}], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
|
| 94 | +; GFX940: v_mfma_f32_4x4x4_16b_bf16 [[RES:a\[[0-9]+:[0-9]+\]]], v{{\[}}[[ONE]]:{{[0-9]+}}], v{{\[}}[[TWO]]:{{[0-9]+}}], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3 |
86 | 95 | ; GCN-NOT: v_accvgpr_read_b32
|
87 | 96 | ; GCN: global_store_dwordx4 v{{[0-9]+}}, [[RES]],
|
88 | 97 | define amdgpu_kernel void @test_mfma_f32_4x4x4bf16_1k(<4 x float> addrspace(1)* %arg) #0 {
|
|
100 | 109 | ; GCN-DAG: v_mov_b32_e32 v[[TWO:[0-9]+]], 2
|
101 | 110 | ; GCN-DAG: v_mov_b32_e32 v[[ONE:[0-9]+]], 1
|
102 | 111 | ; GFX90A-COUNT-16: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
|
| 112 | +; GFX940-COUNT-16: v_accvgpr_write_b32 a{{[0-9]+}}, s{{[0-9]+}} |
103 | 113 | ; GFX90A: v_mfma_f32_32x32x8bf16_1k a[{{[0-9]+:[0-9]+}}], v[[[ONE]]:{{[0-9]+}}], v[[[TWO]]:{{[0-9]+}}], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
|
| 114 | +; GFX940: v_mfma_f32_32x32x8_bf16 a[{{[0-9]+:[0-9]+}}], v{{\[}}[[ONE]]:{{[0-9]+}}], v{{\[}}[[TWO]]:{{[0-9]+}}], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3 |
104 | 115 | ; GCN-NOT: v_accvgpr_read_b32
|
105 | 116 | ; GCN-COUNT-4: global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}]
|
106 | 117 | define amdgpu_kernel void @test_mfma_f32_32x32x8bf16_1k(<16 x float> addrspace(1)* %arg) #0 {
|
|
118 | 129 | ; GCN-DAG: v_mov_b32_e32 v[[TWO:[0-9]+]], 2
|
119 | 130 | ; GCN-DAG: v_mov_b32_e32 v[[ONE:[0-9]+]], 1
|
120 | 131 | ; GFX90A-COUNT-4: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
|
| 132 | +; GFX940-COUNT-4: v_accvgpr_write_b32 a{{[0-9]+}}, s{{[0-9]+}} |
121 | 133 | ; GFX90A: v_mfma_f32_16x16x16bf16_1k [[RES:a\[[0-9]+:[0-9]+\]]], v[[[ONE]]:{{[0-9]+}}], v[[[TWO]]:{{[0-9]+}}], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
|
| 134 | +; GFX940: v_mfma_f32_16x16x16_bf16 [[RES:a\[[0-9]+:[0-9]+\]]], v{{\[}}[[ONE]]:{{[0-9]+}}], v{{\[}}[[TWO]]:{{[0-9]+}}], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3 |
122 | 135 | ; GCN-NOT: v_accvgpr_read_b32
|
123 | 136 | ; GCN: global_store_dwordx4 v{{[0-9]+}}, [[RES]],
|
124 | 137 | define amdgpu_kernel void @test_mfma_f32_16x16x16bf16_1k(<4 x float> addrspace(1)* %arg) #0 {
|
|
134 | 147 | ; GCN-LABEL: {{^}}test_mfma_f64_4x4x4f64:
|
135 | 148 | ; GFX90A: v_mfma_f64_4x4x4f64 [[M1:a\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], 0{{$}}
|
136 | 149 | ; GFX90A: v_mfma_f64_4x4x4f64 a[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], [[M1]] cbsz:1 abid:2 blgp:3
|
| 150 | +; GFX940: v_mfma_f64_4x4x4_4b_f64 [[M1:a\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], 0{{$}} |
| 151 | +; GFX940: v_mfma_f64_4x4x4_4b_f64 a[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], [[M1]] cbsz:1 abid:2 neg:[1,1,0] |
137 | 152 | ; GCN: global_store_dwordx2
|
138 | 153 | define amdgpu_kernel void @test_mfma_f64_4x4x4f64(double addrspace(1)* %arg, double %a, double %b) #0 {
|
139 | 154 | bb:
|
|
146 | 161 | ; GCN-LABEL: {{^}}test_mfma_f64_16x16x4f64:
|
147 | 162 | ; GCN: s_load_dwordx8
|
148 | 163 | ; GFX90A: v_mfma_f64_16x16x4f64 a[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
|
| 164 | +; GFX940: v_mfma_f64_16x16x4_f64 a[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 neg:[1,1,0] |
149 | 165 | ; GCN: global_store_dwordx4
|
150 | 166 | ; GCN: global_store_dwordx4
|
151 | 167 | define amdgpu_kernel void @test_mfma_f64_16x16x4f64(<4 x double> addrspace(1)* %arg, double %a, double %b) #0 {
|
|
159 | 175 | ; GCN-LABEL: {{^}}test_mfma_f64_16x16x4f64_splat_imm:
|
160 | 176 | ; GFX90A: v_mfma_f64_16x16x4f64 [[M1:a\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], 0{{$}}
|
161 | 177 | ; GFX90A: v_mfma_f64_16x16x4f64 a[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], [[M1]] cbsz:1 abid:2 blgp:3
|
| 178 | +; GFX940: v_mfma_f64_16x16x4_f64 [[M1:a\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], 0{{$}} |
| 179 | +; GFX940: v_mfma_f64_16x16x4_f64 a[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], [[M1]] cbsz:1 abid:2 neg:[1,1,0] |
162 | 180 | ; GCN: global_store_dwordx4
|
163 | 181 | ; GCN: global_store_dwordx4
|
164 | 182 | define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm(<4 x double> addrspace(1)* %arg, double %a, double %b) #0 {
|
|
171 | 189 |
|
172 | 190 | ; GCN-LABEL: {{^}}test_mfma_f64_16x16x4f64_imm:
|
173 | 191 | ; GFX90A: v_mfma_f64_16x16x4f64 a[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], a[{{[0-9]+:[0-9]+}}]{{$}}
|
| 192 | +; GFX940: v_mfma_f64_16x16x4_f64 a[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], a[{{[0-9]+:[0-9]+}}]{{$}} |
174 | 193 | ; GCN: global_store_dwordx4
|
175 | 194 | ; GCN: global_store_dwordx4
|
176 | 195 | define amdgpu_kernel void @test_mfma_f64_16x16x4f64_imm(<4 x double> addrspace(1)* %arg, double %a, double %b) #0 {
|
|
183 | 202 | ; GCN-LABEL: {{^}}test_mfma_f64_16x16x4f64_splat_lit:
|
184 | 203 | ; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
|
185 | 204 | ; GFX90A-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x405ec000
|
| 205 | +; GFX940-DAG: s_mov_b32 s{{[0-9]+}}, 0x405ec000 |
186 | 206 | ; GFX90A: v_mfma_f64_16x16x4f64 a[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], a[{{[0-9]+:[0-9]+}}]{{$}}
|
| 207 | +; GFX940: v_mfma_f64_16x16x4_f64 a[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], a[{{[0-9]+:[0-9]+}}]{{$}} |
187 | 208 | ; GCN: global_store_dwordx4
|
188 | 209 | ; GCN: global_store_dwordx4
|
189 | 210 | define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_lit(<4 x double> addrspace(1)* %arg, double %a, double %b) #0 {
|
|
0 commit comments