|
4 | 4 | declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16(<16 x half>, <16 x half> , <8 x float>)
|
5 | 5 | declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16(<16 x i16>, <16 x i16> , <8 x float>)
|
6 | 6 | declare <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16(<16 x half>, <16 x half> , <16 x half>, i1 immarg)
|
| 7 | +declare <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.tied(<16 x half>, <16 x half> , <16 x half>, i1 immarg) |
7 | 8 | declare <16 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<16 x i16>, <16 x i16> , <16 x i16>, i1 immarg)
|
| 9 | +declare <16 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.tied(<16 x i16>, <16 x i16> , <16 x i16>, i1 immarg) |
8 | 10 | declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 immarg, <4 x i32>, i1 immarg, <4 x i32> , <8 x i32>, i1 immarg)
|
9 | 11 | declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 immarg, <2 x i32>, i1 immarg, <2 x i32> , <8 x i32>, i1 immarg)
|
10 | 12 |
|
|
78 | 80 | ret void
|
79 | 81 | }
|
80 | 82 |
|
| | +; Two calls to the plain f16 WMMA intrinsic share the same %C accumulator and |
| | +; store their results to separate outputs. In the expected code the first |
| | +; WMMA writes a fresh destination (v[44:51]) while reading C from v[32:39], |
| | +; and the second overwrites v[32:39] in place, so no copy of C is emitted. |
| 83 | +define amdgpu_ps void @test_wmma_f16_16x16x16_f16_untied(<16 x half> %A.0, <16 x half> %B.0, <16 x half> %A.1, <16 x half> %B.1, <16 x half> %C, ptr addrspace(1) %out.0, ptr addrspace(1) %out.1) { |
| 84 | +; W32-LABEL: test_wmma_f16_16x16x16_f16_untied: |
| 85 | +; W32: ; %bb.0: ; %bb |
| 86 | +; W32-NEXT: v_wmma_f16_16x16x16_f16 v[44:51], v[0:7], v[8:15], v[32:39] |
| 87 | +; W32-NEXT: v_wmma_f16_16x16x16_f16 v[32:39], v[16:23], v[24:31], v[32:39] |
| 88 | +; W32-NEXT: s_clause 0x1 |
| 89 | +; W32-NEXT: global_store_b128 v[40:41], v[44:47], off |
| 90 | +; W32-NEXT: global_store_b128 v[40:41], v[48:51], off offset:16 |
| 91 | +; W32-NEXT: s_clause 0x1 |
| 92 | +; W32-NEXT: global_store_b128 v[42:43], v[32:35], off |
| 93 | +; W32-NEXT: global_store_b128 v[42:43], v[36:39], off offset:16 |
| 94 | +; W32-NEXT: s_nop 0 |
| 95 | +; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) |
| 96 | +; W32-NEXT: s_endpgm |
| 97 | +bb: |
| 98 | + %res.0 = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16(<16 x half> %A.0, <16 x half> %B.0, <16 x half> %C, i1 0) |
| 99 | + %res.1 = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16(<16 x half> %A.1, <16 x half> %B.1, <16 x half> %C, i1 0) |
| 100 | + store <16 x half> %res.0, ptr addrspace(1) %out.0, align 32 |
| 101 | + store <16 x half> %res.1, ptr addrspace(1) %out.1, align 32 |
| 102 | + ret void |
| 103 | +} |
| 104 | + |
| | +; Two calls to the .tied f16 WMMA intrinsic share the same %C accumulator and |
| | +; store their results to separate outputs. In the expected code every WMMA |
| | +; uses one register range for both its destination and its C operand, so C |
| | +; (v[32:39]) is first copied into v[44:51] to give each call its own |
| | +; accumulator registers. |
| 105 | +define amdgpu_ps void @test_wmma_f16_16x16x16_f16_tied(<16 x half> %A.0, <16 x half> %B.0, <16 x half> %A.1, <16 x half> %B.1, <16 x half> %C, ptr addrspace(1) %out.0, ptr addrspace(1) %out.1) { |
| 106 | +; W32-LABEL: test_wmma_f16_16x16x16_f16_tied: |
| 107 | +; W32: ; %bb.0: ; %bb |
| 108 | +; W32-NEXT: v_dual_mov_b32 v51, v39 :: v_dual_mov_b32 v50, v38 |
| 109 | +; W32-NEXT: v_dual_mov_b32 v49, v37 :: v_dual_mov_b32 v48, v36 |
| 110 | +; W32-NEXT: v_dual_mov_b32 v47, v35 :: v_dual_mov_b32 v46, v34 |
| 111 | +; W32-NEXT: v_dual_mov_b32 v45, v33 :: v_dual_mov_b32 v44, v32 |
| 112 | +; W32-NEXT: v_wmma_f16_16x16x16_f16 v[32:39], v[16:23], v[24:31], v[32:39] |
| 113 | +; W32-NEXT: s_delay_alu instid0(VALU_DEP_2) |
| 114 | +; W32-NEXT: v_wmma_f16_16x16x16_f16 v[44:51], v[0:7], v[8:15], v[44:51] |
| 115 | +; W32-NEXT: s_clause 0x1 |
| 116 | +; W32-NEXT: global_store_b128 v[40:41], v[44:47], off |
| 117 | +; W32-NEXT: global_store_b128 v[40:41], v[48:51], off offset:16 |
| 118 | +; W32-NEXT: s_clause 0x1 |
| 119 | +; W32-NEXT: global_store_b128 v[42:43], v[32:35], off |
| 120 | +; W32-NEXT: global_store_b128 v[42:43], v[36:39], off offset:16 |
| 121 | +; W32-NEXT: s_nop 0 |
| 122 | +; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) |
| 123 | +; W32-NEXT: s_endpgm |
| 124 | +bb: |
| 125 | + %res.0 = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.tied(<16 x half> %A.0, <16 x half> %B.0, <16 x half> %C, i1 0) |
| 126 | + %res.1 = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.tied(<16 x half> %A.1, <16 x half> %B.1, <16 x half> %C, i1 0) |
| 127 | + store <16 x half> %res.0, ptr addrspace(1) %out.0, align 32 |
| 128 | + store <16 x half> %res.1, ptr addrspace(1) %out.1, align 32 |
| 129 | + ret void |
| 130 | +} |
| 131 | + |
81 | 132 | ; @llvm.amdgcn.wmma.bf16.16x16x16.bf16
|
82 | 133 |
|
83 | 134 | define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_lo(<16 x i16> %A, <16 x i16> %B, <16 x i16> %C, ptr addrspace(1) %out) {
|
|
112 | 163 | ret void
|
113 | 164 | }
|
114 | 165 |
|
| | +; Two calls to the plain bf16 WMMA intrinsic share the same %C accumulator |
| | +; and store their results to separate outputs. In the expected code the |
| | +; first WMMA writes a fresh destination (v[44:51]) while reading C from |
| | +; v[32:39], and the second overwrites v[32:39] in place, so no copy of C is |
| | +; emitted. |
| 166 | +define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_untied(<16 x i16> %A.0, <16 x i16> %B.0, <16 x i16> %A.1, <16 x i16> %B.1, <16 x i16> %C, ptr addrspace(1) %out.0, ptr addrspace(1) %out.1) { |
| 167 | +; W32-LABEL: test_wmma_bf16_16x16x16_bf16_untied: |
| 168 | +; W32: ; %bb.0: ; %bb |
| 169 | +; W32-NEXT: v_wmma_bf16_16x16x16_bf16 v[44:51], v[0:7], v[8:15], v[32:39] |
| 170 | +; W32-NEXT: v_wmma_bf16_16x16x16_bf16 v[32:39], v[16:23], v[24:31], v[32:39] |
| 171 | +; W32-NEXT: s_clause 0x1 |
| 172 | +; W32-NEXT: global_store_b128 v[40:41], v[44:47], off |
| 173 | +; W32-NEXT: global_store_b128 v[40:41], v[48:51], off offset:16 |
| 174 | +; W32-NEXT: s_clause 0x1 |
| 175 | +; W32-NEXT: global_store_b128 v[42:43], v[32:35], off |
| 176 | +; W32-NEXT: global_store_b128 v[42:43], v[36:39], off offset:16 |
| 177 | +; W32-NEXT: s_nop 0 |
| 178 | +; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) |
| 179 | +; W32-NEXT: s_endpgm |
| 180 | +bb: |
| 181 | + %res.0 = call <16 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<16 x i16> %A.0, <16 x i16> %B.0, <16 x i16> %C, i1 0) |
| 182 | + %res.1 = call <16 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<16 x i16> %A.1, <16 x i16> %B.1, <16 x i16> %C, i1 0) |
| 183 | + store <16 x i16> %res.0, ptr addrspace(1) %out.0, align 32 |
| 184 | + store <16 x i16> %res.1, ptr addrspace(1) %out.1, align 32 |
| 185 | + ret void |
| 186 | +} |
| 187 | + |
| | +; Two calls to the .tied bf16 WMMA intrinsic share the same %C accumulator |
| | +; and store their results to separate outputs. In the expected code every |
| | +; WMMA uses one register range for both its destination and its C operand, |
| | +; so C (v[32:39]) is first copied into v[44:51] to give each call its own |
| | +; accumulator registers. |
| 188 | +define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_tied(<16 x i16> %A.0, <16 x i16> %B.0, <16 x i16> %A.1, <16 x i16> %B.1, <16 x i16> %C, ptr addrspace(1) %out.0, ptr addrspace(1) %out.1) { |
| 189 | +; W32-LABEL: test_wmma_bf16_16x16x16_bf16_tied: |
| 190 | +; W32: ; %bb.0: ; %bb |
| 191 | +; W32-NEXT: v_dual_mov_b32 v51, v39 :: v_dual_mov_b32 v50, v38 |
| 192 | +; W32-NEXT: v_dual_mov_b32 v49, v37 :: v_dual_mov_b32 v48, v36 |
| 193 | +; W32-NEXT: v_dual_mov_b32 v47, v35 :: v_dual_mov_b32 v46, v34 |
| 194 | +; W32-NEXT: v_dual_mov_b32 v45, v33 :: v_dual_mov_b32 v44, v32 |
| 195 | +; W32-NEXT: v_wmma_bf16_16x16x16_bf16 v[32:39], v[16:23], v[24:31], v[32:39] |
| 196 | +; W32-NEXT: s_delay_alu instid0(VALU_DEP_2) |
| 197 | +; W32-NEXT: v_wmma_bf16_16x16x16_bf16 v[44:51], v[0:7], v[8:15], v[44:51] |
| 198 | +; W32-NEXT: s_clause 0x1 |
| 199 | +; W32-NEXT: global_store_b128 v[40:41], v[44:47], off |
| 200 | +; W32-NEXT: global_store_b128 v[40:41], v[48:51], off offset:16 |
| 201 | +; W32-NEXT: s_clause 0x1 |
| 202 | +; W32-NEXT: global_store_b128 v[42:43], v[32:35], off |
| 203 | +; W32-NEXT: global_store_b128 v[42:43], v[36:39], off offset:16 |
| 204 | +; W32-NEXT: s_nop 0 |
| 205 | +; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) |
| 206 | +; W32-NEXT: s_endpgm |
| 207 | +bb: |
| 208 | + %res.0 = call <16 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.tied(<16 x i16> %A.0, <16 x i16> %B.0, <16 x i16> %C, i1 0) |
| 209 | + %res.1 = call <16 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.tied(<16 x i16> %A.1, <16 x i16> %B.1, <16 x i16> %C, i1 0) |
| 210 | + store <16 x i16> %res.0, ptr addrspace(1) %out.0, align 32 |
| 211 | + store <16 x i16> %res.1, ptr addrspace(1) %out.1, align 32 |
| 212 | + ret void |
| 213 | +} |
| 214 | + |
115 | 215 | ; @llvm.amdgcn.wmma.i32.16x16x16.iu8
|
116 | 216 |
|
117 | 217 | define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_unsigned_unsigned(<4 x i32> %A, <4 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) {
|
|
0 commit comments