 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -verify-machineinstrs < %s | FileCheck %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1030 -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1030 < %s | FileCheck -check-prefixes=CHECK,SDAG %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefixes=CHECK,SDAG %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1030 < %s | FileCheck -check-prefixes=CHECK,GISEL %s
 
 define amdgpu_gs half @v_fptrunc_round_f32_to_f16_upward(float %a) {
 ; CHECK-LABEL: v_fptrunc_round_f32_to_f16_upward:
@@ -98,3 +98,321 @@ define amdgpu_gs void @s_fptrunc_round_f32_to_f16_upward_multiple_calls(float in
 store half %res5, ptr addrspace(1) %out, align 4
 ret void
 }
+
+define amdgpu_gs <2 x half> @v_fptrunc_round_v2f32_to_v2f16_upward(<2 x float> %a) {
+; SDAG-LABEL: v_fptrunc_round_v2f32_to_v2f16_upward:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 1), 1
+; SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
+; SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1
+; SDAG-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
+; SDAG-NEXT: ; return to shader part epilog
+;
+; GISEL-LABEL: v_fptrunc_round_v2f32_to_v2f16_upward:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 1), 1
+; GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GISEL-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; GISEL-NEXT: ; return to shader part epilog
+ %res = call <2 x half> @llvm.fptrunc.round.v2f16.v2f32(<2 x float> %a, metadata !"round.upward")
+ ret <2 x half> %res
+}
+
+define amdgpu_gs <2 x half> @v_fptrunc_round_v2f32_to_v2f16_downward(<2 x float> %a) {
+; SDAG-LABEL: v_fptrunc_round_v2f32_to_v2f16_downward:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 3, 1), 1
+; SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
+; SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1
+; SDAG-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
+; SDAG-NEXT: ; return to shader part epilog
+;
+; GISEL-LABEL: v_fptrunc_round_v2f32_to_v2f16_downward:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 3, 1), 1
+; GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GISEL-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; GISEL-NEXT: ; return to shader part epilog
+ %res = call <2 x half> @llvm.fptrunc.round.v2f16.v2f32(<2 x float> %a, metadata !"round.downward")
+ ret <2 x half> %res
+}
+
+define amdgpu_gs void @v_fptrunc_round_v2f32_to_v2f16_upward_multiple_calls(<2 x float> %a, <2 x float> %b, ptr addrspace(1) %out) {
+; SDAG-LABEL: v_fptrunc_round_v2f32_to_v2f16_upward_multiple_calls:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 1), 1
+; SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
+; SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1
+; SDAG-NEXT: v_cvt_f16_f32_e32 v6, v2
+; SDAG-NEXT: v_cvt_f16_f32_e32 v7, v3
+; SDAG-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
+; SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 2
+; SDAG-NEXT: v_cvt_f16_f32_e32 v1, v2
+; SDAG-NEXT: v_cvt_f16_f32_e32 v2, v3
+; SDAG-NEXT: v_perm_b32 v3, v7, v6, 0x5040100
+; SDAG-NEXT: v_perm_b32 v1, v2, v1, 0x5040100
+; SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 3, 1), 0
+; SDAG-NEXT: v_pk_add_f16 v0, v0, v3
+; SDAG-NEXT: v_pk_add_f16 v0, v1, v0
+; SDAG-NEXT: global_store_dword v[4:5], v0, off
+; SDAG-NEXT: s_endpgm
+;
+; GISEL-LABEL: v_fptrunc_round_v2f32_to_v2f16_upward_multiple_calls:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 1), 1
+; GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GISEL-NEXT: v_cvt_f16_f32_e32 v6, v2
+; GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 2
+; GISEL-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 1
+; GISEL-NEXT: v_cvt_f16_f32_e32 v7, v3
+; GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GISEL-NEXT: v_and_b32_e32 v6, 0xffff, v6
+; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 2
+; GISEL-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GISEL-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; GISEL-NEXT: v_lshl_or_b32 v1, v7, 16, v6
+; GISEL-NEXT: v_lshl_or_b32 v2, v3, 16, v2
+; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 3, 1), 0
+; GISEL-NEXT: v_pk_add_f16 v0, v0, v1
+; GISEL-NEXT: v_pk_add_f16 v0, v2, v0
+; GISEL-NEXT: global_store_dword v[4:5], v0, off
+; GISEL-NEXT: s_endpgm
+ %res1 = call <2 x half> @llvm.fptrunc.round.v2f16.v2f32(<2 x float> %a, metadata !"round.upward")
+ %res2 = call <2 x half> @llvm.fptrunc.round.v2f16.v2f32(<2 x float> %b, metadata !"round.upward")
+ %res3 = call <2 x half> @llvm.fptrunc.round.v2f16.v2f32(<2 x float> %b, metadata !"round.downward")
+ %res4 = fadd <2 x half> %res1, %res2
+ %res5 = fadd <2 x half> %res3, %res4
+ store <2 x half> %res5, ptr addrspace(1) %out, align 4
+ ret void
+}
+
+define amdgpu_gs <2 x i32> @s_fptrunc_round_v2f32_to_v2f16_upward(<2 x float> inreg %a, ptr addrspace(1) %out) {
+; CHECK-LABEL: s_fptrunc_round_v2f32_to_v2f16_upward:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: v_mov_b32_e32 v0, s0
+; CHECK-NEXT: v_mov_b32_e32 v1, s1
+; CHECK-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 1), 1
+; CHECK-NEXT: v_cvt_f16_f32_e32 v0, v0
+; CHECK-NEXT: v_cvt_f16_f32_e32 v1, v1
+; CHECK-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; CHECK-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; CHECK-NEXT: v_readfirstlane_b32 s0, v0
+; CHECK-NEXT: v_readfirstlane_b32 s1, v1
+; CHECK-NEXT: ; return to shader part epilog
+ %res = call <2 x half> @llvm.fptrunc.round.v2f16.v2f32(<2 x float> %a, metadata !"round.upward")
+ %bitcast = bitcast <2 x half> %res to <2 x i16>
+ %ret = zext <2 x i16> %bitcast to <2 x i32>
+ ret <2 x i32> %ret
+}
+
+define amdgpu_gs <2 x i32> @s_fptrunc_round_v2f32_to_v2f16_downward(<2 x float> inreg %a, ptr addrspace(1) %out) {
+; CHECK-LABEL: s_fptrunc_round_v2f32_to_v2f16_downward:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: v_mov_b32_e32 v0, s0
+; CHECK-NEXT: v_mov_b32_e32 v1, s1
+; CHECK-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 3, 1), 1
+; CHECK-NEXT: v_cvt_f16_f32_e32 v0, v0
+; CHECK-NEXT: v_cvt_f16_f32_e32 v1, v1
+; CHECK-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; CHECK-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; CHECK-NEXT: v_readfirstlane_b32 s0, v0
+; CHECK-NEXT: v_readfirstlane_b32 s1, v1
+; CHECK-NEXT: ; return to shader part epilog
+ %res = call <2 x half> @llvm.fptrunc.round.v2f16.v2f32(<2 x float> %a, metadata !"round.downward")
+ %bitcast = bitcast <2 x half> %res to <2 x i16>
+ %ret = zext <2 x i16> %bitcast to <2 x i32>
+ ret <2 x i32> %ret
+}
+
+define amdgpu_gs void @s_fptrunc_round_v2f32_to_v2f16_upward_multiple_calls(<2 x float> inreg %a, <2 x float> inreg %b, ptr addrspace(1) %out) {
+; CHECK-LABEL: s_fptrunc_round_v2f32_to_v2f16_upward_multiple_calls:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: v_mov_b32_e32 v2, s0
+; CHECK-NEXT: v_mov_b32_e32 v3, s2
+; CHECK-NEXT: v_mov_b32_e32 v4, s1
+; CHECK-NEXT: v_mov_b32_e32 v5, s3
+; CHECK-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 1), 1
+; CHECK-NEXT: v_cvt_f16_f32_e32 v2, v2
+; CHECK-NEXT: v_cvt_f16_f32_e32 v6, v3
+; CHECK-NEXT: v_cvt_f16_f32_e32 v4, v4
+; CHECK-NEXT: v_cvt_f16_f32_e32 v7, v5
+; CHECK-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 2
+; CHECK-NEXT: v_cvt_f16_f32_e32 v3, v3
+; CHECK-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; CHECK-NEXT: v_and_b32_e32 v6, 0xffff, v6
+; CHECK-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; CHECK-NEXT: v_lshl_or_b32 v2, v4, 16, v2
+; CHECK-NEXT: v_cvt_f16_f32_e32 v4, v5
+; CHECK-NEXT: v_lshl_or_b32 v5, v7, 16, v6
+; CHECK-NEXT: v_lshl_or_b32 v3, v4, 16, v3
+; CHECK-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 3, 1), 0
+; CHECK-NEXT: v_pk_add_f16 v2, v2, v5
+; CHECK-NEXT: v_pk_add_f16 v2, v3, v2
+; CHECK-NEXT: global_store_dword v[0:1], v2, off
+; CHECK-NEXT: s_endpgm
+ %res1 = call <2 x half> @llvm.fptrunc.round.v2f16.v2f32(<2 x float> %a, metadata !"round.upward")
+ %res2 = call <2 x half> @llvm.fptrunc.round.v2f16.v2f32(<2 x float> %b, metadata !"round.upward")
+ %res3 = call <2 x half> @llvm.fptrunc.round.v2f16.v2f32(<2 x float> %b, metadata !"round.downward")
+ %res4 = fadd <2 x half> %res1, %res2
+ %res5 = fadd <2 x half> %res3, %res4
+ store <2 x half> %res5, ptr addrspace(1) %out, align 4
+ ret void
+}
+
+; FIXME
+; define amdgpu_gs <3 x half> @v_fptrunc_round_v3f32_to_v3f16_upward(<3 x float> %a) {
+; %res = call <3 x half> @llvm.fptrunc.round.v3f16.v3f32(<3 x float> %a, metadata !"round.upward")
+; ret <3 x half> %res
+; }
+
+; define amdgpu_gs <3 x half> @v_fptrunc_round_v3f32_to_v3f16_downward(<3 x float> %a) {
+; %res = call <3 x half> @llvm.fptrunc.round.v3f16.v3f32(<3 x float> %a, metadata !"round.downward")
+; ret <3 x half> %res
+; }
+
+define amdgpu_gs <4 x half> @v_fptrunc_round_v4f32_to_v4f16_upward(<4 x float> %a) {
+; SDAG-LABEL: v_fptrunc_round_v4f32_to_v4f16_upward:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 1), 1
+; SDAG-NEXT: v_cvt_f16_f32_e32 v2, v2
+; SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
+; SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1
+; SDAG-NEXT: v_cvt_f16_f32_e32 v3, v3
+; SDAG-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
+; SDAG-NEXT: v_perm_b32 v1, v3, v2, 0x5040100
+; SDAG-NEXT: ; return to shader part epilog
+;
+; GISEL-LABEL: v_fptrunc_round_v4f32_to_v4f16_upward:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 1), 1
+; GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GISEL-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GISEL-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GISEL-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; GISEL-NEXT: v_lshl_or_b32 v1, v3, 16, v2
+; GISEL-NEXT: ; return to shader part epilog
+ %res = call <4 x half> @llvm.fptrunc.round.v4f16.v4f32(<4 x float> %a, metadata !"round.upward")
+ ret <4 x half> %res
+}
+
+define amdgpu_gs <4 x half> @v_fptrunc_round_v4f32_to_v4f16_downward(<4 x float> %a) {
+; SDAG-LABEL: v_fptrunc_round_v4f32_to_v4f16_downward:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 3, 1), 1
+; SDAG-NEXT: v_cvt_f16_f32_e32 v2, v2
+; SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
+; SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1
+; SDAG-NEXT: v_cvt_f16_f32_e32 v3, v3
+; SDAG-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
+; SDAG-NEXT: v_perm_b32 v1, v3, v2, 0x5040100
+; SDAG-NEXT: ; return to shader part epilog
+;
+; GISEL-LABEL: v_fptrunc_round_v4f32_to_v4f16_downward:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 3, 1), 1
+; GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GISEL-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GISEL-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GISEL-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; GISEL-NEXT: v_lshl_or_b32 v1, v3, 16, v2
+; GISEL-NEXT: ; return to shader part epilog
+ %res = call <4 x half> @llvm.fptrunc.round.v4f16.v4f32(<4 x float> %a, metadata !"round.downward")
+ ret <4 x half> %res
+}
+
+define amdgpu_gs <8 x half> @v_fptrunc_round_v8f32_to_v8f16_upward(<8 x float> %a) {
+; SDAG-LABEL: v_fptrunc_round_v8f32_to_v8f16_upward:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 1), 1
+; SDAG-NEXT: v_cvt_f16_f32_e32 v6, v6
+; SDAG-NEXT: v_cvt_f16_f32_e32 v4, v4
+; SDAG-NEXT: v_cvt_f16_f32_e32 v2, v2
+; SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
+; SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1
+; SDAG-NEXT: v_cvt_f16_f32_e32 v3, v3
+; SDAG-NEXT: v_cvt_f16_f32_e32 v5, v5
+; SDAG-NEXT: v_cvt_f16_f32_e32 v7, v7
+; SDAG-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
+; SDAG-NEXT: v_perm_b32 v1, v3, v2, 0x5040100
+; SDAG-NEXT: v_perm_b32 v2, v5, v4, 0x5040100
+; SDAG-NEXT: v_perm_b32 v3, v7, v6, 0x5040100
+; SDAG-NEXT: ; return to shader part epilog
+;
+; GISEL-LABEL: v_fptrunc_round_v8f32_to_v8f16_upward:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 1), 1
+; GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GISEL-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GISEL-NEXT: v_cvt_f16_f32_e32 v4, v4
+; GISEL-NEXT: v_cvt_f16_f32_e32 v6, v6
+; GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GISEL-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GISEL-NEXT: v_cvt_f16_f32_e32 v5, v5
+; GISEL-NEXT: v_cvt_f16_f32_e32 v7, v7
+; GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GISEL-NEXT: v_and_b32_e32 v4, 0xffff, v4
+; GISEL-NEXT: v_and_b32_e32 v6, 0xffff, v6
+; GISEL-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; GISEL-NEXT: v_lshl_or_b32 v1, v3, 16, v2
+; GISEL-NEXT: v_lshl_or_b32 v2, v5, 16, v4
+; GISEL-NEXT: v_lshl_or_b32 v3, v7, 16, v6
+; GISEL-NEXT: ; return to shader part epilog
+ %res = call <8 x half> @llvm.fptrunc.round.v8f16.v8f32(<8 x float> %a, metadata !"round.upward")
+ ret <8 x half> %res
+}
+
+define amdgpu_gs <8 x half> @v_fptrunc_round_v8f32_to_v8f16_downward(<8 x float> %a) {
+; SDAG-LABEL: v_fptrunc_round_v8f32_to_v8f16_downward:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 3, 1), 1
+; SDAG-NEXT: v_cvt_f16_f32_e32 v6, v6
+; SDAG-NEXT: v_cvt_f16_f32_e32 v4, v4
+; SDAG-NEXT: v_cvt_f16_f32_e32 v2, v2
+; SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
+; SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1
+; SDAG-NEXT: v_cvt_f16_f32_e32 v3, v3
+; SDAG-NEXT: v_cvt_f16_f32_e32 v5, v5
+; SDAG-NEXT: v_cvt_f16_f32_e32 v7, v7
+; SDAG-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
+; SDAG-NEXT: v_perm_b32 v1, v3, v2, 0x5040100
+; SDAG-NEXT: v_perm_b32 v2, v5, v4, 0x5040100
+; SDAG-NEXT: v_perm_b32 v3, v7, v6, 0x5040100
+; SDAG-NEXT: ; return to shader part epilog
+;
+; GISEL-LABEL: v_fptrunc_round_v8f32_to_v8f16_downward:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 3, 1), 1
+; GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GISEL-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GISEL-NEXT: v_cvt_f16_f32_e32 v4, v4
+; GISEL-NEXT: v_cvt_f16_f32_e32 v6, v6
+; GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GISEL-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GISEL-NEXT: v_cvt_f16_f32_e32 v5, v5
+; GISEL-NEXT: v_cvt_f16_f32_e32 v7, v7
+; GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GISEL-NEXT: v_and_b32_e32 v4, 0xffff, v4
+; GISEL-NEXT: v_and_b32_e32 v6, 0xffff, v6
+; GISEL-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; GISEL-NEXT: v_lshl_or_b32 v1, v3, 16, v2
+; GISEL-NEXT: v_lshl_or_b32 v2, v5, 16, v4
+; GISEL-NEXT: v_lshl_or_b32 v3, v7, 16, v6
+; GISEL-NEXT: ; return to shader part epilog
+ %res = call <8 x half> @llvm.fptrunc.round.v8f16.v8f32(<8 x float> %a, metadata !"round.downward")
+ ret <8 x half> %res
+}