-
Notifications
You must be signed in to change notification settings - Fork 14.3k
AMDGPU: Clean up maximum3/minimum3 tests #93025
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
@llvm/pr-subscribers-backend-amdgpu Author: Matt Arsenault (arsenm) ChangesThese were using patterns copied from older tests, before non-kernel functions were supported and manually written checks. Also stop using -flat-for-global, which only exists to try to share tests between SI/CI and VI+. This was also missing test coverage, we're incorrectly forming maximum3/minimum3 pre-gfx12. This is a pre-commit before fixing that. Patch is 115.76 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/93025.diff 2 Files Affected:
diff --git a/llvm/test/CodeGen/AMDGPU/fmaximum3.ll b/llvm/test/CodeGen/AMDGPU/fmaximum3.ll
index 6e45084dc4b80..fe6de63742b76 100644
--- a/llvm/test/CodeGen/AMDGPU/fmaximum3.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmaximum3.ll
@@ -1,98 +1,1323 @@
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN %s
-
-; GCN-LABEL: {{^}}test_fmaximum3_olt_0_f32:
-; GCN: buffer_load_b32 [[REGC:v[0-9]+]]
-; GCN: buffer_load_b32 [[REGB:v[0-9]+]]
-; GCN: buffer_load_b32 [[REGA:v[0-9]+]]
-; GCN: v_maximum3_f32 [[RESULT:v[0-9]+]], [[REGC]], [[REGB]], [[REGA]]
-; GCN: buffer_store_b32 [[RESULT]],
-define amdgpu_kernel void @test_fmaximum3_olt_0_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) {
- %a = load volatile float, ptr addrspace(1) %aptr, align 4
- %b = load volatile float, ptr addrspace(1) %bptr, align 4
- %c = load volatile float, ptr addrspace(1) %cptr, align 4
- %f0 = call float @llvm.maximum.f32(float %a, float %b)
- %f1 = call float @llvm.maximum.f32(float %f0, float %c)
- store float %f1, ptr addrspace(1) %out, align 4
- ret void
-}
-
-; Commute operand of second fmaximum
-; GCN-LABEL: {{^}}test_fmaximum3_olt_1_f32:
-; GCN: buffer_load_b32 [[REGB:v[0-9]+]]
-; GCN: buffer_load_b32 [[REGA:v[0-9]+]]
-; GCN: buffer_load_b32 [[REGC:v[0-9]+]]
-; GCN: v_maximum3_f32 [[RESULT:v[0-9]+]], [[REGC]], [[REGB]], [[REGA]]
-; GCN: buffer_store_b32 [[RESULT]],
-define amdgpu_kernel void @test_fmaximum3_olt_1_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) {
- %a = load volatile float, ptr addrspace(1) %aptr, align 4
- %b = load volatile float, ptr addrspace(1) %bptr, align 4
- %c = load volatile float, ptr addrspace(1) %cptr, align 4
- %f0 = call float @llvm.maximum.f32(float %a, float %b)
- %f1 = call float @llvm.maximum.f32(float %c, float %f0)
- store float %f1, ptr addrspace(1) %out, align 4
- ret void
-}
-
-; GCN-LABEL: {{^}}test_fmaximum3_olt_0_f16:
-; GCN: buffer_load_u16 [[REGC:v[0-9]+]]
-; GCN: buffer_load_u16 [[REGB:v[0-9]+]]
-; GCN: buffer_load_u16 [[REGA:v[0-9]+]]
-; GCN: v_maximum3_f16 [[RESULT:v[0-9]+]], [[REGC]], [[REGB]], [[REGA]]
-; GCN: buffer_store_b16 [[RESULT]],
-define amdgpu_kernel void @test_fmaximum3_olt_0_f16(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) {
- %a = load volatile half, ptr addrspace(1) %aptr, align 2
- %b = load volatile half, ptr addrspace(1) %bptr, align 2
- %c = load volatile half, ptr addrspace(1) %cptr, align 2
- %f0 = call half @llvm.maximum.f16(half %a, half %b)
- %f1 = call half @llvm.maximum.f16(half %f0, half %c)
- store half %f1, ptr addrspace(1) %out, align 2
- ret void
-}
-
-; GCN-LABEL: {{^}}test_fmaximum3_olt_1_f16:
-; GCN: buffer_load_u16 [[REGA:v[0-9]+]]
-; GCN: buffer_load_u16 [[REGB:v[0-9]+]]
-; GCN: buffer_load_u16 [[REGC:v[0-9]+]]
-; GCN: v_maximum3_f16 [[RESULT:v[0-9]+]], [[REGC]], [[REGA]], [[REGB]]
-; GCN: buffer_store_b16 [[RESULT]],
-define amdgpu_kernel void @test_fmaximum3_olt_1_f16(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) {
- %a = load volatile half, ptr addrspace(1) %aptr, align 2
- %b = load volatile half, ptr addrspace(1) %bptr, align 2
- %c = load volatile half, ptr addrspace(1) %cptr, align 2
- %f0 = call half @llvm.maximum.f16(half %a, half %b)
- %f1 = call half @llvm.maximum.f16(half %c, half %f0)
- store half %f1, ptr addrspace(1) %out, align 2
- ret void
-}
-
-; Checks whether the test passes; performMinMaxCombine() should not optimize vector patterns of maximum3
-; since there are no pack instructions for fmaximum3.
-; GCN-LABEL: {{^}}no_fmaximum3_v2f16:
-; GCN: v_pk_maximum_f16 v0, v0, v1
-; GCN: v_pk_maximum_f16 v0, v2, v0
-; GCN: v_pk_maximum_f16 v0, v0, v3
-; GCN-NEXT: s_setpc_b64
-define <2 x half> @no_fmaximum3_v2f16(<2 x half> %a, <2 x half> %b, <2 x half> %c, <2 x half> %d) {
-entry:
- %max = call <2 x half> @llvm.maximum.v2f16(<2 x half> %a, <2 x half> %b)
- %max1 = call <2 x half> @llvm.maximum.v2f16(<2 x half> %c, <2 x half> %max)
- %res = call <2 x half> @llvm.maximum.v2f16(<2 x half> %max1, <2 x half> %d)
- ret <2 x half> %res
-}
-
-; GCN-LABEL: {{^}}no_fmaximum3_olt_0_f64:
-; GCN-COUNT-2: v_maximum_f64
-define amdgpu_kernel void @no_fmaximum3_olt_0_f64(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) {
- %a = load volatile double, ptr addrspace(1) %aptr, align 4
- %b = load volatile double, ptr addrspace(1) %bptr, align 4
- %c = load volatile double, ptr addrspace(1) %cptr, align 4
- %f0 = call double @llvm.maximum.f64(double %a, double %b)
- %f1 = call double @llvm.maximum.f64(double %f0, double %c)
- store double %f1, ptr addrspace(1) %out, align 4
- ret void
-}
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -enable-var-scope -check-prefix=GFX12 %s
+
+declare half @llvm.fabs.f16(half)
+declare float @llvm.fabs.f32(float)
+declare <2 x half> @llvm.fabs.v2f16(<2 x half>)
+declare <2 x float> @llvm.fabs.v2f32(<2 x float>)
declare double @llvm.maximum.f64(double, double)
declare float @llvm.maximum.f32(float, float)
declare half @llvm.maximum.f16(half, half)
declare <2 x half> @llvm.maximum.v2f16(<2 x half>, <2 x half>)
+declare <2 x float> @llvm.maximum.v2f32(<2 x float>, <2 x float>)
+declare <2 x double> @llvm.maximum.v2f64(<2 x double>, <2 x double>)
+
+define float @v_fmaximum3_f32(float %a, float %b, float %c) {
+; GFX12-LABEL: v_fmaximum3_f32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_maximum3_f32 v0, v0, v1, v2
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+ %max0 = call float @llvm.maximum.f32(float %a, float %b)
+ %max1 = call float @llvm.maximum.f32(float %max0, float %c)
+ ret float %max1
+}
+
+define float @v_fmaximum3_f32_commute(float %a, float %b, float %c) {
+; GFX12-LABEL: v_fmaximum3_f32_commute:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_maximum3_f32 v0, v2, v0, v1
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+ %max0 = call float @llvm.maximum.f32(float %a, float %b)
+ %max1 = call float @llvm.maximum.f32(float %c, float %max0)
+ ret float %max1
+}
+
+define amdgpu_ps i32 @s_fmaximum3_f32(float inreg %a, float inreg %b, float inreg %c) {
+; GFX12-LABEL: s_fmaximum3_f32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_mov_b32_e32 v0, s2
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_maximum3_f32 v0, s0, s1, v0
+; GFX12-NEXT: v_readfirstlane_b32 s0, v0
+; GFX12-NEXT: ; return to shader part epilog
+ %max0 = call float @llvm.maximum.f32(float %a, float %b)
+ %max1 = call float @llvm.maximum.f32(float %max0, float %c)
+ %cast = bitcast float %max1 to i32
+ %readfirstlane = call i32 @llvm.amdgcn.readfirstlane(i32 %cast)
+ ret i32 %readfirstlane
+}
+
+define float @v_fmaximum3_f32_fabs0(float %a, float %b, float %c) {
+; GFX12-LABEL: v_fmaximum3_f32_fabs0:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_maximum3_f32 v0, |v0|, v1, v2
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+ %a.fabs = call float @llvm.fabs.f32(float %a)
+ %max0 = call float @llvm.maximum.f32(float %a.fabs, float %b)
+ %max1 = call float @llvm.maximum.f32(float %max0, float %c)
+ ret float %max1
+}
+
+define float @v_fmaximum3_f32_fabs1(float %a, float %b, float %c) {
+; GFX12-LABEL: v_fmaximum3_f32_fabs1:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_maximum3_f32 v0, v0, |v1|, v2
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+ %b.fabs = call float @llvm.fabs.f32(float %b)
+ %max0 = call float @llvm.maximum.f32(float %a, float %b.fabs)
+ %max1 = call float @llvm.maximum.f32(float %max0, float %c)
+ ret float %max1
+}
+
+define float @v_fmaximum3_f32_fabs2(float %a, float %b, float %c) {
+; GFX12-LABEL: v_fmaximum3_f32_fabs2:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_maximum3_f32 v0, v0, v1, |v2|
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+ %c.fabs = call float @llvm.fabs.f32(float %c)
+ %max0 = call float @llvm.maximum.f32(float %a, float %b)
+ %max1 = call float @llvm.maximum.f32(float %max0, float %c.fabs)
+ ret float %max1
+}
+
+define float @v_fmaximum3_f32_fabs_all(float %a, float %b, float %c) {
+; GFX12-LABEL: v_fmaximum3_f32_fabs_all:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_maximum3_f32 v0, |v0|, |v1|, |v2|
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+ %a.fabs = call float @llvm.fabs.f32(float %a)
+ %b.fabs = call float @llvm.fabs.f32(float %b)
+ %c.fabs = call float @llvm.fabs.f32(float %c)
+ %max0 = call float @llvm.maximum.f32(float %a.fabs, float %b.fabs)
+ %max1 = call float @llvm.maximum.f32(float %max0, float %c.fabs)
+ ret float %max1
+}
+
+define float @v_fmaximum3_f32_fneg_all(float %a, float %b, float %c) {
+; GFX12-LABEL: v_fmaximum3_f32_fneg_all:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_maximum3_f32 v0, -v0, -v1, -v2
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+ %a.fneg = fneg float %a
+ %b.fneg = fneg float %b
+ %c.fneg = fneg float %c
+ %max0 = call float @llvm.maximum.f32(float %a.fneg, float %b.fneg)
+ %max1 = call float @llvm.maximum.f32(float %max0, float %c.fneg)
+ ret float %max1
+}
+
+define float @v_fmaximum3_f32_fneg_fabs_all(float %a, float %b, float %c) {
+; GFX12-LABEL: v_fmaximum3_f32_fneg_fabs_all:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_maximum3_f32 v0, -|v0|, -|v1|, -|v2|
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+ %a.fabs = call float @llvm.fabs.f32(float %a)
+ %b.fabs = call float @llvm.fabs.f32(float %b)
+ %c.fabs = call float @llvm.fabs.f32(float %c)
+ %a.fneg.fabs = fneg float %a.fabs
+ %b.fneg.fabs = fneg float %b.fabs
+ %c.fneg.fabs = fneg float %c.fabs
+ %max0 = call float @llvm.maximum.f32(float %a.fneg.fabs, float %b.fneg.fabs)
+ %max1 = call float @llvm.maximum.f32(float %max0, float %c.fneg.fabs)
+ ret float %max1
+}
+
+define float @v_fmaximum3_f32_fneg0(float %a, float %b, float %c) {
+; GFX12-LABEL: v_fmaximum3_f32_fneg0:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_maximum3_f32 v0, -v0, v1, v2
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+ %a.fneg = fneg float %a
+ %max0 = call float @llvm.maximum.f32(float %a.fneg, float %b)
+ %max1 = call float @llvm.maximum.f32(float %max0, float %c)
+ ret float %max1
+}
+
+define float @v_fmaximum3_f32_fneg1(float %a, float %b, float %c) {
+; GFX12-LABEL: v_fmaximum3_f32_fneg1:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_maximum3_f32 v0, v0, -v1, v2
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+ %b.fneg = fneg float %b
+ %max0 = call float @llvm.maximum.f32(float %a, float %b.fneg)
+ %max1 = call float @llvm.maximum.f32(float %max0, float %c)
+ ret float %max1
+}
+
+define float @v_fmaximum3_f32_fneg2(float %a, float %b, float %c) {
+; GFX12-LABEL: v_fmaximum3_f32_fneg2:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_maximum3_f32 v0, v0, v1, -v2
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+ %c.fneg = fneg float %c
+ %max0 = call float @llvm.maximum.f32(float %a, float %b)
+ %max1 = call float @llvm.maximum.f32(float %max0, float %c.fneg)
+ ret float %max1
+}
+
+define float @v_fmaximum3_f32_const0(float %b, float %c) {
+; GFX12-LABEL: v_fmaximum3_f32_const0:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_maximum3_f32 v0, v0, 0x41000000, v1
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+ %max0 = call float @llvm.maximum.f32(float 8.0, float %b)
+ %max1 = call float @llvm.maximum.f32(float %max0, float %c)
+ ret float %max1
+}
+
+define float @v_fmaximum3_f32__const2(float %a, float %b) {
+; GFX12-LABEL: v_fmaximum3_f32__const2:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_maximum3_f32 v0, v0, v1, 0x41000000
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+ %max0 = call float @llvm.maximum.f32(float %a, float %b)
+ %max1 = call float @llvm.maximum.f32(float %max0, float 8.0)
+ ret float %max1
+}
+
+define float @v_fmaximum3_f32_inlineimm0(float %b, float %c) {
+; GFX12-LABEL: v_fmaximum3_f32_inlineimm0:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_maximum3_f32 v0, v0, 4.0, v1
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+ %max0 = call float @llvm.maximum.f32(float 4.0, float %b)
+ %max1 = call float @llvm.maximum.f32(float %max0, float %c)
+ ret float %max1
+}
+
+define float @v_fmaximum3_f32__inlineimm(float %a, float %b) {
+; GFX12-LABEL: v_fmaximum3_f32__inlineimm:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_maximum3_f32 v0, v0, v1, 4.0
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+ %max0 = call float @llvm.maximum.f32(float %a, float %b)
+ %max1 = call float @llvm.maximum.f32(float %max0, float 4.0)
+ ret float %max1
+}
+
+define float @v_fmaximum3_f32_const1_const2(float %a) {
+; GFX12-LABEL: v_fmaximum3_f32_const1_const2:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: s_mov_b32 s0, 0x41000000
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: v_maximum3_f32 v0, v0, s0, 0x41800000
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+ %max0 = call float @llvm.maximum.f32(float %a, float 8.0)
+ %max1 = call float @llvm.maximum.f32(float %max0, float 16.0)
+ ret float %max1
+}
+
+define <2 x float> @v_fmaximum3_v2f32(<2 x float> %a, <2 x float> %b, <2 x float> %c) {
+; GFX12-LABEL: v_fmaximum3_v2f32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_maximum3_f32 v0, v4, v0, v2
+; GFX12-NEXT: v_maximum3_f32 v1, v5, v1, v3
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+ %max0 = call <2 x float> @llvm.maximum.v2f32(<2 x float> %a, <2 x float> %b)
+ %max1 = call <2 x float> @llvm.maximum.v2f32(<2 x float> %c, <2 x float> %max0)
+ ret <2 x float> %max1
+}
+
+define <2 x float> @v_fmaximum3_v2f32_commute(<2 x float> %a, <2 x float> %b, <2 x float> %c) {
+; GFX12-LABEL: v_fmaximum3_v2f32_commute:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_maximum3_f32 v0, v0, v2, v4
+; GFX12-NEXT: v_maximum3_f32 v1, v1, v3, v5
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+ %max0 = call <2 x float> @llvm.maximum.v2f32(<2 x float> %a, <2 x float> %b)
+ %max1 = call <2 x float> @llvm.maximum.v2f32(<2 x float> %max0, <2 x float> %c)
+ ret <2 x float> %max1
+}
+
+define <2 x float> @v_fmaximum3_v2f32__fabs_all(<2 x float> %a, <2 x float> %b, <2 x float> %c) {
+; GFX12-LABEL: v_fmaximum3_v2f32__fabs_all:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_maximum3_f32 v0, |v0|, |v2|, |v4|
+; GFX12-NEXT: v_maximum3_f32 v1, |v1|, |v3|, |v5|
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+ %a.fabs = call <2 x float> @llvm.fabs.v2f32(<2 x float> %a)
+ %b.fabs = call <2 x float> @llvm.fabs.v2f32(<2 x float> %b)
+ %c.fabs = call <2 x float> @llvm.fabs.v2f32(<2 x float> %c)
+ %max0 = call <2 x float> @llvm.maximum.v2f32(<2 x float> %a.fabs, <2 x float> %b.fabs)
+ %max1 = call <2 x float> @llvm.maximum.v2f32(<2 x float> %max0, <2 x float> %c.fabs)
+ ret <2 x float> %max1
+}
+
+define <2 x float> @v_fmaximum3_v2f32__fneg_all(<2 x float> %a, <2 x float> %b, <2 x float> %c) {
+; GFX12-LABEL: v_fmaximum3_v2f32__fneg_all:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_maximum3_f32 v0, -v0, -v2, -v4
+; GFX12-NEXT: v_maximum3_f32 v1, -v1, -v3, -v5
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+ %a.fneg = fneg <2 x float> %a
+ %b.fneg = fneg <2 x float> %b
+ %c.fneg = fneg <2 x float> %c
+ %max0 = call <2 x float> @llvm.maximum.v2f32(<2 x float> %a.fneg, <2 x float> %b.fneg)
+ %max1 = call <2 x float> @llvm.maximum.v2f32(<2 x float> %max0, <2 x float> %c.fneg)
+ ret <2 x float> %max1
+}
+
+define <2 x float> @v_fmaximum3_v2f32__inlineimm1(<2 x float> %a, <2 x float> %c) {
+; GFX12-LABEL: v_fmaximum3_v2f32__inlineimm1:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_maximum3_f32 v0, v0, 2.0, v2
+; GFX12-NEXT: v_maximum3_f32 v1, v1, 2.0, v3
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+ %max0 = call <2 x float> @llvm.maximum.v2f32(<2 x float> %a, <2 x float> <float 2.0, float 2.0>)
+ %max1 = call <2 x float> @llvm.maximum.v2f32(<2 x float> %max0, <2 x float> %c)
+ ret <2 x float> %max1
+}
+
+define <2 x float> @v_fmaximum3_v2f32__inlineimm2(<2 x float> %a, <2 x float> %b, <2 x float> %c) {
+; GFX12-LABEL: v_fmaximum3_v2f32__inlineimm2:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_maximum_f32 v0, v4, 4.0
+; GFX12-NEXT: v_maximum_f32 v1, v5, 4.0
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+ %max0 = call <2 x float> @llvm.maximum.v2f32(<2 x float> %a, <2 x float> %b)
+ %max1 = call <...
[truncated]
|
6d443aa
to
1920ab0
Compare
1920ab0
to
f7938ae
Compare
These were using patterns copied from older tests, before non-kernel functions were supported and manually written checks. Also stop using -flat-for-global, which only exists to try to share tests between SI/CI and VI+. This was also missing test coverage, we're incorrectly forming maximum3/minimum3 pre-gfx12. This is a pre-commit before fixing that.
f7938ae
to
f64f065
Compare
These were using patterns copied from older tests, before non-kernel functions were supported and manually written checks. Also stop using -flat-for-global, which only exists to try to share tests between SI/CI and VI+.
This was also missing test coverage, we're incorrectly forming maximum3/minimum3 pre-gfx12. This is a pre-commit before fixing that.