Skip to content

Commit f608ac6

Browse files
committed
AMDGPU: Push fneg into bitcast of integer select
Avoids some regressions in the math libraries in a future patch.
1 parent 6d3b779 commit f608ac6

File tree

3 files changed

+92
-80
lines changed

3 files changed

+92
-80
lines changed

llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp

Lines changed: 25 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -596,9 +596,12 @@ static bool fnegFoldsIntoOp(const SDNode *N) {
596596
// TODO: Is there a benefit to checking the conditions performFNegCombine
597597
// does? We don't for the other cases.
598598
SDValue BCSrc = N->getOperand(0);
599-
return BCSrc.getOpcode() == ISD::BUILD_VECTOR &&
600-
BCSrc.getNumOperands() == 2 &&
601-
BCSrc.getOperand(1).getValueSizeInBits() == 32;
599+
if (BCSrc.getOpcode() == ISD::BUILD_VECTOR) {
600+
return BCSrc.getNumOperands() == 2 &&
601+
BCSrc.getOperand(1).getValueSizeInBits() == 32;
602+
}
603+
604+
return BCSrc.getOpcode() == ISD::SELECT && BCSrc.getValueType() == MVT::f32;
602605
}
603606

604607
return fnegFoldsIntoOpcode(Opc);
@@ -4182,6 +4185,25 @@ SDValue AMDGPUTargetLowering::performFNegCombine(SDNode *N,
41824185
return Result;
41834186
}
41844187

4188+
if (BCSrc.getOpcode() == ISD::SELECT && VT == MVT::f32) {
4189+
// fneg (bitcast (f32 (select cond, i32:lhs, i32:rhs))) ->
4190+
// select cond, (fneg (bitcast i32:lhs to f32)), (fneg (bitcast i32:rhs to f32))
4191+
SDValue LHS =
4192+
DAG.getNode(ISD::BITCAST, SL, MVT::f32, BCSrc.getOperand(1));
4193+
SDValue RHS =
4194+
DAG.getNode(ISD::BITCAST, SL, MVT::f32, BCSrc.getOperand(2));
4195+
4196+
SDValue NegLHS = DAG.getNode(ISD::FNEG, SL, MVT::f32, LHS);
4197+
SDValue NegRHS = DAG.getNode(ISD::FNEG, SL, MVT::f32, RHS);
4198+
4199+
SDValue NewSelect = DAG.getNode(ISD::SELECT, SL, MVT::f32,
4200+
BCSrc.getOperand(0), NegLHS, NegRHS);
4201+
if (!BCSrc.hasOneUse())
4202+
DAG.ReplaceAllUsesWith(BCSrc,
4203+
DAG.getNode(ISD::FNEG, SL, VT, NewSelect));
4204+
return NewSelect;
4205+
}
4206+
41854207
return SDValue();
41864208
}
41874209
default:

llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll

Lines changed: 37 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -3023,16 +3023,17 @@ define amdgpu_kernel void @s_fneg_select_infloop_regression_f64(double %arg, i1
30233023
; SI-NEXT: s_load_dword s4, s[0:1], 0xb
30243024
; SI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9
30253025
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
3026+
; SI-NEXT: v_bfrev_b32_e32 v0, 1
30263027
; SI-NEXT: s_waitcnt lgkmcnt(0)
3027-
; SI-NEXT: s_and_b32 s4, 1, s4
3028-
; SI-NEXT: s_cselect_b32 s3, 0, s3
3029-
; SI-NEXT: s_xor_b32 s3, s3, 0x80000000
3030-
; SI-NEXT: s_cmp_eq_u32 s4, 1
3028+
; SI-NEXT: s_bitcmp1_b32 s4, 0
3029+
; SI-NEXT: s_cselect_b64 s[4:5], -1, 0
3030+
; SI-NEXT: v_mov_b32_e32 v1, s3
3031+
; SI-NEXT: s_and_b64 s[6:7], s[4:5], exec
3032+
; SI-NEXT: v_cndmask_b32_e64 v0, -v1, v0, s[4:5]
30313033
; SI-NEXT: s_cselect_b32 s2, 0, s2
3032-
; SI-NEXT: s_cselect_b32 s3, 0, s3
30333034
; SI-NEXT: v_mov_b32_e32 v3, s1
3035+
; SI-NEXT: v_cndmask_b32_e64 v1, v0, 0, s[4:5]
30343036
; SI-NEXT: v_mov_b32_e32 v0, s2
3035-
; SI-NEXT: v_mov_b32_e32 v1, s3
30363037
; SI-NEXT: v_mov_b32_e32 v2, s0
30373038
; SI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
30383039
; SI-NEXT: s_endpgm
@@ -3042,16 +3043,17 @@ define amdgpu_kernel void @s_fneg_select_infloop_regression_f64(double %arg, i1
30423043
; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
30433044
; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
30443045
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
3046+
; VI-NEXT: v_bfrev_b32_e32 v0, 1
30453047
; VI-NEXT: s_waitcnt lgkmcnt(0)
3046-
; VI-NEXT: s_and_b32 s4, 1, s4
3047-
; VI-NEXT: s_cselect_b32 s3, 0, s3
3048-
; VI-NEXT: s_xor_b32 s3, s3, 0x80000000
3049-
; VI-NEXT: s_cmp_eq_u32 s4, 1
3048+
; VI-NEXT: s_bitcmp1_b32 s4, 0
3049+
; VI-NEXT: s_cselect_b64 s[4:5], -1, 0
3050+
; VI-NEXT: v_mov_b32_e32 v1, s3
3051+
; VI-NEXT: s_and_b64 s[6:7], s[4:5], exec
3052+
; VI-NEXT: v_cndmask_b32_e64 v0, -v1, v0, s[4:5]
30503053
; VI-NEXT: s_cselect_b32 s2, 0, s2
3051-
; VI-NEXT: s_cselect_b32 s3, 0, s3
30523054
; VI-NEXT: v_mov_b32_e32 v3, s1
3055+
; VI-NEXT: v_cndmask_b32_e64 v1, v0, 0, s[4:5]
30533056
; VI-NEXT: v_mov_b32_e32 v0, s2
3054-
; VI-NEXT: v_mov_b32_e32 v1, s3
30553057
; VI-NEXT: v_mov_b32_e32 v2, s0
30563058
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
30573059
; VI-NEXT: s_endpgm
@@ -3067,9 +3069,9 @@ define double @v_fneg_select_infloop_regression_f64(double %arg, i1 %arg1) {
30673069
; GCN: ; %bb.0:
30683070
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
30693071
; GCN-NEXT: v_and_b32_e32 v2, 1, v2
3072+
; GCN-NEXT: v_bfrev_b32_e32 v3, 1
30703073
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2
3071-
; GCN-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc
3072-
; GCN-NEXT: v_xor_b32_e32 v1, 0x80000000, v1
3074+
; GCN-NEXT: v_cndmask_b32_e64 v1, -v1, v3, vcc
30733075
; GCN-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
30743076
; GCN-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc
30753077
; GCN-NEXT: s_setpc_b64 s[30:31]
@@ -3221,19 +3223,17 @@ define amdgpu_kernel void @s_fneg_select_infloop_regression_v2f32(<2 x float> %a
32213223
; SI-NEXT: s_load_dword s4, s[0:1], 0xb
32223224
; SI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9
32233225
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
3226+
; SI-NEXT: v_bfrev_b32_e32 v0, 1
32243227
; SI-NEXT: s_waitcnt lgkmcnt(0)
3225-
; SI-NEXT: s_and_b32 s4, 1, s4
3226-
; SI-NEXT: s_cselect_b32 s2, 0, s2
3227-
; SI-NEXT: s_xor_b32 s2, s2, 0x80000000
3228-
; SI-NEXT: s_cmp_eq_u32 s4, 1
3229-
; SI-NEXT: s_cselect_b32 s3, 0, s3
3230-
; SI-NEXT: s_cselect_b32 s2, 0, s2
3231-
; SI-NEXT: s_xor_b32 s3, s3, 0x80000000
3232-
; SI-NEXT: s_cmp_eq_u32 s4, 1
3233-
; SI-NEXT: v_mov_b32_e32 v0, s2
3234-
; SI-NEXT: s_cselect_b32 s2, 0, s3
3235-
; SI-NEXT: v_mov_b32_e32 v3, s1
3228+
; SI-NEXT: s_bitcmp1_b32 s4, 0
32363229
; SI-NEXT: v_mov_b32_e32 v1, s2
3230+
; SI-NEXT: s_cselect_b64 s[4:5], -1, 0
3231+
; SI-NEXT: v_cndmask_b32_e64 v2, -v1, v0, s[4:5]
3232+
; SI-NEXT: v_mov_b32_e32 v1, s3
3233+
; SI-NEXT: v_cndmask_b32_e64 v0, -v1, v0, s[4:5]
3234+
; SI-NEXT: v_cndmask_b32_e64 v1, v0, 0, s[4:5]
3235+
; SI-NEXT: v_cndmask_b32_e64 v0, v2, 0, s[4:5]
3236+
; SI-NEXT: v_mov_b32_e32 v3, s1
32373237
; SI-NEXT: v_mov_b32_e32 v2, s0
32383238
; SI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
32393239
; SI-NEXT: s_endpgm
@@ -3243,19 +3243,17 @@ define amdgpu_kernel void @s_fneg_select_infloop_regression_v2f32(<2 x float> %a
32433243
; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
32443244
; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
32453245
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
3246+
; VI-NEXT: v_bfrev_b32_e32 v0, 1
32463247
; VI-NEXT: s_waitcnt lgkmcnt(0)
3247-
; VI-NEXT: s_and_b32 s4, 1, s4
3248-
; VI-NEXT: s_cselect_b32 s2, 0, s2
3249-
; VI-NEXT: s_xor_b32 s2, s2, 0x80000000
3250-
; VI-NEXT: s_cmp_eq_u32 s4, 1
3251-
; VI-NEXT: s_cselect_b32 s3, 0, s3
3252-
; VI-NEXT: s_cselect_b32 s2, 0, s2
3253-
; VI-NEXT: s_xor_b32 s3, s3, 0x80000000
3254-
; VI-NEXT: s_cmp_eq_u32 s4, 1
3255-
; VI-NEXT: v_mov_b32_e32 v0, s2
3256-
; VI-NEXT: s_cselect_b32 s2, 0, s3
3257-
; VI-NEXT: v_mov_b32_e32 v3, s1
3248+
; VI-NEXT: s_bitcmp1_b32 s4, 0
32583249
; VI-NEXT: v_mov_b32_e32 v1, s2
3250+
; VI-NEXT: s_cselect_b64 s[4:5], -1, 0
3251+
; VI-NEXT: v_cndmask_b32_e64 v2, -v1, v0, s[4:5]
3252+
; VI-NEXT: v_mov_b32_e32 v1, s3
3253+
; VI-NEXT: v_cndmask_b32_e64 v0, -v1, v0, s[4:5]
3254+
; VI-NEXT: v_cndmask_b32_e64 v1, v0, 0, s[4:5]
3255+
; VI-NEXT: v_cndmask_b32_e64 v0, v2, 0, s[4:5]
3256+
; VI-NEXT: v_mov_b32_e32 v3, s1
32593257
; VI-NEXT: v_mov_b32_e32 v2, s0
32603258
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
32613259
; VI-NEXT: s_endpgm
@@ -3271,11 +3269,10 @@ define <2 x float> @v_fneg_select_infloop_regression_v2f32(<2 x float> %arg, i1
32713269
; GCN: ; %bb.0:
32723270
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
32733271
; GCN-NEXT: v_and_b32_e32 v2, 1, v2
3272+
; GCN-NEXT: v_bfrev_b32_e32 v3, 1
32743273
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2
3275-
; GCN-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
3276-
; GCN-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc
3277-
; GCN-NEXT: v_xor_b32_e32 v1, 0x80000000, v1
3278-
; GCN-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
3274+
; GCN-NEXT: v_cndmask_b32_e64 v1, -v1, v3, vcc
3275+
; GCN-NEXT: v_cndmask_b32_e64 v0, -v0, v3, vcc
32793276
; GCN-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
32803277
; GCN-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc
32813278
; GCN-NEXT: s_setpc_b64 s[30:31]

llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll

Lines changed: 30 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -398,19 +398,18 @@ define double @fneg_xor_select_f64(i1 %cond, double %arg0, double %arg1) {
398398
; GCN-NEXT: v_and_b32_e32 v0, 1, v0
399399
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
400400
; GCN-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc
401-
; GCN-NEXT: v_cndmask_b32_e32 v1, v4, v2, vcc
402-
; GCN-NEXT: v_xor_b32_e32 v1, 0x80000000, v1
401+
; GCN-NEXT: v_cndmask_b32_e64 v1, -v4, -v2, vcc
403402
; GCN-NEXT: s_setpc_b64 s[30:31]
404403
;
405404
; GFX11-LABEL: fneg_xor_select_f64:
406405
; GFX11: ; %bb.0:
407406
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
408407
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
409408
; GFX11-NEXT: v_and_b32_e32 v0, 1, v0
410-
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
409+
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
411410
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
412-
; GFX11-NEXT: v_dual_cndmask_b32 v0, v3, v1 :: v_dual_cndmask_b32 v1, v4, v2
413-
; GFX11-NEXT: v_xor_b32_e32 v1, 0x80000000, v1
411+
; GFX11-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc_lo
412+
; GFX11-NEXT: v_cndmask_b32_e64 v1, -v4, -v2, vcc_lo
414413
; GFX11-NEXT: s_setpc_b64 s[30:31]
415414
%select = select i1 %cond, double %arg0, double %arg1
416415
%fneg = fneg double %select
@@ -422,38 +421,38 @@ define double @fneg_xor_select_f64_multi_user(i1 %cond, double %arg0, double %ar
422421
; GFX7: ; %bb.0:
423422
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
424423
; GFX7-NEXT: v_and_b32_e32 v0, 1, v0
425-
; GFX7-NEXT: v_mov_b32_e32 v7, v1
426424
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
427-
; GFX7-NEXT: v_cndmask_b32_e32 v1, v4, v2, vcc
428-
; GFX7-NEXT: v_cndmask_b32_e32 v0, v3, v7, vcc
425+
; GFX7-NEXT: v_cndmask_b32_e64 v2, -v4, -v2, vcc
426+
; GFX7-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc
427+
; GFX7-NEXT: v_xor_b32_e32 v1, 0x80000000, v2
429428
; GFX7-NEXT: flat_store_dwordx2 v[5:6], v[0:1]
430-
; GFX7-NEXT: v_xor_b32_e32 v1, 0x80000000, v1
429+
; GFX7-NEXT: v_mov_b32_e32 v1, v2
431430
; GFX7-NEXT: s_waitcnt vmcnt(0)
432431
; GFX7-NEXT: s_setpc_b64 s[30:31]
433432
;
434433
; GFX9-LABEL: fneg_xor_select_f64_multi_user:
435434
; GFX9: ; %bb.0:
436435
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
437436
; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
438-
; GFX9-NEXT: v_mov_b32_e32 v7, v1
439437
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
440-
; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v2, vcc
441-
; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v7, vcc
438+
; GFX9-NEXT: v_cndmask_b32_e64 v2, -v4, -v2, vcc
439+
; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc
440+
; GFX9-NEXT: v_xor_b32_e32 v1, 0x80000000, v2
442441
; GFX9-NEXT: global_store_dwordx2 v[5:6], v[0:1], off
443-
; GFX9-NEXT: v_xor_b32_e32 v1, 0x80000000, v1
442+
; GFX9-NEXT: v_mov_b32_e32 v1, v2
444443
; GFX9-NEXT: s_waitcnt vmcnt(0)
445444
; GFX9-NEXT: s_setpc_b64 s[30:31]
446445
;
447446
; GFX11-LABEL: fneg_xor_select_f64_multi_user:
448447
; GFX11: ; %bb.0:
449448
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
450449
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
451-
; GFX11-NEXT: v_dual_mov_b32 v7, v1 :: v_dual_and_b32 v0, 1, v0
452-
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
450+
; GFX11-NEXT: v_and_b32_e32 v0, 1, v0
451+
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
453452
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
454-
; GFX11-NEXT: v_dual_cndmask_b32 v1, v4, v2 :: v_dual_cndmask_b32 v0, v3, v7
455-
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
456-
; GFX11-NEXT: v_xor_b32_e32 v2, 0x80000000, v1
453+
; GFX11-NEXT: v_cndmask_b32_e64 v2, -v4, -v2, vcc_lo
454+
; GFX11-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc_lo
455+
; GFX11-NEXT: v_xor_b32_e32 v1, 0x80000000, v2
457456
; GFX11-NEXT: global_store_b64 v[5:6], v[0:1], off
458457
; GFX11-NEXT: v_mov_b32_e32 v1, v2
459458
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
@@ -497,14 +496,13 @@ define double @select_fneg_select_fneg_f64(i1 %cond0, i1 %cond1, double %arg0, d
497496
; GCN: ; %bb.0:
498497
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
499498
; GCN-NEXT: v_and_b32_e32 v0, 1, v0
499+
; GCN-NEXT: v_and_b32_e32 v1, 1, v1
500500
; GCN-NEXT: v_xor_b32_e32 v3, 0x80000000, v3
501501
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
502-
; GCN-NEXT: v_and_b32_e32 v1, 1, v1
503502
; GCN-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc
504-
; GCN-NEXT: v_cndmask_b32_e32 v2, v3, v5, vcc
505-
; GCN-NEXT: v_xor_b32_e32 v3, 0x80000000, v2
503+
; GCN-NEXT: v_cndmask_b32_e64 v2, -v3, -v5, vcc
506504
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1
507-
; GCN-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
505+
; GCN-NEXT: v_cndmask_b32_e64 v1, -v2, v2, vcc
508506
; GCN-NEXT: s_setpc_b64 s[30:31]
509507
;
510508
; GFX11-LABEL: select_fneg_select_fneg_f64:
@@ -513,16 +511,13 @@ define double @select_fneg_select_fneg_f64(i1 %cond0, i1 %cond1, double %arg0, d
513511
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
514512
; GFX11-NEXT: v_and_b32_e32 v0, 1, v0
515513
; GFX11-NEXT: v_xor_b32_e32 v3, 0x80000000, v3
516-
; GFX11-NEXT: v_and_b32_e32 v1, 1, v1
517-
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
514+
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
518515
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
519-
; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo
520-
; GFX11-NEXT: v_cndmask_b32_e32 v2, v3, v5, vcc_lo
521-
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
516+
; GFX11-NEXT: v_dual_cndmask_b32 v0, v2, v4 :: v_dual_and_b32 v1, 1, v1
517+
; GFX11-NEXT: v_cndmask_b32_e64 v2, -v3, -v5, vcc_lo
518+
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
522519
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1
523-
; GFX11-NEXT: v_xor_b32_e32 v3, 0x80000000, v2
524-
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
525-
; GFX11-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo
520+
; GFX11-NEXT: v_cndmask_b32_e64 v1, -v2, v2, vcc_lo
526521
; GFX11-NEXT: s_setpc_b64 s[30:31]
527522
%fneg0 = fneg double %arg0
528523
%select0 = select i1 %cond0, double %arg1, double %fneg0
@@ -894,10 +889,9 @@ define double @cospiD_pattern1(i32 %arg, double %arg1, double %arg2) {
894889
; GCN-NEXT: v_and_b32_e32 v5, 1, v0
895890
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5
896891
; GCN-NEXT: v_cndmask_b32_e32 v3, v1, v3, vcc
897-
; GCN-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc
898-
; GCN-NEXT: v_xor_b32_e32 v2, 0x80000000, v1
892+
; GCN-NEXT: v_cndmask_b32_e64 v1, -v2, -v4, vcc
899893
; GCN-NEXT: v_cmp_lt_i32_e32 vcc, 1, v0
900-
; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
894+
; GCN-NEXT: v_cndmask_b32_e64 v1, -v1, v1, vcc
901895
; GCN-NEXT: v_mov_b32_e32 v0, v3
902896
; GCN-NEXT: s_setpc_b64 s[30:31]
903897
;
@@ -909,12 +903,11 @@ define double @cospiD_pattern1(i32 %arg, double %arg1, double %arg2) {
909903
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_3)
910904
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v5
911905
; GFX11-NEXT: v_cndmask_b32_e32 v3, v1, v3, vcc_lo
912-
; GFX11-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc_lo
906+
; GFX11-NEXT: v_cndmask_b32_e64 v1, -v2, -v4, vcc_lo
913907
; GFX11-NEXT: v_cmp_lt_i32_e32 vcc_lo, 1, v0
914908
; GFX11-NEXT: v_mov_b32_e32 v0, v3
915-
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
916-
; GFX11-NEXT: v_xor_b32_e32 v2, 0x80000000, v1
917-
; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo
909+
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
910+
; GFX11-NEXT: v_cndmask_b32_e64 v1, -v1, v1, vcc_lo
918911
; GFX11-NEXT: s_setpc_b64 s[30:31]
919912
%i = and i32 %arg, 1
920913
%i3 = icmp eq i32 %i, 0

0 commit comments

Comments
 (0)