Skip to content

Commit a564425

Browse files
authored
[AMDGPU] Make <2 x bfloat> fneg legal (#142870)
1 parent 25642ea commit a564425

File tree

3 files changed, with 75 additions and 167 deletions (+75 / -167 lines changed).

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -751,7 +751,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
751751
setOperationAction(ISD::BUILD_VECTOR, {MVT::v2i16, MVT::v2f16, MVT::v2bf16},
752752
Subtarget->hasVOP3PInsts() ? Legal : Custom);
753753

754-
setOperationAction(ISD::FNEG, MVT::v2f16, Legal);
754+
setOperationAction(ISD::FNEG, {MVT::v2f16, MVT::v2bf16}, Legal);
755755
// This isn't really legal, but this avoids the legalizer unrolling it (and
756756
// allows matching fneg (fabs x) patterns)
757757
setOperationAction(ISD::FABS, MVT::v2f16, Legal);

llvm/lib/Target/AMDGPU/SIInstructions.td

Lines changed: 6 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -1830,10 +1830,12 @@ def : GCNPat <
18301830
>;
18311831
} // End foreach fp16vt = ...
18321832

1833+
foreach v2fp16vt = [v2f16, v2bf16] in {
18331834
def : GCNPat <
1834-
(UniformUnaryFrag<fneg> (v2f16 SReg_32:$src)),
1835+
(UniformUnaryFrag<fneg> (v2fp16vt SReg_32:$src)),
18351836
(S_XOR_B32 SReg_32:$src, (S_MOV_B32 (i32 0x80008000)))
18361837
>;
1838+
}
18371839

18381840
def : GCNPat <
18391841
(UniformUnaryFrag<fabs> (v2f16 SReg_32:$src)),
@@ -1974,10 +1976,12 @@ def : GCNPat <
19741976
} // End SubtargetPredicate = UseRealTrue16Insts
19751977
} // End foreach fp16vt = ...
19761978

1979+
foreach v2fp16vt = [v2f16, v2bf16] in {
19771980
def : GCNPat <
1978-
(fneg (v2f16 VGPR_32:$src)),
1981+
(fneg (v2fp16vt VGPR_32:$src)),
19791982
(V_XOR_B32_e64 (S_MOV_B32 (i32 0x80008000)), VGPR_32:$src)
19801983
>;
1984+
}
19811985

19821986
def : GCNPat <
19831987
(fabs (v2f16 VGPR_32:$src)),

llvm/test/CodeGen/AMDGPU/fneg.bf16.ll

Lines changed: 68 additions & 164 deletions
Original file line number | Diff line number | Diff line change
@@ -383,12 +383,7 @@ define amdgpu_kernel void @s_fneg_v2bf16(ptr addrspace(1) %out, <2 x bfloat> %in
383383
; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13
384384
; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
385385
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
386-
; GFX8-NEXT: s_xor_b32 s3, s2, 0x8000
387-
; GFX8-NEXT: s_lshr_b32 s2, s2, 16
388-
; GFX8-NEXT: s_xor_b32 s2, s2, 0x8000
389-
; GFX8-NEXT: s_and_b32 s3, 0xffff, s3
390-
; GFX8-NEXT: s_lshl_b32 s2, s2, 16
391-
; GFX8-NEXT: s_or_b32 s2, s3, s2
386+
; GFX8-NEXT: s_xor_b32 s2, s2, 0x80008000
392387
; GFX8-NEXT: v_mov_b32_e32 v0, s0
393388
; GFX8-NEXT: v_mov_b32_e32 v1, s1
394389
; GFX8-NEXT: v_mov_b32_e32 v2, s2
@@ -401,44 +396,22 @@ define amdgpu_kernel void @s_fneg_v2bf16(ptr addrspace(1) %out, <2 x bfloat> %in
401396
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
402397
; GFX9-NEXT: v_mov_b32_e32 v0, 0
403398
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
404-
; GFX9-NEXT: s_xor_b32 s3, s2, 0x8000
405-
; GFX9-NEXT: s_lshr_b32 s2, s2, 16
406-
; GFX9-NEXT: s_xor_b32 s2, s2, 0x8000
407-
; GFX9-NEXT: s_pack_ll_b32_b16 s2, s3, s2
399+
; GFX9-NEXT: s_xor_b32 s2, s2, 0x80008000
408400
; GFX9-NEXT: v_mov_b32_e32 v1, s2
409401
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
410402
; GFX9-NEXT: s_endpgm
411403
;
412-
; GFX11-TRUE16-LABEL: s_fneg_v2bf16:
413-
; GFX11-TRUE16: ; %bb.0:
414-
; GFX11-TRUE16-NEXT: s_clause 0x1
415-
; GFX11-TRUE16-NEXT: s_load_b32 s2, s[4:5], 0x8
416-
; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
417-
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
418-
; GFX11-TRUE16-NEXT: s_mov_b32 s3, s2
419-
; GFX11-TRUE16-NEXT: s_lshr_b32 s2, s2, 16
420-
; GFX11-TRUE16-NEXT: s_xor_b32 s3, s3, 0x8000
421-
; GFX11-TRUE16-NEXT: s_xor_b32 s2, s2, 0x8000
422-
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
423-
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s3, s2
424-
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
425-
; GFX11-TRUE16-NEXT: global_store_b32 v0, v1, s[0:1]
426-
; GFX11-TRUE16-NEXT: s_endpgm
427-
;
428-
; GFX11-FAKE16-LABEL: s_fneg_v2bf16:
429-
; GFX11-FAKE16: ; %bb.0:
430-
; GFX11-FAKE16-NEXT: s_clause 0x1
431-
; GFX11-FAKE16-NEXT: s_load_b32 s2, s[4:5], 0x8
432-
; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
433-
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
434-
; GFX11-FAKE16-NEXT: s_lshr_b32 s3, s2, 16
435-
; GFX11-FAKE16-NEXT: s_xor_b32 s2, s2, 0x8000
436-
; GFX11-FAKE16-NEXT: s_xor_b32 s3, s3, 0x8000
437-
; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
438-
; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s2, s3
439-
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
440-
; GFX11-FAKE16-NEXT: global_store_b32 v0, v1, s[0:1]
441-
; GFX11-FAKE16-NEXT: s_endpgm
404+
; GFX11-LABEL: s_fneg_v2bf16:
405+
; GFX11: ; %bb.0:
406+
; GFX11-NEXT: s_clause 0x1
407+
; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x8
408+
; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
409+
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
410+
; GFX11-NEXT: s_xor_b32 s2, s2, 0x80008000
411+
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
412+
; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
413+
; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
414+
; GFX11-NEXT: s_endpgm
442415
%fneg = fsub <2 x bfloat> <bfloat -0.0, bfloat -0.0>, %in
443416
store <2 x bfloat> %fneg, ptr addrspace(1) %out
444417
ret void
@@ -473,15 +446,10 @@ define amdgpu_kernel void @s_fneg_v2bf16_nonload(ptr addrspace(1) %out) #0 {
473446
; GFX8-NEXT: ;;#ASMSTART
474447
; GFX8-NEXT: ; def s2
475448
; GFX8-NEXT: ;;#ASMEND
476-
; GFX8-NEXT: s_xor_b32 s3, s2, 0x8000
477-
; GFX8-NEXT: s_lshr_b32 s2, s2, 16
478-
; GFX8-NEXT: s_xor_b32 s2, s2, 0x8000
479-
; GFX8-NEXT: s_and_b32 s3, 0xffff, s3
480-
; GFX8-NEXT: s_lshl_b32 s2, s2, 16
481-
; GFX8-NEXT: s_or_b32 s2, s3, s2
449+
; GFX8-NEXT: s_xor_b32 s2, s2, 0x80008000
450+
; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13
482451
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
483452
; GFX8-NEXT: v_mov_b32_e32 v0, s0
484-
; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13
485453
; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
486454
; GFX8-NEXT: v_mov_b32_e32 v1, s1
487455
; GFX8-NEXT: v_mov_b32_e32 v2, s2
@@ -494,10 +462,7 @@ define amdgpu_kernel void @s_fneg_v2bf16_nonload(ptr addrspace(1) %out) #0 {
494462
; GFX9-NEXT: ;;#ASMSTART
495463
; GFX9-NEXT: ; def s2
496464
; GFX9-NEXT: ;;#ASMEND
497-
; GFX9-NEXT: s_xor_b32 s3, s2, 0x8000
498-
; GFX9-NEXT: s_lshr_b32 s2, s2, 16
499-
; GFX9-NEXT: s_xor_b32 s2, s2, 0x8000
500-
; GFX9-NEXT: s_pack_ll_b32_b16 s2, s3, s2
465+
; GFX9-NEXT: s_xor_b32 s2, s2, 0x80008000
501466
; GFX9-NEXT: v_mov_b32_e32 v0, 0
502467
; GFX9-NEXT: v_mov_b32_e32 v1, s2
503468
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
@@ -510,11 +475,8 @@ define amdgpu_kernel void @s_fneg_v2bf16_nonload(ptr addrspace(1) %out) #0 {
510475
; GFX11-NEXT: ;;#ASMSTART
511476
; GFX11-NEXT: ; def s2
512477
; GFX11-NEXT: ;;#ASMEND
513-
; GFX11-NEXT: s_lshr_b32 s3, s2, 16
514-
; GFX11-NEXT: s_xor_b32 s2, s2, 0x8000
515-
; GFX11-NEXT: s_xor_b32 s3, s3, 0x8000
516-
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
517-
; GFX11-NEXT: s_pack_ll_b32_b16 s2, s2, s3
478+
; GFX11-NEXT: s_xor_b32 s2, s2, 0x80008000
479+
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
518480
; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
519481
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
520482
; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
@@ -561,59 +523,34 @@ define amdgpu_kernel void @v_fneg_v2bf16(ptr addrspace(1) %out, ptr addrspace(1)
561523
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0
562524
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
563525
; GFX8-NEXT: flat_load_dword v2, v[0:1]
564-
; GFX8-NEXT: v_mov_b32_e32 v3, 0x8000
565526
; GFX8-NEXT: s_waitcnt vmcnt(0)
566-
; GFX8-NEXT: v_xor_b32_e32 v4, 0x8000, v2
567-
; GFX8-NEXT: v_xor_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
568-
; GFX8-NEXT: v_or_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
527+
; GFX8-NEXT: v_xor_b32_e32 v2, 0x80008000, v2
569528
; GFX8-NEXT: flat_store_dword v[0:1], v2
570529
; GFX8-NEXT: s_endpgm
571530
;
572531
; GFX9-LABEL: v_fneg_v2bf16:
573532
; GFX9: ; %bb.0:
574533
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x8
575534
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
576-
; GFX9-NEXT: s_mov_b32 s2, 0x8000
577535
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
578536
; GFX9-NEXT: global_load_dword v1, v0, s[0:1]
579537
; GFX9-NEXT: s_waitcnt vmcnt(0)
580-
; GFX9-NEXT: v_xor_b32_e32 v2, 0x8000, v1
581-
; GFX9-NEXT: v_xor_b32_sdwa v1, s2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
582-
; GFX9-NEXT: s_mov_b32 s2, 0x5040100
583-
; GFX9-NEXT: v_perm_b32 v1, v1, v2, s2
538+
; GFX9-NEXT: v_xor_b32_e32 v1, 0x80008000, v1
584539
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
585540
; GFX9-NEXT: s_endpgm
586541
;
587-
; GFX11-TRUE16-LABEL: v_fneg_v2bf16:
588-
; GFX11-TRUE16: ; %bb.0:
589-
; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
590-
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
591-
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
592-
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 2, v0
593-
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
594-
; GFX11-TRUE16-NEXT: global_load_b32 v1, v0, s[0:1]
595-
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
596-
; GFX11-TRUE16-NEXT: v_xor_b16 v1.l, 0x8000, v1.l
597-
; GFX11-TRUE16-NEXT: v_xor_b16 v1.h, 0x8000, v1.h
598-
; GFX11-TRUE16-NEXT: global_store_b32 v0, v1, s[0:1]
599-
; GFX11-TRUE16-NEXT: s_endpgm
600-
;
601-
; GFX11-FAKE16-LABEL: v_fneg_v2bf16:
602-
; GFX11-FAKE16: ; %bb.0:
603-
; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
604-
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
605-
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
606-
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 2, v0
607-
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
608-
; GFX11-FAKE16-NEXT: global_load_b32 v1, v0, s[0:1]
609-
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
610-
; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v1
611-
; GFX11-FAKE16-NEXT: v_xor_b32_e32 v1, 0x8000, v1
612-
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
613-
; GFX11-FAKE16-NEXT: v_xor_b32_e32 v2, 0x8000, v2
614-
; GFX11-FAKE16-NEXT: v_perm_b32 v1, v2, v1, 0x5040100
615-
; GFX11-FAKE16-NEXT: global_store_b32 v0, v1, s[0:1]
616-
; GFX11-FAKE16-NEXT: s_endpgm
542+
; GFX11-LABEL: v_fneg_v2bf16:
543+
; GFX11: ; %bb.0:
544+
; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
545+
; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
546+
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
547+
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
548+
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
549+
; GFX11-NEXT: global_load_b32 v1, v0, s[0:1]
550+
; GFX11-NEXT: s_waitcnt vmcnt(0)
551+
; GFX11-NEXT: v_xor_b32_e32 v1, 0x80008000, v1
552+
; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
553+
; GFX11-NEXT: s_endpgm
617554
%tid = call i32 @llvm.amdgcn.workitem.id.x()
618555
%gep.in = getelementptr inbounds <2 x bfloat>, ptr addrspace(1) %in, i32 %tid
619556
%gep.out = getelementptr inbounds <2 x bfloat>, ptr addrspace(1) %in, i32 %tid
@@ -651,12 +588,7 @@ define amdgpu_kernel void @fneg_free_v2bf16(ptr addrspace(1) %out, i32 %in) #0 {
651588
; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13
652589
; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
653590
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
654-
; GFX8-NEXT: s_xor_b32 s3, s2, 0x8000
655-
; GFX8-NEXT: s_lshr_b32 s2, s2, 16
656-
; GFX8-NEXT: s_xor_b32 s2, s2, 0x8000
657-
; GFX8-NEXT: s_and_b32 s3, 0xffff, s3
658-
; GFX8-NEXT: s_lshl_b32 s2, s2, 16
659-
; GFX8-NEXT: s_or_b32 s2, s3, s2
591+
; GFX8-NEXT: s_xor_b32 s2, s2, 0x80008000
660592
; GFX8-NEXT: v_mov_b32_e32 v0, s0
661593
; GFX8-NEXT: v_mov_b32_e32 v1, s1
662594
; GFX8-NEXT: v_mov_b32_e32 v2, s2
@@ -669,44 +601,22 @@ define amdgpu_kernel void @fneg_free_v2bf16(ptr addrspace(1) %out, i32 %in) #0 {
669601
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
670602
; GFX9-NEXT: v_mov_b32_e32 v0, 0
671603
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
672-
; GFX9-NEXT: s_xor_b32 s3, s2, 0x8000
673-
; GFX9-NEXT: s_lshr_b32 s2, s2, 16
674-
; GFX9-NEXT: s_xor_b32 s2, s2, 0x8000
675-
; GFX9-NEXT: s_pack_ll_b32_b16 s2, s3, s2
604+
; GFX9-NEXT: s_xor_b32 s2, s2, 0x80008000
676605
; GFX9-NEXT: v_mov_b32_e32 v1, s2
677606
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
678607
; GFX9-NEXT: s_endpgm
679608
;
680-
; GFX11-TRUE16-LABEL: fneg_free_v2bf16:
681-
; GFX11-TRUE16: ; %bb.0:
682-
; GFX11-TRUE16-NEXT: s_clause 0x1
683-
; GFX11-TRUE16-NEXT: s_load_b32 s2, s[4:5], 0x8
684-
; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
685-
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
686-
; GFX11-TRUE16-NEXT: s_mov_b32 s3, s2
687-
; GFX11-TRUE16-NEXT: s_lshr_b32 s2, s2, 16
688-
; GFX11-TRUE16-NEXT: s_xor_b32 s3, s3, 0x8000
689-
; GFX11-TRUE16-NEXT: s_xor_b32 s2, s2, 0x8000
690-
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
691-
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s3, s2
692-
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
693-
; GFX11-TRUE16-NEXT: global_store_b32 v0, v1, s[0:1]
694-
; GFX11-TRUE16-NEXT: s_endpgm
695-
;
696-
; GFX11-FAKE16-LABEL: fneg_free_v2bf16:
697-
; GFX11-FAKE16: ; %bb.0:
698-
; GFX11-FAKE16-NEXT: s_clause 0x1
699-
; GFX11-FAKE16-NEXT: s_load_b32 s2, s[4:5], 0x8
700-
; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
701-
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
702-
; GFX11-FAKE16-NEXT: s_lshr_b32 s3, s2, 16
703-
; GFX11-FAKE16-NEXT: s_xor_b32 s2, s2, 0x8000
704-
; GFX11-FAKE16-NEXT: s_xor_b32 s3, s3, 0x8000
705-
; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
706-
; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s2, s3
707-
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
708-
; GFX11-FAKE16-NEXT: global_store_b32 v0, v1, s[0:1]
709-
; GFX11-FAKE16-NEXT: s_endpgm
609+
; GFX11-LABEL: fneg_free_v2bf16:
610+
; GFX11: ; %bb.0:
611+
; GFX11-NEXT: s_clause 0x1
612+
; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x8
613+
; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
614+
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
615+
; GFX11-NEXT: s_xor_b32 s2, s2, 0x80008000
616+
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
617+
; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
618+
; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
619+
; GFX11-NEXT: s_endpgm
710620
%bc = bitcast i32 %in to <2 x bfloat>
711621
%fsub = fsub <2 x bfloat> <bfloat -0.0, bfloat -0.0>, %bc
712622
store <2 x bfloat> %fsub, ptr addrspace(1) %out
@@ -754,12 +664,12 @@ define amdgpu_kernel void @v_fneg_fold_v2bf16(ptr addrspace(1) %out, ptr addrspa
754664
; GFX8-NEXT: v_mov_b32_e32 v0, s0
755665
; GFX8-NEXT: v_mov_b32_e32 v1, s1
756666
; GFX8-NEXT: s_waitcnt vmcnt(0)
757-
; GFX8-NEXT: v_xor_b32_sdwa v4, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
758-
; GFX8-NEXT: v_xor_b32_sdwa v3, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
759-
; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2
760-
; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
761-
; GFX8-NEXT: v_mul_f32_e32 v3, v3, v5
762-
; GFX8-NEXT: v_mul_f32_e32 v2, v4, v2
667+
; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2
668+
; GFX8-NEXT: v_xor_b32_sdwa v5, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
669+
; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
670+
; GFX8-NEXT: v_xor_b32_sdwa v2, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
671+
; GFX8-NEXT: v_mul_f32_e32 v3, v5, v4
672+
; GFX8-NEXT: v_mul_f32_e32 v2, v2, v6
763673
; GFX8-NEXT: v_bfe_u32 v4, v3, 16, 1
764674
; GFX8-NEXT: v_bfe_u32 v6, v2, 16, 1
765675
; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v3
@@ -786,22 +696,22 @@ define amdgpu_kernel void @v_fneg_fold_v2bf16(ptr addrspace(1) %out, ptr addrspa
786696
; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
787697
; GFX9-NEXT: s_mov_b32 s2, 0x8000
788698
; GFX9-NEXT: s_waitcnt vmcnt(0)
699+
; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v1
789700
; GFX9-NEXT: v_xor_b32_sdwa v4, v1, s2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
790-
; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v1
791-
; GFX9-NEXT: v_xor_b32_sdwa v3, v1, s2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
792-
; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
793-
; GFX9-NEXT: v_mul_f32_e32 v4, v4, v5
794-
; GFX9-NEXT: v_mul_f32_e32 v1, v3, v1
795-
; GFX9-NEXT: v_bfe_u32 v3, v4, 16, 1
701+
; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
702+
; GFX9-NEXT: v_xor_b32_sdwa v1, v1, s2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
703+
; GFX9-NEXT: v_mul_f32_e32 v3, v4, v3
704+
; GFX9-NEXT: v_mul_f32_e32 v1, v1, v5
705+
; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1
796706
; GFX9-NEXT: v_bfe_u32 v6, v1, 16, 1
797-
; GFX9-NEXT: v_add_u32_e32 v3, v3, v4
798-
; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v4
707+
; GFX9-NEXT: v_add_u32_e32 v4, v4, v3
708+
; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v3
799709
; GFX9-NEXT: v_add_u32_e32 v6, v6, v1
800-
; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3
801-
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
710+
; GFX9-NEXT: v_add_u32_e32 v4, 0x7fff, v4
711+
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
802712
; GFX9-NEXT: v_or_b32_e32 v7, 0x400000, v1
803713
; GFX9-NEXT: v_add_u32_e32 v6, 0x7fff, v6
804-
; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc
714+
; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
805715
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
806716
; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v7, vcc
807717
; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1
@@ -1024,10 +934,9 @@ define amdgpu_kernel void @v_extract_fneg_no_fold_v2bf16(ptr addrspace(1) %in) #
1024934
; GFX8-NEXT: v_mov_b32_e32 v0, s0
1025935
; GFX8-NEXT: v_mov_b32_e32 v1, s1
1026936
; GFX8-NEXT: flat_load_dword v0, v[0:1]
1027-
; GFX8-NEXT: v_mov_b32_e32 v1, 0x8000
1028937
; GFX8-NEXT: s_waitcnt vmcnt(0)
1029-
; GFX8-NEXT: v_xor_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
1030-
; GFX8-NEXT: v_xor_b32_e32 v0, 0x8000, v0
938+
; GFX8-NEXT: v_xor_b32_e32 v0, 0x80008000, v0
939+
; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v0
1031940
; GFX8-NEXT: flat_store_short v[0:1], v0
1032941
; GFX8-NEXT: s_waitcnt vmcnt(0)
1033942
; GFX8-NEXT: flat_store_short v[0:1], v1
@@ -1040,13 +949,11 @@ define amdgpu_kernel void @v_extract_fneg_no_fold_v2bf16(ptr addrspace(1) %in) #
1040949
; GFX9-NEXT: v_mov_b32_e32 v0, 0
1041950
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1042951
; GFX9-NEXT: global_load_dword v0, v0, s[0:1]
1043-
; GFX9-NEXT: s_mov_b32 s0, 0x8000
1044952
; GFX9-NEXT: s_waitcnt vmcnt(0)
1045-
; GFX9-NEXT: v_xor_b32_sdwa v1, v0, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
1046-
; GFX9-NEXT: v_xor_b32_e32 v0, 0x8000, v0
953+
; GFX9-NEXT: v_xor_b32_e32 v0, 0x80008000, v0
1047954
; GFX9-NEXT: global_store_short v[0:1], v0, off
1048955
; GFX9-NEXT: s_waitcnt vmcnt(0)
1049-
; GFX9-NEXT: global_store_short v[0:1], v1, off
956+
; GFX9-NEXT: global_store_short_d16_hi v[0:1], v0, off
1050957
; GFX9-NEXT: s_waitcnt vmcnt(0)
1051958
; GFX9-NEXT: s_endpgm
1052959
;
@@ -1057,13 +964,10 @@ define amdgpu_kernel void @v_extract_fneg_no_fold_v2bf16(ptr addrspace(1) %in) #
1057964
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1058965
; GFX11-NEXT: global_load_b32 v0, v0, s[0:1]
1059966
; GFX11-NEXT: s_waitcnt vmcnt(0)
1060-
; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0
1061-
; GFX11-NEXT: v_xor_b32_e32 v0, 0x8000, v0
1062-
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
1063-
; GFX11-NEXT: v_xor_b32_e32 v1, 0x8000, v1
967+
; GFX11-NEXT: v_xor_b32_e32 v0, 0x80008000, v0
1064968
; GFX11-NEXT: global_store_b16 v[0:1], v0, off dlc
1065969
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
1066-
; GFX11-NEXT: global_store_b16 v[0:1], v1, off dlc
970+
; GFX11-NEXT: global_store_d16_hi_b16 v[0:1], v0, off dlc
1067971
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
1068972
; GFX11-NEXT: s_endpgm
1069973
%val = load <2 x bfloat>, ptr addrspace(1) %in

0 commit comments

Comments (0)