@@ -383,12 +383,7 @@ define amdgpu_kernel void @s_fneg_v2bf16(ptr addrspace(1) %out, <2 x bfloat> %in
383
383
; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13
384
384
; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
385
385
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
386
- ; GFX8-NEXT: s_xor_b32 s3, s2, 0x8000
387
- ; GFX8-NEXT: s_lshr_b32 s2, s2, 16
388
- ; GFX8-NEXT: s_xor_b32 s2, s2, 0x8000
389
- ; GFX8-NEXT: s_and_b32 s3, 0xffff, s3
390
- ; GFX8-NEXT: s_lshl_b32 s2, s2, 16
391
- ; GFX8-NEXT: s_or_b32 s2, s3, s2
386
+ ; GFX8-NEXT: s_xor_b32 s2, s2, 0x80008000
392
387
; GFX8-NEXT: v_mov_b32_e32 v0, s0
393
388
; GFX8-NEXT: v_mov_b32_e32 v1, s1
394
389
; GFX8-NEXT: v_mov_b32_e32 v2, s2
@@ -401,44 +396,22 @@ define amdgpu_kernel void @s_fneg_v2bf16(ptr addrspace(1) %out, <2 x bfloat> %in
401
396
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
402
397
; GFX9-NEXT: v_mov_b32_e32 v0, 0
403
398
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
404
- ; GFX9-NEXT: s_xor_b32 s3, s2, 0x8000
405
- ; GFX9-NEXT: s_lshr_b32 s2, s2, 16
406
- ; GFX9-NEXT: s_xor_b32 s2, s2, 0x8000
407
- ; GFX9-NEXT: s_pack_ll_b32_b16 s2, s3, s2
399
+ ; GFX9-NEXT: s_xor_b32 s2, s2, 0x80008000
408
400
; GFX9-NEXT: v_mov_b32_e32 v1, s2
409
401
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
410
402
; GFX9-NEXT: s_endpgm
411
403
;
412
- ; GFX11-TRUE16-LABEL: s_fneg_v2bf16:
413
- ; GFX11-TRUE16: ; %bb.0:
414
- ; GFX11-TRUE16-NEXT: s_clause 0x1
415
- ; GFX11-TRUE16-NEXT: s_load_b32 s2, s[4:5], 0x8
416
- ; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
417
- ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
418
- ; GFX11-TRUE16-NEXT: s_mov_b32 s3, s2
419
- ; GFX11-TRUE16-NEXT: s_lshr_b32 s2, s2, 16
420
- ; GFX11-TRUE16-NEXT: s_xor_b32 s3, s3, 0x8000
421
- ; GFX11-TRUE16-NEXT: s_xor_b32 s2, s2, 0x8000
422
- ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
423
- ; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s3, s2
424
- ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
425
- ; GFX11-TRUE16-NEXT: global_store_b32 v0, v1, s[0:1]
426
- ; GFX11-TRUE16-NEXT: s_endpgm
427
- ;
428
- ; GFX11-FAKE16-LABEL: s_fneg_v2bf16:
429
- ; GFX11-FAKE16: ; %bb.0:
430
- ; GFX11-FAKE16-NEXT: s_clause 0x1
431
- ; GFX11-FAKE16-NEXT: s_load_b32 s2, s[4:5], 0x8
432
- ; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
433
- ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
434
- ; GFX11-FAKE16-NEXT: s_lshr_b32 s3, s2, 16
435
- ; GFX11-FAKE16-NEXT: s_xor_b32 s2, s2, 0x8000
436
- ; GFX11-FAKE16-NEXT: s_xor_b32 s3, s3, 0x8000
437
- ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
438
- ; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s2, s3
439
- ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
440
- ; GFX11-FAKE16-NEXT: global_store_b32 v0, v1, s[0:1]
441
- ; GFX11-FAKE16-NEXT: s_endpgm
404
+ ; GFX11-LABEL: s_fneg_v2bf16:
405
+ ; GFX11: ; %bb.0:
406
+ ; GFX11-NEXT: s_clause 0x1
407
+ ; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x8
408
+ ; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
409
+ ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
410
+ ; GFX11-NEXT: s_xor_b32 s2, s2, 0x80008000
411
+ ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
412
+ ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
413
+ ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
414
+ ; GFX11-NEXT: s_endpgm
442
415
%fneg = fsub <2 x bfloat> <bfloat -0 .0 , bfloat -0 .0 >, %in
443
416
store <2 x bfloat> %fneg , ptr addrspace (1 ) %out
444
417
ret void
@@ -473,15 +446,10 @@ define amdgpu_kernel void @s_fneg_v2bf16_nonload(ptr addrspace(1) %out) #0 {
473
446
; GFX8-NEXT: ;;#ASMSTART
474
447
; GFX8-NEXT: ; def s2
475
448
; GFX8-NEXT: ;;#ASMEND
476
- ; GFX8-NEXT: s_xor_b32 s3, s2, 0x8000
477
- ; GFX8-NEXT: s_lshr_b32 s2, s2, 16
478
- ; GFX8-NEXT: s_xor_b32 s2, s2, 0x8000
479
- ; GFX8-NEXT: s_and_b32 s3, 0xffff, s3
480
- ; GFX8-NEXT: s_lshl_b32 s2, s2, 16
481
- ; GFX8-NEXT: s_or_b32 s2, s3, s2
449
+ ; GFX8-NEXT: s_xor_b32 s2, s2, 0x80008000
450
+ ; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13
482
451
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
483
452
; GFX8-NEXT: v_mov_b32_e32 v0, s0
484
- ; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13
485
453
; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
486
454
; GFX8-NEXT: v_mov_b32_e32 v1, s1
487
455
; GFX8-NEXT: v_mov_b32_e32 v2, s2
@@ -494,10 +462,7 @@ define amdgpu_kernel void @s_fneg_v2bf16_nonload(ptr addrspace(1) %out) #0 {
494
462
; GFX9-NEXT: ;;#ASMSTART
495
463
; GFX9-NEXT: ; def s2
496
464
; GFX9-NEXT: ;;#ASMEND
497
- ; GFX9-NEXT: s_xor_b32 s3, s2, 0x8000
498
- ; GFX9-NEXT: s_lshr_b32 s2, s2, 16
499
- ; GFX9-NEXT: s_xor_b32 s2, s2, 0x8000
500
- ; GFX9-NEXT: s_pack_ll_b32_b16 s2, s3, s2
465
+ ; GFX9-NEXT: s_xor_b32 s2, s2, 0x80008000
501
466
; GFX9-NEXT: v_mov_b32_e32 v0, 0
502
467
; GFX9-NEXT: v_mov_b32_e32 v1, s2
503
468
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
@@ -510,11 +475,8 @@ define amdgpu_kernel void @s_fneg_v2bf16_nonload(ptr addrspace(1) %out) #0 {
510
475
; GFX11-NEXT: ;;#ASMSTART
511
476
; GFX11-NEXT: ; def s2
512
477
; GFX11-NEXT: ;;#ASMEND
513
- ; GFX11-NEXT: s_lshr_b32 s3, s2, 16
514
- ; GFX11-NEXT: s_xor_b32 s2, s2, 0x8000
515
- ; GFX11-NEXT: s_xor_b32 s3, s3, 0x8000
516
- ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
517
- ; GFX11-NEXT: s_pack_ll_b32_b16 s2, s2, s3
478
+ ; GFX11-NEXT: s_xor_b32 s2, s2, 0x80008000
479
+ ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
518
480
; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
519
481
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
520
482
; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
@@ -561,59 +523,34 @@ define amdgpu_kernel void @v_fneg_v2bf16(ptr addrspace(1) %out, ptr addrspace(1)
561
523
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0
562
524
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
563
525
; GFX8-NEXT: flat_load_dword v2, v[0:1]
564
- ; GFX8-NEXT: v_mov_b32_e32 v3, 0x8000
565
526
; GFX8-NEXT: s_waitcnt vmcnt(0)
566
- ; GFX8-NEXT: v_xor_b32_e32 v4, 0x8000, v2
567
- ; GFX8-NEXT: v_xor_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
568
- ; GFX8-NEXT: v_or_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
527
+ ; GFX8-NEXT: v_xor_b32_e32 v2, 0x80008000, v2
569
528
; GFX8-NEXT: flat_store_dword v[0:1], v2
570
529
; GFX8-NEXT: s_endpgm
571
530
;
572
531
; GFX9-LABEL: v_fneg_v2bf16:
573
532
; GFX9: ; %bb.0:
574
533
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x8
575
534
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
576
- ; GFX9-NEXT: s_mov_b32 s2, 0x8000
577
535
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
578
536
; GFX9-NEXT: global_load_dword v1, v0, s[0:1]
579
537
; GFX9-NEXT: s_waitcnt vmcnt(0)
580
- ; GFX9-NEXT: v_xor_b32_e32 v2, 0x8000, v1
581
- ; GFX9-NEXT: v_xor_b32_sdwa v1, s2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
582
- ; GFX9-NEXT: s_mov_b32 s2, 0x5040100
583
- ; GFX9-NEXT: v_perm_b32 v1, v1, v2, s2
538
+ ; GFX9-NEXT: v_xor_b32_e32 v1, 0x80008000, v1
584
539
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
585
540
; GFX9-NEXT: s_endpgm
586
541
;
587
- ; GFX11-TRUE16-LABEL: v_fneg_v2bf16:
588
- ; GFX11-TRUE16: ; %bb.0:
589
- ; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
590
- ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
591
- ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
592
- ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 2, v0
593
- ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
594
- ; GFX11-TRUE16-NEXT: global_load_b32 v1, v0, s[0:1]
595
- ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
596
- ; GFX11-TRUE16-NEXT: v_xor_b16 v1.l, 0x8000, v1.l
597
- ; GFX11-TRUE16-NEXT: v_xor_b16 v1.h, 0x8000, v1.h
598
- ; GFX11-TRUE16-NEXT: global_store_b32 v0, v1, s[0:1]
599
- ; GFX11-TRUE16-NEXT: s_endpgm
600
- ;
601
- ; GFX11-FAKE16-LABEL: v_fneg_v2bf16:
602
- ; GFX11-FAKE16: ; %bb.0:
603
- ; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
604
- ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
605
- ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
606
- ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 2, v0
607
- ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
608
- ; GFX11-FAKE16-NEXT: global_load_b32 v1, v0, s[0:1]
609
- ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
610
- ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v1
611
- ; GFX11-FAKE16-NEXT: v_xor_b32_e32 v1, 0x8000, v1
612
- ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
613
- ; GFX11-FAKE16-NEXT: v_xor_b32_e32 v2, 0x8000, v2
614
- ; GFX11-FAKE16-NEXT: v_perm_b32 v1, v2, v1, 0x5040100
615
- ; GFX11-FAKE16-NEXT: global_store_b32 v0, v1, s[0:1]
616
- ; GFX11-FAKE16-NEXT: s_endpgm
542
+ ; GFX11-LABEL: v_fneg_v2bf16:
543
+ ; GFX11: ; %bb.0:
544
+ ; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
545
+ ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
546
+ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
547
+ ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
548
+ ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
549
+ ; GFX11-NEXT: global_load_b32 v1, v0, s[0:1]
550
+ ; GFX11-NEXT: s_waitcnt vmcnt(0)
551
+ ; GFX11-NEXT: v_xor_b32_e32 v1, 0x80008000, v1
552
+ ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
553
+ ; GFX11-NEXT: s_endpgm
617
554
%tid = call i32 @llvm.amdgcn.workitem.id.x ()
618
555
%gep.in = getelementptr inbounds <2 x bfloat>, ptr addrspace (1 ) %in , i32 %tid
619
556
%gep.out = getelementptr inbounds <2 x bfloat>, ptr addrspace (1 ) %in , i32 %tid
@@ -651,12 +588,7 @@ define amdgpu_kernel void @fneg_free_v2bf16(ptr addrspace(1) %out, i32 %in) #0 {
651
588
; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13
652
589
; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
653
590
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
654
- ; GFX8-NEXT: s_xor_b32 s3, s2, 0x8000
655
- ; GFX8-NEXT: s_lshr_b32 s2, s2, 16
656
- ; GFX8-NEXT: s_xor_b32 s2, s2, 0x8000
657
- ; GFX8-NEXT: s_and_b32 s3, 0xffff, s3
658
- ; GFX8-NEXT: s_lshl_b32 s2, s2, 16
659
- ; GFX8-NEXT: s_or_b32 s2, s3, s2
591
+ ; GFX8-NEXT: s_xor_b32 s2, s2, 0x80008000
660
592
; GFX8-NEXT: v_mov_b32_e32 v0, s0
661
593
; GFX8-NEXT: v_mov_b32_e32 v1, s1
662
594
; GFX8-NEXT: v_mov_b32_e32 v2, s2
@@ -669,44 +601,22 @@ define amdgpu_kernel void @fneg_free_v2bf16(ptr addrspace(1) %out, i32 %in) #0 {
669
601
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
670
602
; GFX9-NEXT: v_mov_b32_e32 v0, 0
671
603
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
672
- ; GFX9-NEXT: s_xor_b32 s3, s2, 0x8000
673
- ; GFX9-NEXT: s_lshr_b32 s2, s2, 16
674
- ; GFX9-NEXT: s_xor_b32 s2, s2, 0x8000
675
- ; GFX9-NEXT: s_pack_ll_b32_b16 s2, s3, s2
604
+ ; GFX9-NEXT: s_xor_b32 s2, s2, 0x80008000
676
605
; GFX9-NEXT: v_mov_b32_e32 v1, s2
677
606
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
678
607
; GFX9-NEXT: s_endpgm
679
608
;
680
- ; GFX11-TRUE16-LABEL: fneg_free_v2bf16:
681
- ; GFX11-TRUE16: ; %bb.0:
682
- ; GFX11-TRUE16-NEXT: s_clause 0x1
683
- ; GFX11-TRUE16-NEXT: s_load_b32 s2, s[4:5], 0x8
684
- ; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
685
- ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
686
- ; GFX11-TRUE16-NEXT: s_mov_b32 s3, s2
687
- ; GFX11-TRUE16-NEXT: s_lshr_b32 s2, s2, 16
688
- ; GFX11-TRUE16-NEXT: s_xor_b32 s3, s3, 0x8000
689
- ; GFX11-TRUE16-NEXT: s_xor_b32 s2, s2, 0x8000
690
- ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
691
- ; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s3, s2
692
- ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
693
- ; GFX11-TRUE16-NEXT: global_store_b32 v0, v1, s[0:1]
694
- ; GFX11-TRUE16-NEXT: s_endpgm
695
- ;
696
- ; GFX11-FAKE16-LABEL: fneg_free_v2bf16:
697
- ; GFX11-FAKE16: ; %bb.0:
698
- ; GFX11-FAKE16-NEXT: s_clause 0x1
699
- ; GFX11-FAKE16-NEXT: s_load_b32 s2, s[4:5], 0x8
700
- ; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
701
- ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
702
- ; GFX11-FAKE16-NEXT: s_lshr_b32 s3, s2, 16
703
- ; GFX11-FAKE16-NEXT: s_xor_b32 s2, s2, 0x8000
704
- ; GFX11-FAKE16-NEXT: s_xor_b32 s3, s3, 0x8000
705
- ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
706
- ; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s2, s3
707
- ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
708
- ; GFX11-FAKE16-NEXT: global_store_b32 v0, v1, s[0:1]
709
- ; GFX11-FAKE16-NEXT: s_endpgm
609
+ ; GFX11-LABEL: fneg_free_v2bf16:
610
+ ; GFX11: ; %bb.0:
611
+ ; GFX11-NEXT: s_clause 0x1
612
+ ; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x8
613
+ ; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
614
+ ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
615
+ ; GFX11-NEXT: s_xor_b32 s2, s2, 0x80008000
616
+ ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
617
+ ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
618
+ ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
619
+ ; GFX11-NEXT: s_endpgm
710
620
%bc = bitcast i32 %in to <2 x bfloat>
711
621
%fsub = fsub <2 x bfloat> <bfloat -0 .0 , bfloat -0 .0 >, %bc
712
622
store <2 x bfloat> %fsub , ptr addrspace (1 ) %out
@@ -754,12 +664,12 @@ define amdgpu_kernel void @v_fneg_fold_v2bf16(ptr addrspace(1) %out, ptr addrspa
754
664
; GFX8-NEXT: v_mov_b32_e32 v0, s0
755
665
; GFX8-NEXT: v_mov_b32_e32 v1, s1
756
666
; GFX8-NEXT: s_waitcnt vmcnt(0)
757
- ; GFX8-NEXT: v_xor_b32_sdwa v4, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
758
- ; GFX8-NEXT: v_xor_b32_sdwa v3 , v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
759
- ; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16 , v2
760
- ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
761
- ; GFX8-NEXT: v_mul_f32_e32 v3, v3, v5
762
- ; GFX8-NEXT: v_mul_f32_e32 v2, v4, v2
667
+ ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2
668
+ ; GFX8-NEXT: v_xor_b32_sdwa v5 , v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
669
+ ; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000 , v2
670
+ ; GFX8-NEXT: v_xor_b32_sdwa v2, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
671
+ ; GFX8-NEXT: v_mul_f32_e32 v3, v5, v4
672
+ ; GFX8-NEXT: v_mul_f32_e32 v2, v2, v6
763
673
; GFX8-NEXT: v_bfe_u32 v4, v3, 16, 1
764
674
; GFX8-NEXT: v_bfe_u32 v6, v2, 16, 1
765
675
; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v3
@@ -786,22 +696,22 @@ define amdgpu_kernel void @v_fneg_fold_v2bf16(ptr addrspace(1) %out, ptr addrspa
786
696
; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
787
697
; GFX9-NEXT: s_mov_b32 s2, 0x8000
788
698
; GFX9-NEXT: s_waitcnt vmcnt(0)
699
+ ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v1
789
700
; GFX9-NEXT: v_xor_b32_sdwa v4, v1, s2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
790
- ; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v1
791
- ; GFX9-NEXT: v_xor_b32_sdwa v3, v1, s2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
792
- ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
793
- ; GFX9-NEXT: v_mul_f32_e32 v4, v4, v5
794
- ; GFX9-NEXT: v_mul_f32_e32 v1, v3, v1
795
- ; GFX9-NEXT: v_bfe_u32 v3, v4, 16, 1
701
+ ; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
702
+ ; GFX9-NEXT: v_xor_b32_sdwa v1, v1, s2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
703
+ ; GFX9-NEXT: v_mul_f32_e32 v3, v4, v3
704
+ ; GFX9-NEXT: v_mul_f32_e32 v1, v1, v5
705
+ ; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1
796
706
; GFX9-NEXT: v_bfe_u32 v6, v1, 16, 1
797
- ; GFX9-NEXT: v_add_u32_e32 v3, v3, v4
798
- ; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v4
707
+ ; GFX9-NEXT: v_add_u32_e32 v4, v4, v3
708
+ ; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v3
799
709
; GFX9-NEXT: v_add_u32_e32 v6, v6, v1
800
- ; GFX9-NEXT: v_add_u32_e32 v3 , 0x7fff, v3
801
- ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
710
+ ; GFX9-NEXT: v_add_u32_e32 v4 , 0x7fff, v4
711
+ ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
802
712
; GFX9-NEXT: v_or_b32_e32 v7, 0x400000, v1
803
713
; GFX9-NEXT: v_add_u32_e32 v6, 0x7fff, v6
804
- ; GFX9-NEXT: v_cndmask_b32_e32 v3, v3 , v5, vcc
714
+ ; GFX9-NEXT: v_cndmask_b32_e32 v3, v4 , v5, vcc
805
715
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
806
716
; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v7, vcc
807
717
; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1
@@ -1024,10 +934,9 @@ define amdgpu_kernel void @v_extract_fneg_no_fold_v2bf16(ptr addrspace(1) %in) #
1024
934
; GFX8-NEXT: v_mov_b32_e32 v0, s0
1025
935
; GFX8-NEXT: v_mov_b32_e32 v1, s1
1026
936
; GFX8-NEXT: flat_load_dword v0, v[0:1]
1027
- ; GFX8-NEXT: v_mov_b32_e32 v1, 0x8000
1028
937
; GFX8-NEXT: s_waitcnt vmcnt(0)
1029
- ; GFX8-NEXT: v_xor_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
1030
- ; GFX8-NEXT: v_xor_b32_e32 v0, 0x8000 , v0
938
+ ; GFX8-NEXT: v_xor_b32_e32 v0, 0x80008000, v0
939
+ ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16 , v0
1031
940
; GFX8-NEXT: flat_store_short v[0:1], v0
1032
941
; GFX8-NEXT: s_waitcnt vmcnt(0)
1033
942
; GFX8-NEXT: flat_store_short v[0:1], v1
@@ -1040,13 +949,11 @@ define amdgpu_kernel void @v_extract_fneg_no_fold_v2bf16(ptr addrspace(1) %in) #
1040
949
; GFX9-NEXT: v_mov_b32_e32 v0, 0
1041
950
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1042
951
; GFX9-NEXT: global_load_dword v0, v0, s[0:1]
1043
- ; GFX9-NEXT: s_mov_b32 s0, 0x8000
1044
952
; GFX9-NEXT: s_waitcnt vmcnt(0)
1045
- ; GFX9-NEXT: v_xor_b32_sdwa v1, v0, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
1046
- ; GFX9-NEXT: v_xor_b32_e32 v0, 0x8000, v0
953
+ ; GFX9-NEXT: v_xor_b32_e32 v0, 0x80008000, v0
1047
954
; GFX9-NEXT: global_store_short v[0:1], v0, off
1048
955
; GFX9-NEXT: s_waitcnt vmcnt(0)
1049
- ; GFX9-NEXT: global_store_short v[0:1], v1 , off
956
+ ; GFX9-NEXT: global_store_short_d16_hi v[0:1], v0 , off
1050
957
; GFX9-NEXT: s_waitcnt vmcnt(0)
1051
958
; GFX9-NEXT: s_endpgm
1052
959
;
@@ -1057,13 +964,10 @@ define amdgpu_kernel void @v_extract_fneg_no_fold_v2bf16(ptr addrspace(1) %in) #
1057
964
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1058
965
; GFX11-NEXT: global_load_b32 v0, v0, s[0:1]
1059
966
; GFX11-NEXT: s_waitcnt vmcnt(0)
1060
- ; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0
1061
- ; GFX11-NEXT: v_xor_b32_e32 v0, 0x8000, v0
1062
- ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
1063
- ; GFX11-NEXT: v_xor_b32_e32 v1, 0x8000, v1
967
+ ; GFX11-NEXT: v_xor_b32_e32 v0, 0x80008000, v0
1064
968
; GFX11-NEXT: global_store_b16 v[0:1], v0, off dlc
1065
969
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
1066
- ; GFX11-NEXT: global_store_b16 v[0:1], v1 , off dlc
970
+ ; GFX11-NEXT: global_store_d16_hi_b16 v[0:1], v0 , off dlc
1067
971
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
1068
972
; GFX11-NEXT: s_endpgm
1069
973
%val = load <2 x bfloat>, ptr addrspace (1 ) %in
0 commit comments