@@ -463,39 +463,41 @@ define amdgpu_kernel void @test_smul24_i64_square(ptr addrspace(1) %out, i32 %a,
463
463
define amdgpu_kernel void @test_smul24_i33 (ptr addrspace (1 ) %out , i33 %a , i33 %b ) #0 {
464
464
; SI-LABEL: test_smul24_i33:
465
465
; SI: ; %bb.0: ; %entry
466
- ; SI-NEXT: s_load_dword s6, s[4:5], 0xd
467
- ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
468
- ; SI-NEXT: s_load_dword s4, s[4:5], 0xb
469
- ; SI-NEXT: s_mov_b32 s3, 0xf000
470
- ; SI-NEXT: s_mov_b32 s2, -1
466
+ ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
467
+ ; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd
468
+ ; SI-NEXT: s_mov_b32 s7, 0xf000
469
+ ; SI-NEXT: s_mov_b32 s6, -1
471
470
; SI-NEXT: s_waitcnt lgkmcnt(0)
472
- ; SI-NEXT: s_bfe_i32 s5, s6, 0x180000
473
- ; SI-NEXT: s_bfe_i32 s4, s4, 0x180000
474
- ; SI-NEXT: v_mov_b32_e32 v0, s5
475
- ; SI-NEXT: s_mul_i32 s5, s4, s5
476
- ; SI-NEXT: v_mul_hi_i32_i24_e32 v1, s4, v0
477
- ; SI-NEXT: v_mov_b32_e32 v0, s5
471
+ ; SI-NEXT: s_mov_b32 s4, s0
472
+ ; SI-NEXT: s_mov_b32 s5, s1
473
+ ; SI-NEXT: s_bfe_i32 s0, s8, 0x180000
474
+ ; SI-NEXT: s_bfe_i32 s1, s2, 0x180000
475
+ ; SI-NEXT: v_mov_b32_e32 v0, s0
476
+ ; SI-NEXT: s_mul_i32 s0, s1, s0
477
+ ; SI-NEXT: v_mul_hi_i32_i24_e32 v1, s1, v0
478
+ ; SI-NEXT: v_mov_b32_e32 v0, s0
478
479
; SI-NEXT: v_lshl_b64 v[0:1], v[0:1], 31
479
480
; SI-NEXT: v_ashr_i64 v[0:1], v[0:1], 31
480
- ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3 ], 0
481
+ ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7 ], 0
481
482
; SI-NEXT: s_endpgm
482
483
;
483
484
; VI-LABEL: test_smul24_i33:
484
485
; VI: ; %bb.0: ; %entry
485
- ; VI-NEXT: s_load_dword s2, s[4:5], 0x2c
486
- ; VI-NEXT: s_load_dword s3, s[4:5], 0x34
487
- ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
486
+ ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
487
+ ; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
488
+ ; VI-NEXT: s_mov_b32 s7, 0xf000
489
+ ; VI-NEXT: s_mov_b32 s6, -1
488
490
; VI-NEXT: s_waitcnt lgkmcnt(0)
489
491
; VI-NEXT: s_bfe_i32 s2, s2, 0x180000
490
- ; VI-NEXT: s_bfe_i32 s3, s3 , 0x180000
492
+ ; VI-NEXT: s_bfe_i32 s3, s4 , 0x180000
491
493
; VI-NEXT: v_mov_b32_e32 v0, s3
492
494
; VI-NEXT: v_mul_hi_i32_i24_e32 v1, s2, v0
493
495
; VI-NEXT: v_mul_i32_i24_e32 v0, s2, v0
494
496
; VI-NEXT: v_lshlrev_b64 v[0:1], 31, v[0:1]
495
- ; VI-NEXT: s_mov_b32 s3, 0xf000
497
+ ; VI-NEXT: s_mov_b32 s4, s0
496
498
; VI-NEXT: v_ashrrev_i64 v[0:1], 31, v[0:1]
497
- ; VI-NEXT: s_mov_b32 s2, -1
498
- ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3 ], 0
499
+ ; VI-NEXT: s_mov_b32 s5, s1
500
+ ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7 ], 0
499
501
; VI-NEXT: s_endpgm
500
502
;
501
503
; GFX9-LABEL: test_smul24_i33:
@@ -574,30 +576,32 @@ entry:
574
576
define amdgpu_kernel void @test_smulhi24_i33 (ptr addrspace (1 ) %out , i33 %a , i33 %b ) {
575
577
; SI-LABEL: test_smulhi24_i33:
576
578
; SI: ; %bb.0: ; %entry
577
- ; SI-NEXT: s_load_dword s6, s[4:5], 0xd
578
- ; SI-NEXT: s_load_dword s7, s[4:5], 0xb
579
- ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
580
- ; SI-NEXT: s_mov_b32 s3, 0xf000
581
- ; SI-NEXT: s_mov_b32 s2, -1
579
+ ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
580
+ ; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd
581
+ ; SI-NEXT: s_mov_b32 s7, 0xf000
582
+ ; SI-NEXT: s_mov_b32 s6, -1
582
583
; SI-NEXT: s_waitcnt lgkmcnt(0)
583
- ; SI-NEXT: v_mov_b32_e32 v0, s6
584
- ; SI-NEXT: v_mul_hi_i32_i24_e32 v0, s7, v0
584
+ ; SI-NEXT: s_mov_b32 s4, s0
585
+ ; SI-NEXT: s_mov_b32 s5, s1
586
+ ; SI-NEXT: v_mov_b32_e32 v0, s8
587
+ ; SI-NEXT: v_mul_hi_i32_i24_e32 v0, s2, v0
585
588
; SI-NEXT: v_and_b32_e32 v0, 1, v0
586
- ; SI-NEXT: buffer_store_dword v0, off, s[0:3 ], 0
589
+ ; SI-NEXT: buffer_store_dword v0, off, s[4:7 ], 0
587
590
; SI-NEXT: s_endpgm
588
591
;
589
592
; VI-LABEL: test_smulhi24_i33:
590
593
; VI: ; %bb.0: ; %entry
591
- ; VI-NEXT: s_load_dword s6, s[4:5], 0x34
592
- ; VI-NEXT: s_load_dword s7, s[4:5], 0x2c
593
- ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
594
- ; VI-NEXT: s_mov_b32 s3, 0xf000
595
- ; VI-NEXT: s_mov_b32 s2, -1
594
+ ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
595
+ ; VI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34
596
+ ; VI-NEXT: s_mov_b32 s7, 0xf000
597
+ ; VI-NEXT: s_mov_b32 s6, -1
596
598
; VI-NEXT: s_waitcnt lgkmcnt(0)
597
- ; VI-NEXT: v_mov_b32_e32 v0, s6
598
- ; VI-NEXT: v_mul_hi_i32_i24_e32 v0, s7, v0
599
+ ; VI-NEXT: s_mov_b32 s4, s0
600
+ ; VI-NEXT: v_mov_b32_e32 v0, s8
601
+ ; VI-NEXT: v_mul_hi_i32_i24_e32 v0, s2, v0
602
+ ; VI-NEXT: s_mov_b32 s5, s1
599
603
; VI-NEXT: v_and_b32_e32 v0, 1, v0
600
- ; VI-NEXT: buffer_store_dword v0, off, s[0:3 ], 0
604
+ ; VI-NEXT: buffer_store_dword v0, off, s[4:7 ], 0
601
605
; VI-NEXT: s_endpgm
602
606
;
603
607
; GFX9-LABEL: test_smulhi24_i33:
0 commit comments