Skip to content

Commit 15fb186

Browse files
committed
[AMDGPU] Stop reserving $vcc_hi in wave32 mode
This gives us one extra SGPR to play with. The code comment removed by this change suggested that allocating $vcc_hi in wave32 mode could cause bugs, but I have tested it with Vulkan CTS with the default wave size for compute shaders set to 32 and did not find any problems.
1 parent 29e42da commit 15fb186

File tree

6 files changed

+88
-99
lines changed

6 files changed

+88
-99
lines changed

llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp

Lines changed: 0 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -612,13 +612,6 @@ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
612612
// Reserve null register - it shall never be allocated
613613
reserveRegisterTuples(Reserved, AMDGPU::SGPR_NULL64);
614614

615-
// Disallow vcc_hi allocation in wave32. It may be allocated but most likely
616-
// will result in bugs.
617-
if (isWave32) {
618-
Reserved.set(AMDGPU::VCC);
619-
Reserved.set(AMDGPU::VCC_HI);
620-
}
621-
622615
// Reserve SGPRs.
623616
//
624617
unsigned MaxNumSGPRs = ST.getMaxNumSGPRs(MF);

llvm/test/CodeGen/AMDGPU/bf16.ll

Lines changed: 10 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -38342,12 +38342,11 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x
3834238342
; GFX10-NEXT: v_and_b32_e32 v2, 1, v2
3834338343
; GFX10-NEXT: v_and_b32_e32 v4, 1, v4
3834438344
; GFX10-NEXT: v_and_b32_e32 v6, 1, v6
38345-
; GFX10-NEXT: v_writelane_b32 v40, s31, 1
3834638345
; GFX10-NEXT: v_and_b32_e32 v8, 1, v8
3834738346
; GFX10-NEXT: v_and_b32_e32 v10, 1, v10
38347+
; GFX10-NEXT: v_writelane_b32 v40, s31, 1
3834838348
; GFX10-NEXT: v_and_b32_e32 v1, 1, v1
3834938349
; GFX10-NEXT: v_and_b32_e32 v3, 1, v3
38350-
; GFX10-NEXT: v_writelane_b32 v40, s34, 2
3835138350
; GFX10-NEXT: v_and_b32_e32 v5, 1, v5
3835238351
; GFX10-NEXT: v_and_b32_e32 v7, 1, v7
3835338352
; GFX10-NEXT: v_and_b32_e32 v9, 1, v9
@@ -38366,7 +38365,7 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x
3836638365
; GFX10-NEXT: v_cmp_eq_u32_e64 s17, 1, v4
3836738366
; GFX10-NEXT: v_cmp_eq_u32_e64 s18, 1, v2
3836838367
; GFX10-NEXT: v_cmp_eq_u32_e64 s19, 1, v0
38369-
; GFX10-NEXT: v_writelane_b32 v40, s35, 3
38368+
; GFX10-NEXT: v_writelane_b32 v40, s34, 2
3837038369
; GFX10-NEXT: v_cmp_eq_u32_e64 s20, 1, v27
3837138370
; GFX10-NEXT: v_cmp_eq_u32_e64 s21, 1, v25
3837238371
; GFX10-NEXT: v_cmp_eq_u32_e64 s22, 1, v23
@@ -38377,10 +38376,10 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x
3837738376
; GFX10-NEXT: v_cmp_eq_u32_e64 s27, 1, v13
3837838377
; GFX10-NEXT: v_cmp_eq_u32_e64 s28, 1, v11
3837938378
; GFX10-NEXT: v_cmp_eq_u32_e64 s29, 1, v7
38380-
; GFX10-NEXT: v_cmp_eq_u32_e64 s30, 1, v3
38381-
; GFX10-NEXT: v_cmp_eq_u32_e64 s31, 1, v1
38382-
; GFX10-NEXT: v_cmp_eq_u32_e64 s34, 1, v5
38383-
; GFX10-NEXT: v_cmp_eq_u32_e64 s35, 1, v9
38379+
; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_hi, 1, v3
38380+
; GFX10-NEXT: v_cmp_eq_u32_e64 s30, 1, v1
38381+
; GFX10-NEXT: v_cmp_eq_u32_e64 s31, 1, v5
38382+
; GFX10-NEXT: v_cmp_eq_u32_e64 s34, 1, v9
3838438383
; GFX10-NEXT: s_waitcnt vmcnt(32)
3838538384
; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v31
3838638385
; GFX10-NEXT: s_waitcnt vmcnt(31)
@@ -38460,10 +38459,10 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x
3846038459
; GFX10-NEXT: v_cndmask_b32_e64 v6, v29, v39, s27
3846138460
; GFX10-NEXT: v_cndmask_b32_e64 v5, v28, v26, s28
3846238461
; GFX10-NEXT: v_cndmask_b32_e64 v20, v51, v20, s29
38463-
; GFX10-NEXT: v_cndmask_b32_e64 v0, v14, v12, s31
38464-
; GFX10-NEXT: v_cndmask_b32_e64 v1, v55, v16, s30
38465-
; GFX10-NEXT: v_cndmask_b32_e64 v2, v53, v18, s34
38466-
; GFX10-NEXT: v_cndmask_b32_e64 v12, v24, v22, s35
38462+
; GFX10-NEXT: v_cndmask_b32_e64 v0, v14, v12, s30
38463+
; GFX10-NEXT: v_cndmask_b32_e64 v1, v55, v16, vcc_hi
38464+
; GFX10-NEXT: v_cndmask_b32_e64 v2, v53, v18, s31
38465+
; GFX10-NEXT: v_cndmask_b32_e64 v12, v24, v22, s34
3846738466
; GFX10-NEXT: v_cndmask_b32_e64 v16, v4, v3, s4
3846838467
; GFX10-NEXT: v_perm_b32 v0, v0, v64, 0x5040100
3846938468
; GFX10-NEXT: v_perm_b32 v1, v1, v54, 0x5040100
@@ -38481,7 +38480,6 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x
3848138480
; GFX10-NEXT: v_perm_b32 v13, v66, v13, 0x5040100
3848238481
; GFX10-NEXT: v_perm_b32 v14, v65, v17, 0x5040100
3848338482
; GFX10-NEXT: v_perm_b32 v15, v16, v15, 0x5040100
38484-
; GFX10-NEXT: v_readlane_b32 s35, v40, 3
3848538483
; GFX10-NEXT: v_readlane_b32 s34, v40, 2
3848638484
; GFX10-NEXT: v_readlane_b32 s31, v40, 1
3848738485
; GFX10-NEXT: v_readlane_b32 s30, v40, 0

llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.f16.ll

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1259,17 +1259,17 @@ define <4 x i1> @isnan_v4f16(<4 x half> %x) nounwind {
12591259
; GFX10SELDAG-LABEL: isnan_v4f16:
12601260
; GFX10SELDAG: ; %bb.0:
12611261
; GFX10SELDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1262-
; GFX10SELDAG-NEXT: v_mov_b32_e32 v2, 3
1263-
; GFX10SELDAG-NEXT: v_cmp_class_f16_e64 s5, v0, 3
1264-
; GFX10SELDAG-NEXT: v_cmp_class_f16_sdwa s4, v1, v2 src0_sel:WORD_1 src1_sel:DWORD
1265-
; GFX10SELDAG-NEXT: v_cndmask_b32_e64 v4, 0, 1, s5
1266-
; GFX10SELDAG-NEXT: v_cmp_class_f16_sdwa s5, v0, v2 src0_sel:WORD_1 src1_sel:DWORD
1262+
; GFX10SELDAG-NEXT: v_cmp_class_f16_e64 s4, v0, 3
1263+
; GFX10SELDAG-NEXT: v_mov_b32_e32 v3, 3
1264+
; GFX10SELDAG-NEXT: v_cndmask_b32_e64 v5, 0, 1, s4
1265+
; GFX10SELDAG-NEXT: v_cmp_class_f16_e64 s4, v1, 3
1266+
; GFX10SELDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, s4
1267+
; GFX10SELDAG-NEXT: v_cmp_class_f16_sdwa s4, v0, v3 src0_sel:WORD_1 src1_sel:DWORD
1268+
; GFX10SELDAG-NEXT: v_mov_b32_e32 v0, v5
1269+
; GFX10SELDAG-NEXT: v_cndmask_b32_e64 v4, 0, 1, s4
1270+
; GFX10SELDAG-NEXT: v_cmp_class_f16_sdwa s4, v1, v3 src0_sel:WORD_1 src1_sel:DWORD
1271+
; GFX10SELDAG-NEXT: v_mov_b32_e32 v1, v4
12671272
; GFX10SELDAG-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4
1268-
; GFX10SELDAG-NEXT: v_mov_b32_e32 v0, v4
1269-
; GFX10SELDAG-NEXT: v_cndmask_b32_e64 v5, 0, 1, s5
1270-
; GFX10SELDAG-NEXT: v_cmp_class_f16_e64 s5, v1, 3
1271-
; GFX10SELDAG-NEXT: v_mov_b32_e32 v1, v5
1272-
; GFX10SELDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, s5
12731273
; GFX10SELDAG-NEXT: s_setpc_b64 s[30:31]
12741274
;
12751275
; GFX10GLISEL-LABEL: isnan_v4f16:

llvm/test/CodeGen/AMDGPU/load-constant-i16.ll

Lines changed: 11 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -7404,35 +7404,35 @@ define amdgpu_kernel void @constant_sextload_v16i16_to_v16i64(ptr addrspace(1) %
74047404
; GFX12-NEXT: v_dual_mov_b32 v4, s22 :: v_dual_mov_b32 v9, s31
74057405
; GFX12-NEXT: v_dual_mov_b32 v8, s30 :: v_dual_mov_b32 v11, s35
74067406
; GFX12-NEXT: v_dual_mov_b32 v10, s34 :: v_dual_mov_b32 v3, s5
7407-
; GFX12-NEXT: s_bfe_i64 s[10:11], s[0:1], 0x100000
7408-
; GFX12-NEXT: s_lshr_b32 s12, s0, 16
7409-
; GFX12-NEXT: s_mov_b32 s14, s1
7410-
; GFX12-NEXT: s_lshr_b32 s16, s1, 16
7411-
; GFX12-NEXT: s_bfe_i64 s[0:1], s[2:3], 0x100000
7407+
; GFX12-NEXT: s_bfe_i64 s[16:17], s[2:3], 0x100000
74127408
; GFX12-NEXT: s_lshr_b32 s2, s2, 16
74137409
; GFX12-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x100000
74147410
; GFX12-NEXT: v_dual_mov_b32 v0, s28 :: v_dual_mov_b32 v5, s23
74157411
; GFX12-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v13, s25
7412+
; GFX12-NEXT: s_mov_b32 s12, s1
7413+
; GFX12-NEXT: s_lshr_b32 s14, s1, 16
74167414
; GFX12-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x100000
74177415
; GFX12-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x100000
74187416
; GFX12-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27
74197417
; GFX12-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v7, s7
7418+
; GFX12-NEXT: s_bfe_i64 s[10:11], s[0:1], 0x100000
7419+
; GFX12-NEXT: s_lshr_b32 s0, s0, 16
74207420
; GFX12-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x100000
74217421
; GFX12-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v17, s19
7422+
; GFX12-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x100000
74227423
; GFX12-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x100000
7423-
; GFX12-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x100000
74247424
; GFX12-NEXT: v_dual_mov_b32 v16, s18 :: v_dual_mov_b32 v19, s21
74257425
; GFX12-NEXT: v_mov_b32_e32 v18, s20
7426-
; GFX12-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x100000
7426+
; GFX12-NEXT: s_bfe_i64 s[0:1], s[0:1], 0x100000
74277427
; GFX12-NEXT: s_clause 0x1
74287428
; GFX12-NEXT: global_store_b128 v24, v[8:11], s[8:9] offset:80
74297429
; GFX12-NEXT: global_store_b128 v24, v[0:3], s[8:9] offset:64
7430-
; GFX12-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v0, s0
7430+
; GFX12-NEXT: v_dual_mov_b32 v1, s17 :: v_dual_mov_b32 v0, s16
74317431
; GFX12-NEXT: v_dual_mov_b32 v3, s3 :: v_dual_mov_b32 v2, s2
7432-
; GFX12-NEXT: v_dual_mov_b32 v9, s15 :: v_dual_mov_b32 v8, s14
7433-
; GFX12-NEXT: v_dual_mov_b32 v11, s17 :: v_dual_mov_b32 v10, s16
7432+
; GFX12-NEXT: v_dual_mov_b32 v9, s13 :: v_dual_mov_b32 v8, s12
7433+
; GFX12-NEXT: v_dual_mov_b32 v11, s15 :: v_dual_mov_b32 v10, s14
74347434
; GFX12-NEXT: v_dual_mov_b32 v21, s11 :: v_dual_mov_b32 v20, s10
7435-
; GFX12-NEXT: v_dual_mov_b32 v23, s13 :: v_dual_mov_b32 v22, s12
7435+
; GFX12-NEXT: v_dual_mov_b32 v23, s1 :: v_dual_mov_b32 v22, s0
74367436
; GFX12-NEXT: s_clause 0x5
74377437
; GFX12-NEXT: global_store_b128 v24, v[12:15], s[8:9] offset:112
74387438
; GFX12-NEXT: global_store_b128 v24, v[4:7], s[8:9] offset:96

llvm/test/CodeGen/AMDGPU/load-constant-i8.ll

Lines changed: 45 additions & 45 deletions
Original file line numberDiff line numberDiff line change
@@ -8808,90 +8808,90 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i64(ptr addrspace(1) %o
88088808
; GFX12-NEXT: v_lshrrev_b16 v2, 8, s6
88098809
; GFX12-NEXT: v_lshrrev_b16 v4, 8, s5
88108810
; GFX12-NEXT: v_lshrrev_b16 v8, 8, s2
8811-
; GFX12-NEXT: s_lshr_b32 s24, s7, 16
8811+
; GFX12-NEXT: s_lshr_b32 s22, s7, 16
88128812
; GFX12-NEXT: v_bfe_i32 v31, v1, 0, 8
8813-
; GFX12-NEXT: s_lshr_b32 s42, s2, 24
8814-
; GFX12-NEXT: s_mov_b32 s48, s7
8813+
; GFX12-NEXT: s_lshr_b32 s40, s2, 24
8814+
; GFX12-NEXT: s_mov_b32 s46, s7
88158815
; GFX12-NEXT: v_lshrrev_b16 v5, 8, s4
88168816
; GFX12-NEXT: v_lshrrev_b16 v7, 8, s1
8817-
; GFX12-NEXT: s_lshr_b32 s26, s6, 16
8818-
; GFX12-NEXT: s_lshr_b32 s44, s1, 16
8817+
; GFX12-NEXT: s_lshr_b32 s24, s6, 16
8818+
; GFX12-NEXT: s_lshr_b32 s42, s1, 16
88198819
; GFX12-NEXT: s_ashr_i64 s[58:59], s[6:7], 56
8820-
; GFX12-NEXT: s_bfe_i64 s[48:49], s[48:49], 0x80000
8821-
; GFX12-NEXT: s_bfe_i64 s[42:43], s[42:43], 0x80000
8822-
; GFX12-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x80000
8820+
; GFX12-NEXT: s_bfe_i64 s[46:47], s[46:47], 0x80000
8821+
; GFX12-NEXT: s_bfe_i64 s[40:41], s[40:41], 0x80000
8822+
; GFX12-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x80000
88238823
; GFX12-NEXT: v_lshrrev_b16 v6, 8, s3
88248824
; GFX12-NEXT: v_lshrrev_b16 v3, 8, s0
8825-
; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v33, s24
8826-
; GFX12-NEXT: s_lshr_b32 s28, s6, 24
8827-
; GFX12-NEXT: s_lshr_b32 s30, s5, 16
8828-
; GFX12-NEXT: s_lshr_b32 s40, s2, 16
8825+
; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v33, s22
8826+
; GFX12-NEXT: s_lshr_b32 s26, s6, 24
8827+
; GFX12-NEXT: s_lshr_b32 s28, s5, 16
8828+
; GFX12-NEXT: s_lshr_b32 s38, s2, 16
88298829
; GFX12-NEXT: v_bfe_i32 v11, v8, 0, 8
88308830
; GFX12-NEXT: v_bfe_i32 v23, v4, 0, 8
88318831
; GFX12-NEXT: v_bfe_i32 v27, v2, 0, 8
88328832
; GFX12-NEXT: v_ashrrev_i32_e32 v32, 31, v31
8833-
; GFX12-NEXT: s_bfe_i64 s[44:45], s[44:45], 0x80000
8834-
; GFX12-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x80000
8835-
; GFX12-NEXT: v_dual_mov_b32 v34, s25 :: v_dual_mov_b32 v35, s58
8836-
; GFX12-NEXT: v_dual_mov_b32 v36, s59 :: v_dual_mov_b32 v37, s26
8837-
; GFX12-NEXT: v_dual_mov_b32 v56, s43 :: v_dual_mov_b32 v29, s48
8838-
; GFX12-NEXT: v_mov_b32_e32 v30, s49
8839-
; GFX12-NEXT: s_lshr_b32 s46, s0, 24
8840-
; GFX12-NEXT: s_mov_b32 s50, s5
8841-
; GFX12-NEXT: s_mov_b32 s52, s3
8842-
; GFX12-NEXT: s_lshr_b32 s34, s4, 16
8843-
; GFX12-NEXT: s_lshr_b32 s36, s4, 24
8844-
; GFX12-NEXT: s_ashr_i64 s[22:23], s[2:3], 56
8833+
; GFX12-NEXT: s_bfe_i64 s[42:43], s[42:43], 0x80000
8834+
; GFX12-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x80000
8835+
; GFX12-NEXT: v_dual_mov_b32 v34, s23 :: v_dual_mov_b32 v35, s58
8836+
; GFX12-NEXT: v_dual_mov_b32 v36, s59 :: v_dual_mov_b32 v37, s24
8837+
; GFX12-NEXT: v_dual_mov_b32 v56, s41 :: v_dual_mov_b32 v29, s46
8838+
; GFX12-NEXT: v_mov_b32_e32 v30, s47
8839+
; GFX12-NEXT: s_lshr_b32 s44, s0, 24
8840+
; GFX12-NEXT: s_mov_b32 s48, s5
8841+
; GFX12-NEXT: s_mov_b32 s50, s3
8842+
; GFX12-NEXT: s_lshr_b32 s30, s4, 16
8843+
; GFX12-NEXT: s_lshr_b32 s34, s4, 24
8844+
; GFX12-NEXT: s_ashr_i64 s[54:55], s[2:3], 56
88458845
; GFX12-NEXT: s_ashr_i64 s[56:57], s[4:5], 56
88468846
; GFX12-NEXT: v_bfe_i32 v7, v7, 0, 8
88478847
; GFX12-NEXT: v_bfe_i32 v19, v5, 0, 8
8848-
; GFX12-NEXT: s_bfe_i64 s[40:41], s[40:41], 0x80000
8849-
; GFX12-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x80000
8848+
; GFX12-NEXT: s_bfe_i64 s[38:39], s[38:39], 0x80000
88508849
; GFX12-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x80000
8851-
; GFX12-NEXT: s_lshr_b32 s38, s3, 16
8852-
; GFX12-NEXT: s_mov_b32 s54, s1
8850+
; GFX12-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x80000
8851+
; GFX12-NEXT: s_lshr_b32 s36, s3, 16
8852+
; GFX12-NEXT: s_mov_b32 s52, s1
88538853
; GFX12-NEXT: s_bfe_i64 s[12:13], s[2:3], 0x80000
88548854
; GFX12-NEXT: s_bfe_i64 s[14:15], s[4:5], 0x80000
88558855
; GFX12-NEXT: s_bfe_i64 s[16:17], s[6:7], 0x80000
8856-
; GFX12-NEXT: s_bfe_i64 s[2:3], s[52:53], 0x80000
8857-
; GFX12-NEXT: s_bfe_i64 s[4:5], s[50:51], 0x80000
8858-
; GFX12-NEXT: s_bfe_i64 s[6:7], s[46:47], 0x80000
8856+
; GFX12-NEXT: s_bfe_i64 s[2:3], s[50:51], 0x80000
8857+
; GFX12-NEXT: s_bfe_i64 s[4:5], s[48:49], 0x80000
8858+
; GFX12-NEXT: s_bfe_i64 s[6:7], s[44:45], 0x80000
88598859
; GFX12-NEXT: s_lshr_b32 s20, s0, 16
88608860
; GFX12-NEXT: s_ashr_i64 s[18:19], s[0:1], 56
88618861
; GFX12-NEXT: v_bfe_i32 v3, v3, 0, 8
88628862
; GFX12-NEXT: v_bfe_i32 v15, v6, 0, 8
8863-
; GFX12-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x80000
88648863
; GFX12-NEXT: s_bfe_i64 s[34:35], s[34:35], 0x80000
8865-
; GFX12-NEXT: v_dual_mov_b32 v38, s27 :: v_dual_mov_b32 v39, s28
8866-
; GFX12-NEXT: v_dual_mov_b32 v40, s29 :: v_dual_mov_b32 v41, s30
8867-
; GFX12-NEXT: v_dual_mov_b32 v42, s31 :: v_dual_mov_b32 v43, s56
8868-
; GFX12-NEXT: v_dual_mov_b32 v44, s57 :: v_dual_mov_b32 v45, s34
8869-
; GFX12-NEXT: v_dual_mov_b32 v52, s23 :: v_dual_mov_b32 v53, s40
8870-
; GFX12-NEXT: v_dual_mov_b32 v54, s41 :: v_dual_mov_b32 v55, s42
8864+
; GFX12-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x80000
8865+
; GFX12-NEXT: v_dual_mov_b32 v38, s25 :: v_dual_mov_b32 v39, s26
8866+
; GFX12-NEXT: v_dual_mov_b32 v40, s27 :: v_dual_mov_b32 v41, s28
8867+
; GFX12-NEXT: v_dual_mov_b32 v42, s29 :: v_dual_mov_b32 v43, s56
8868+
; GFX12-NEXT: v_dual_mov_b32 v44, s57 :: v_dual_mov_b32 v45, s30
8869+
; GFX12-NEXT: v_dual_mov_b32 v52, s55 :: v_dual_mov_b32 v53, s38
8870+
; GFX12-NEXT: v_dual_mov_b32 v54, s39 :: v_dual_mov_b32 v55, s40
88718871
; GFX12-NEXT: s_bfe_i64 s[10:11], s[0:1], 0x80000
8872-
; GFX12-NEXT: s_bfe_i64 s[0:1], s[54:55], 0x80000
8872+
; GFX12-NEXT: s_bfe_i64 s[0:1], s[52:53], 0x80000
88738873
; GFX12-NEXT: v_ashrrev_i32_e32 v12, 31, v11
88748874
; GFX12-NEXT: v_ashrrev_i32_e32 v24, 31, v23
88758875
; GFX12-NEXT: v_ashrrev_i32_e32 v28, 31, v27
88768876
; GFX12-NEXT: global_store_b128 v0, v[33:36], s[8:9] offset:240
8877-
; GFX12-NEXT: v_mov_b32_e32 v33, s44
8877+
; GFX12-NEXT: v_mov_b32_e32 v33, s42
88788878
; GFX12-NEXT: global_store_b128 v0, v[29:32], s[8:9] offset:224
88798879
; GFX12-NEXT: v_dual_mov_b32 v25, s16 :: v_dual_mov_b32 v26, s17
88808880
; GFX12-NEXT: v_dual_mov_b32 v32, s7 :: v_dual_mov_b32 v21, s4
88818881
; GFX12-NEXT: v_dual_mov_b32 v22, s5 :: v_dual_mov_b32 v17, s14
88828882
; GFX12-NEXT: v_dual_mov_b32 v14, s3 :: v_dual_mov_b32 v9, s12
88838883
; GFX12-NEXT: v_dual_mov_b32 v10, s13 :: v_dual_mov_b32 v5, s0
88848884
; GFX12-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x80000
8885-
; GFX12-NEXT: s_bfe_i64 s[38:39], s[38:39], 0x80000
8886-
; GFX12-NEXT: v_dual_mov_b32 v46, s35 :: v_dual_mov_b32 v47, s36
8887-
; GFX12-NEXT: v_dual_mov_b32 v48, s37 :: v_dual_mov_b32 v49, s38
8888-
; GFX12-NEXT: v_dual_mov_b32 v34, s45 :: v_dual_mov_b32 v35, s18
8885+
; GFX12-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x80000
8886+
; GFX12-NEXT: v_dual_mov_b32 v46, s31 :: v_dual_mov_b32 v47, s34
8887+
; GFX12-NEXT: v_dual_mov_b32 v48, s35 :: v_dual_mov_b32 v49, s36
8888+
; GFX12-NEXT: v_dual_mov_b32 v34, s43 :: v_dual_mov_b32 v35, s18
88898889
; GFX12-NEXT: v_dual_mov_b32 v36, s19 :: v_dual_mov_b32 v29, s20
88908890
; GFX12-NEXT: v_ashrrev_i32_e32 v8, 31, v7
88918891
; GFX12-NEXT: v_ashrrev_i32_e32 v20, 31, v19
88928892
; GFX12-NEXT: v_dual_mov_b32 v18, s15 :: v_dual_mov_b32 v13, s2
88938893
; GFX12-NEXT: v_dual_mov_b32 v6, s1 :: v_dual_mov_b32 v1, s10
8894-
; GFX12-NEXT: v_dual_mov_b32 v50, s39 :: v_dual_mov_b32 v51, s22
8894+
; GFX12-NEXT: v_dual_mov_b32 v50, s37 :: v_dual_mov_b32 v51, s54
88958895
; GFX12-NEXT: v_dual_mov_b32 v30, s21 :: v_dual_mov_b32 v31, s6
88968896
; GFX12-NEXT: v_ashrrev_i32_e32 v4, 31, v3
88978897
; GFX12-NEXT: v_ashrrev_i32_e32 v16, 31, v15

0 commit comments

Comments
 (0)