
Commit 6cf5acc

RegAllocGreedy: Fix subrange based instruction split logic
Fix the logic for readsLaneSubset. Check at the correct point for the use operands of the instruction, instead of the result. Only consider the use register operands, and stop considering whether the subranges are actually live at this point. This avoids some unproductive splits. This also happens to avoid a use after free due to a split of an unspillable register. That issue still exists if the instruction does not reference the full set of register lanes.
1 parent 93220e7 commit 6cf5acc

9 files changed (+486 additions, -461 deletions)
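The core of the change, sketched as a minimal standalone C++ example (the LaneMask type, FullMask value, and readsStrictSubset helper below are hypothetical illustrations, not the LLVM definitions): an instruction that only reads some sub-registers of a virtual register accumulates a use mask that is a strict subset of the register's full lane mask, and that is now the condition the instruction-split heuristic keys on, independent of which subranges happen to be live at the use point.

    #include <cstdint>
    #include <cstdio>

    // Hypothetical stand-in for LLVM's LaneBitmask: one bit per 32-bit lane of a
    // 4-lane (128-bit) virtual register. The real masks come from
    // TRI->getSubRegIndexLaneMask() and MRI.getMaxLaneMaskForVReg().
    using LaneMask = uint32_t;
    constexpr LaneMask FullMask = 0b1111;

    // True if the lanes read by an instruction are a strict subset of the
    // register's lanes, i.e. the instruction does not touch the whole register.
    static bool readsStrictSubset(LaneMask UseMask) { return UseMask != FullMask; }

    int main() {
      LaneMask Sub01Use = 0b0011; // use of sub0_sub1 only -> worth splitting around
      LaneMask WholeUse = 0b1111; // use of the full register -> skip the split

      std::printf("sub0_sub1 use is strict subset: %d\n", readsStrictSubset(Sub01Use));
      std::printf("full-reg use is strict subset:  %d\n", readsStrictSubset(WholeUse));
      return 0;
    }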

llvm/lib/CodeGen/RegAllocGreedy.cpp

Lines changed: 26 additions & 38 deletions
@@ -1350,13 +1350,27 @@ static unsigned getNumAllocatableRegsForConstraints(
   return RCI.getNumAllocatableRegs(ConstrainedRC);
 }
 
-static LaneBitmask getInstReadLaneMask(const MachineRegisterInfo &MRI,
-                                       const TargetRegisterInfo &TRI,
-                                       const MachineInstr &FirstMI,
-                                       Register Reg) {
-  LaneBitmask Mask;
+/// Return true if \p MI at \P Use reads a strict subset of the lanes of \p
+/// VirtReg (not the whole register).
+static bool readsLaneStrictSubset(const MachineRegisterInfo &MRI,
+                                  const MachineInstr *MI,
+                                  const LiveInterval &VirtReg,
+                                  const TargetRegisterInfo *TRI,
+                                  const TargetInstrInfo *TII) {
+  // Early check the common case. Beware of the semi-formed bundles SplitKit
+  // creates by setting the bundle flag on copies without a matching BUNDLE.
+
+  auto DestSrc = TII->isCopyInstr(*MI);
+  if (DestSrc && !MI->isBundled() &&
+      DestSrc->Destination->getSubReg() == DestSrc->Source->getSubReg())
+    return false;
+
+  Register Reg = VirtReg.reg();
+
+  // FIXME: We're only considering uses, but should be consider defs too?
+  LaneBitmask UseMask;
   SmallVector<std::pair<MachineInstr *, unsigned>, 8> Ops;
-  (void)AnalyzeVirtRegInBundle(const_cast<MachineInstr &>(FirstMI), Reg, &Ops);
+  (void)AnalyzeVirtRegInBundle(const_cast<MachineInstr &>(*MI), Reg, &Ops);
 
   for (auto [MI, OpIdx] : Ops) {
     const MachineOperand &MO = MI->getOperand(OpIdx);
@@ -1365,46 +1379,20 @@ static LaneBitmask getInstReadLaneMask(const MachineRegisterInfo &MRI,
     if (SubReg == 0 && MO.isUse()) {
       if (MO.isUndef())
         continue;
-      return MRI.getMaxLaneMaskForVReg(Reg);
+      return false;
     }
 
-    LaneBitmask SubRegMask = TRI.getSubRegIndexLaneMask(SubReg);
+    LaneBitmask SubRegMask = TRI->getSubRegIndexLaneMask(SubReg);
     if (MO.isDef()) {
       if (!MO.isUndef())
-        Mask |= ~SubRegMask;
+        UseMask |= ~SubRegMask;
     } else
-      Mask |= SubRegMask;
-  }
-
-  return Mask;
-}
-
-/// Return true if \p MI at \P Use reads a subset of the lanes live in \p
-/// VirtReg.
-static bool readsLaneSubset(const MachineRegisterInfo &MRI,
-                            const MachineInstr *MI, const LiveInterval &VirtReg,
-                            const TargetRegisterInfo *TRI, SlotIndex Use,
-                            const TargetInstrInfo *TII) {
-  // Early check the common case. Beware of the semi-formed bundles SplitKit
-  // creates by setting the bundle flag on copies without a matching BUNDLE.
-
-  auto DestSrc = TII->isCopyInstr(*MI);
-  if (DestSrc && !MI->isBundled() &&
-      DestSrc->Destination->getSubReg() == DestSrc->Source->getSubReg())
-    return false;
-
-  // FIXME: We're only considering uses, but should be consider defs too?
-  LaneBitmask ReadMask = getInstReadLaneMask(MRI, *TRI, *MI, VirtReg.reg());
-
-  LaneBitmask LiveAtMask;
-  for (const LiveInterval::SubRange &S : VirtReg.subranges()) {
-    if (S.liveAt(Use))
-      LiveAtMask |= S.LaneMask;
+      UseMask |= SubRegMask;
   }
 
   // If the live lanes aren't different from the lanes used by the instruction,
   // this doesn't help.
-  return (ReadMask & ~(LiveAtMask & TRI->getCoveringLanes())).any();
+  return UseMask != MRI.getMaxLaneMaskForVReg(VirtReg.reg());
 }
 
 /// tryInstructionSplit - Split a live range around individual instructions.
@@ -1456,7 +1444,7 @@ unsigned RAGreedy::tryInstructionSplit(const LiveInterval &VirtReg,
                                                 TII, TRI, RegClassInfo)) ||
         // TODO: Handle split for subranges with subclass constraints?
         (!SplitSubClass && VirtReg.hasSubRanges() &&
-         !readsLaneSubset(*MRI, MI, VirtReg, TRI, Use, TII))) {
+         !readsLaneStrictSubset(*MRI, MI, VirtReg, TRI, TII))) {
       LLVM_DEBUG(dbgs() << " skip:\t" << Use << '\t' << *MI);
       continue;
     }
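To make the shape of the new predicate concrete, here is a rough, self-contained sketch of accumulating a use mask from sub-register operands and comparing it against the whole-register mask. The Operand struct and mask values are made-up stand-ins, not LLVM data structures, and the sketch only models the use-operand path (the function above also folds in non-undef def operands).

    #include <cstdint>
    #include <vector>

    // Hypothetical operand of one instruction referring to a single virtual
    // register, possibly through a sub-register index (SubRegMask == 0 means the
    // operand names the full register).
    struct Operand {
      bool IsUse;
      bool IsUndef;
      uint64_t SubRegMask;
    };

    // Accumulate the lanes read by the use operands; any non-undef full-register
    // use means the instruction reads every lane, so no strict subset is read.
    static bool readsLaneStrictSubset(const std::vector<Operand> &Ops,
                                      uint64_t MaxLaneMask) {
      uint64_t UseMask = 0;
      for (const Operand &MO : Ops) {
        if (!MO.IsUse)
          continue;
        if (MO.SubRegMask == 0) {
          if (MO.IsUndef)
            continue;
          return false; // the whole register is read
        }
        UseMask |= MO.SubRegMask;
      }
      return UseMask != MaxLaneMask;
    }

    int main() {
      // A use of two out of four lanes reads a strict subset of the register.
      std::vector<Operand> Ops = {{/*IsUse=*/true, /*IsUndef=*/false,
                                   /*SubRegMask=*/0b0011}};
      return readsLaneStrictSubset(Ops, /*MaxLaneMask=*/0b1111) ? 0 : 1;
    }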

llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll

Lines changed: 60 additions & 54 deletions
@@ -3181,7 +3181,7 @@ define amdgpu_gfx void @call_72xi32() #1 {
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_b32 s33, s33, 0xfffffe00
 ; GFX11-NEXT:    s_or_saveexec_b32 s0, -1
-; GFX11-NEXT:    scratch_store_b32 off, v60, s33 offset:1600 ; 4-byte Folded Spill
+; GFX11-NEXT:    scratch_store_b32 off, v63, s33 offset:1584 ; 4-byte Folded Spill
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX11-NEXT:    s_mov_b32 s0, 0
 ; GFX11-NEXT:    v_mov_b32_e32 v4, 0
@@ -3191,19 +3191,22 @@ define amdgpu_gfx void @call_72xi32() #1 {
 ; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX11-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
 ; GFX11-NEXT:    s_addk_i32 s32, 0xa00
-; GFX11-NEXT:    s_clause 0xb
-; GFX11-NEXT:    scratch_store_b32 off, v40, s33 offset:44
-; GFX11-NEXT:    scratch_store_b32 off, v41, s33 offset:40
-; GFX11-NEXT:    scratch_store_b32 off, v42, s33 offset:36
-; GFX11-NEXT:    scratch_store_b32 off, v43, s33 offset:32
-; GFX11-NEXT:    scratch_store_b32 off, v44, s33 offset:28
-; GFX11-NEXT:    scratch_store_b32 off, v45, s33 offset:24
-; GFX11-NEXT:    scratch_store_b32 off, v46, s33 offset:20
-; GFX11-NEXT:    scratch_store_b32 off, v47, s33 offset:16
-; GFX11-NEXT:    scratch_store_b32 off, v56, s33 offset:12
-; GFX11-NEXT:    scratch_store_b32 off, v57, s33 offset:8
-; GFX11-NEXT:    scratch_store_b32 off, v58, s33 offset:4
-; GFX11-NEXT:    scratch_store_b32 off, v59, s33
+; GFX11-NEXT:    s_clause 0xe
+; GFX11-NEXT:    scratch_store_b32 off, v40, s33 offset:56
+; GFX11-NEXT:    scratch_store_b32 off, v41, s33 offset:52
+; GFX11-NEXT:    scratch_store_b32 off, v42, s33 offset:48
+; GFX11-NEXT:    scratch_store_b32 off, v43, s33 offset:44
+; GFX11-NEXT:    scratch_store_b32 off, v44, s33 offset:40
+; GFX11-NEXT:    scratch_store_b32 off, v45, s33 offset:36
+; GFX11-NEXT:    scratch_store_b32 off, v46, s33 offset:32
+; GFX11-NEXT:    scratch_store_b32 off, v47, s33 offset:28
+; GFX11-NEXT:    scratch_store_b32 off, v56, s33 offset:24
+; GFX11-NEXT:    scratch_store_b32 off, v57, s33 offset:20
+; GFX11-NEXT:    scratch_store_b32 off, v58, s33 offset:16
+; GFX11-NEXT:    scratch_store_b32 off, v59, s33 offset:12
+; GFX11-NEXT:    scratch_store_b32 off, v60, s33 offset:8
+; GFX11-NEXT:    scratch_store_b32 off, v61, s33 offset:4
+; GFX11-NEXT:    scratch_store_b32 off, v62, s33
 ; GFX11-NEXT:    s_add_i32 s0, s32, 0xa0
 ; GFX11-NEXT:    s_add_i32 s1, s32, 0x90
 ; GFX11-NEXT:    scratch_store_b128 off, v[0:3], s32
@@ -3224,7 +3227,7 @@ define amdgpu_gfx void @call_72xi32() #1 {
 ; GFX11-NEXT:    s_add_i32 s0, s32, 32
 ; GFX11-NEXT:    s_add_i32 s1, s32, 16
 ; GFX11-NEXT:    s_add_i32 s2, s33, 0x200
-; GFX11-NEXT:    v_writelane_b32 v60, s30, 0
+; GFX11-NEXT:    v_writelane_b32 v63, s30, 0
 ; GFX11-NEXT:    scratch_store_b128 off, v[0:3], s0
 ; GFX11-NEXT:    scratch_store_b128 off, v[0:3], s1
 ; GFX11-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v3, 0
@@ -3245,7 +3248,7 @@ define amdgpu_gfx void @call_72xi32() #1 {
 ; GFX11-NEXT:    v_dual_mov_b32 v31, 0 :: v_dual_mov_b32 v30, 0
 ; GFX11-NEXT:    s_mov_b32 s1, return_72xi32@abs32@hi
 ; GFX11-NEXT:    s_mov_b32 s0, return_72xi32@abs32@lo
-; GFX11-NEXT:    v_writelane_b32 v60, s31, 1
+; GFX11-NEXT:    v_writelane_b32 v63, s31, 1
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_clause 0x1
 ; GFX11-NEXT:    scratch_load_b128 v[45:48], off, s33 offset:624
@@ -3267,7 +3270,8 @@ define amdgpu_gfx void @call_72xi32() #1 {
 ; GFX11-NEXT:    s_waitcnt vmcnt(2)
 ; GFX11-NEXT:    v_dual_mov_b32 v14, v1 :: v_dual_mov_b32 v1, v4
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    scratch_store_b128 off, v[16:19], s33 offset:1584 ; 16-byte Folded Spill
+; GFX11-NEXT:    v_dual_mov_b32 v62, v19 :: v_dual_mov_b32 v61, v18
+; GFX11-NEXT:    v_mov_b32_e32 v60, v17
 ; GFX11-NEXT:    s_clause 0x3
 ; GFX11-NEXT:    scratch_load_b128 v[16:19], off, s33 offset:528
 ; GFX11-NEXT:    scratch_load_b128 v[20:23], off, s33 offset:544
@@ -3285,17 +3289,18 @@ define amdgpu_gfx void @call_72xi32() #1 {
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    scratch_store_b128 off, v[28:31], s33 offset:1536 ; 16-byte Folded Spill
 ; GFX11-NEXT:    scratch_store_b128 off, v[32:35], s32
-; GFX11-NEXT:    v_dual_mov_b32 v31, v47 :: v_dual_mov_b32 v32, v36
+; GFX11-NEXT:    v_mov_b32_e32 v32, v36
 ; GFX11-NEXT:    v_dual_mov_b32 v33, v48 :: v_dual_mov_b32 v34, v49
+; GFX11-NEXT:    v_mov_b32_e32 v49, v52
 ; GFX11-NEXT:    v_dual_mov_b32 v35, v50 :: v_dual_mov_b32 v48, v51
-; GFX11-NEXT:    v_dual_mov_b32 v49, v52 :: v_dual_mov_b32 v50, v53
-; GFX11-NEXT:    v_dual_mov_b32 v51, v54 :: v_dual_mov_b32 v36, v55
-; GFX11-NEXT:    v_dual_mov_b32 v53, v41 :: v_dual_mov_b32 v52, v40
-; GFX11-NEXT:    v_dual_mov_b32 v54, v42 :: v_dual_mov_b32 v41, v56
-; GFX11-NEXT:    v_dual_mov_b32 v55, v43 :: v_dual_mov_b32 v40, v44
-; GFX11-NEXT:    v_dual_mov_b32 v42, v57 :: v_dual_mov_b32 v57, v12
+; GFX11-NEXT:    v_dual_mov_b32 v50, v53 :: v_dual_mov_b32 v51, v54
+; GFX11-NEXT:    v_mov_b32_e32 v36, v55
+; GFX11-NEXT:    v_dual_mov_b32 v52, v40 :: v_dual_mov_b32 v53, v41
+; GFX11-NEXT:    v_dual_mov_b32 v54, v42 :: v_dual_mov_b32 v55, v43
+; GFX11-NEXT:    v_mov_b32_e32 v40, v44
+; GFX11-NEXT:    v_dual_mov_b32 v41, v56 :: v_dual_mov_b32 v42, v57
 ; GFX11-NEXT:    v_dual_mov_b32 v43, v58 :: v_dual_mov_b32 v56, v59
-; GFX11-NEXT:    v_mov_b32_e32 v58, v13
+; GFX11-NEXT:    v_dual_mov_b32 v57, v12 :: v_dual_mov_b32 v58, v13
 ; GFX11-NEXT:    v_dual_mov_b32 v12, v15 :: v_dual_mov_b32 v13, v0
 ; GFX11-NEXT:    v_dual_mov_b32 v15, v2 :: v_dual_mov_b32 v0, v3
 ; GFX11-NEXT:    v_dual_mov_b32 v2, v5 :: v_dual_mov_b32 v3, v6
@@ -3310,57 +3315,58 @@ define amdgpu_gfx void @call_72xi32() #1 {
 ; GFX11-NEXT:    scratch_store_b128 off, v[0:3], s2
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 24
 ; GFX11-NEXT:    s_add_i32 s2, s32, 0x70
-; GFX11-NEXT:    v_mov_b32_e32 v6, v17
+; GFX11-NEXT:    v_mov_b32_e32 v2, v60
 ; GFX11-NEXT:    scratch_store_b128 off, v[12:15], s2
-; GFX11-NEXT:    v_mov_b32_e32 v13, v24
+; GFX11-NEXT:    v_mov_b32_e32 v15, v26
 ; GFX11-NEXT:    s_add_i32 s2, s32, 0x6c
-; GFX11-NEXT:    v_mov_b32_e32 v7, v18
+; GFX11-NEXT:    v_dual_mov_b32 v4, v62 :: v_dual_mov_b32 v13, v24
 ; GFX11-NEXT:    scratch_store_b32 off, v0, s2
 ; GFX11-NEXT:    s_add_i32 s2, s32, 0x60
-; GFX11-NEXT:    v_dual_mov_b32 v8, v19 :: v_dual_mov_b32 v15, v26
+; GFX11-NEXT:    v_dual_mov_b32 v6, v17 :: v_dual_mov_b32 v31, v47
 ; GFX11-NEXT:    scratch_store_b96 off, v[56:58], s2
 ; GFX11-NEXT:    s_add_i32 s2, s32, 0x50
-; GFX11-NEXT:    v_dual_mov_b32 v12, v23 :: v_dual_mov_b32 v29, v45
+; GFX11-NEXT:    v_mov_b32_e32 v7, v18
 ; GFX11-NEXT:    scratch_store_b128 off, v[40:43], s2
 ; GFX11-NEXT:    s_add_i32 s2, s32, 64
-; GFX11-NEXT:    v_mov_b32_e32 v14, v25
+; GFX11-NEXT:    v_dual_mov_b32 v8, v19 :: v_dual_mov_b32 v29, v45
 ; GFX11-NEXT:    scratch_store_b128 off, v[52:55], s2
 ; GFX11-NEXT:    s_add_i32 s2, s32, 48
-; GFX11-NEXT:    v_mov_b32_e32 v16, v27
+; GFX11-NEXT:    v_mov_b32_e32 v12, v23
 ; GFX11-NEXT:    scratch_store_b128 off, v[36:39], s2
 ; GFX11-NEXT:    s_add_i32 s2, s32, 32
-; GFX11-NEXT:    v_mov_b32_e32 v30, v46
+; GFX11-NEXT:    v_mov_b32_e32 v14, v25
 ; GFX11-NEXT:    scratch_store_b128 off, v[48:51], s2
 ; GFX11-NEXT:    s_add_i32 s2, s32, 16
+; GFX11-NEXT:    v_mov_b32_e32 v16, v27
 ; GFX11-NEXT:    scratch_store_b128 off, v[32:35], s2
-; GFX11-NEXT:    scratch_load_b128 v[1:4], off, s33 offset:1584 ; 16-byte Folded Reload
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_mov_b32_e32 v1, 42
 ; GFX11-NEXT:    s_clause 0x2
 ; GFX11-NEXT:    scratch_load_b128 v[17:20], off, s33 offset:1568
 ; GFX11-NEXT:    scratch_load_b128 v[21:24], off, s33 offset:1552
 ; GFX11-NEXT:    scratch_load_b128 v[25:28], off, s33 offset:1536
 ; GFX11-NEXT:    s_add_i32 s2, s33, 0x400
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-NEXT:    v_dual_mov_b32 v3, v61 :: v_dual_mov_b32 v30, v46
+; GFX11-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, 42
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT:    s_clause 0xb
-; GFX11-NEXT:    scratch_load_b32 v59, off, s33
-; GFX11-NEXT:    scratch_load_b32 v58, off, s33 offset:4
-; GFX11-NEXT:    scratch_load_b32 v57, off, s33 offset:8
-; GFX11-NEXT:    scratch_load_b32 v56, off, s33 offset:12
-; GFX11-NEXT:    scratch_load_b32 v47, off, s33 offset:16
-; GFX11-NEXT:    scratch_load_b32 v46, off, s33 offset:20
-; GFX11-NEXT:    scratch_load_b32 v45, off, s33 offset:24
-; GFX11-NEXT:    scratch_load_b32 v44, off, s33 offset:28
-; GFX11-NEXT:    scratch_load_b32 v43, off, s33 offset:32
-; GFX11-NEXT:    scratch_load_b32 v42, off, s33 offset:36
-; GFX11-NEXT:    scratch_load_b32 v41, off, s33 offset:40
-; GFX11-NEXT:    scratch_load_b32 v40, off, s33 offset:44
-; GFX11-NEXT:    v_readlane_b32 s31, v60, 1
-; GFX11-NEXT:    v_readlane_b32 s30, v60, 0
+; GFX11-NEXT:    s_clause 0xe
+; GFX11-NEXT:    scratch_load_b32 v62, off, s33
+; GFX11-NEXT:    scratch_load_b32 v61, off, s33 offset:4
+; GFX11-NEXT:    scratch_load_b32 v60, off, s33 offset:8
+; GFX11-NEXT:    scratch_load_b32 v59, off, s33 offset:12
+; GFX11-NEXT:    scratch_load_b32 v58, off, s33 offset:16
+; GFX11-NEXT:    scratch_load_b32 v57, off, s33 offset:20
+; GFX11-NEXT:    scratch_load_b32 v56, off, s33 offset:24
+; GFX11-NEXT:    scratch_load_b32 v47, off, s33 offset:28
+; GFX11-NEXT:    scratch_load_b32 v46, off, s33 offset:32
+; GFX11-NEXT:    scratch_load_b32 v45, off, s33 offset:36
+; GFX11-NEXT:    scratch_load_b32 v44, off, s33 offset:40
+; GFX11-NEXT:    scratch_load_b32 v43, off, s33 offset:44
+; GFX11-NEXT:    scratch_load_b32 v42, off, s33 offset:48
+; GFX11-NEXT:    scratch_load_b32 v41, off, s33 offset:52
+; GFX11-NEXT:    scratch_load_b32 v40, off, s33 offset:56
+; GFX11-NEXT:    v_readlane_b32 s31, v63, 1
+; GFX11-NEXT:    v_readlane_b32 s30, v63, 0
 ; GFX11-NEXT:    s_or_saveexec_b32 s0, -1
-; GFX11-NEXT:    scratch_load_b32 v60, off, s33 offset:1600 ; 4-byte Folded Reload
+; GFX11-NEXT:    scratch_load_b32 v63, off, s33 offset:1584 ; 4-byte Folded Reload
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX11-NEXT:    s_addk_i32 s32, 0xf600
 ; GFX11-NEXT:    s_mov_b32 s33, s34
Lines changed: 59 additions & 0 deletions
@@ -0,0 +1,59 @@
+# XFAIL: *
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -simplify-mir -start-before=greedy,2 -stress-regalloc=4 -stop-before=virtregrewriter,2 -filetype=null -verify-regalloc %s
+
+# This test is similar to
+# inflated-reg-class-snippet-copy-use-after-free.mir, except it is
+# still broken when the use instruction does not read the full set of
+# lanes
+
+--- |
+  define amdgpu_kernel void @inflated_reg_class_copy_use_after_free_lane_subset() {
+    ret void
+  }
+...
+---
+name:            inflated_reg_class_copy_use_after_free_lane_subset
+tracksRegLiveness: true
+machineFunctionInfo:
+  explicitKernArgSize: 8
+  maxKernArgAlign: 8
+  isEntryFunction: true
+  memoryBound:     true
+  waveLimiter:     true
+  scratchRSrcReg:  '$sgpr72_sgpr73_sgpr74_sgpr75'
+  stackPtrOffsetReg: '$sgpr32'
+  returnsVoid:     true
+  occupancy:       7
+  vgprForAGPRCopy: '$vgpr255'
+  sgprForEXECCopy: '$sgpr74_sgpr75'
+  longBranchReservedReg: ''
+body:             |
+  bb.0:
+    liveins: $vgpr0, $sgpr4_sgpr5
+
+    %0:vgpr_32 = IMPLICIT_DEF
+    renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed undef renamable $sgpr4_sgpr5, 0, 0 :: (load (s64), addrspace 4)
+    S_NOP 0, implicit-def undef %1.sub12_sub13_sub14_sub15:vreg_512_align2
+    S_NOP 0, implicit-def %1.sub8_sub9_sub10_sub11:vreg_512_align2
+    S_NOP 0, implicit-def %1.sub4_sub5_sub6_sub7:vreg_512_align2
+    S_NOP 0, implicit-def %1.sub0_sub1_sub2_sub3:vreg_512_align2
+    S_NOP 0, implicit-def early-clobber %2:vreg_512_align2, implicit %1.sub0_sub1_sub2_sub3, implicit %1.sub4_sub5_sub6_sub7
+    %1.sub2:vreg_512_align2 = COPY %2.sub3
+    %1.sub3:vreg_512_align2 = COPY %2.sub2
+    %1.sub4:vreg_512_align2 = COPY %2.sub0
+    %1.sub5:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec
+    %1.sub6:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec
+    %1.sub7:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec
+    %1.sub8:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec
+    %1.sub9:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec
+    %1.sub10:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec
+    %1.sub11:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec
+    %1.sub12:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec
+    %1.sub13:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec
+    %1.sub14:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec
+    %1.sub15:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec
+    S_NOP 0, implicit-def %1:vreg_512_align2, implicit %1.sub0_sub1_sub2_sub3, implicit %1.sub4_sub5_sub6_sub7, implicit %1.sub8_sub9_sub10_sub11
+    GLOBAL_STORE_DWORDX4_SADDR undef %3:vgpr_32, %1.sub12_sub13_sub14_sub15, undef renamable $sgpr0_sgpr1, 96, 0, implicit $exec :: (store (s128), addrspace 1)
+    S_ENDPGM 0
+
+...
