Skip to content

AMDGPU: Mark v_mov_b64_pseudo as a VOP1 instruction #128677

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
Feb 25, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions llvm/lib/Target/AMDGPU/SIInstructions.td
Original file line number Diff line number Diff line change
Expand Up @@ -136,6 +136,7 @@ def V_MOV_B64_PSEUDO : VPseudoInstSI <(outs VReg_64:$vdst),
let isMoveImm = 1;
let SchedRW = [Write64Bit];
let Size = 4;
let VOP1 = 1; // Not entirely correct, but close enough.
let UseNamedOperandTable = 1;
}

Expand Down
22 changes: 21 additions & 1 deletion llvm/test/CodeGen/AMDGPU/remat-sop.mir
Original file line number Diff line number Diff line change
Expand Up @@ -653,4 +653,24 @@ body: |
S_ENDPGM 0
...


---
name: test_remat_s_mov_b64_imm_pseudo
tracksRegLiveness: true
body: |
bb.0:
; GCN-LABEL: name: test_remat_s_mov_b64_imm_pseudo
; GCN: renamable $sgpr0_sgpr1 = S_MOV_B64_IMM_PSEUDO 1
; GCN-NEXT: renamable $sgpr2_sgpr3 = S_MOV_B64_IMM_PSEUDO 2
; GCN-NEXT: S_NOP 0, implicit killed renamable $sgpr0_sgpr1
; GCN-NEXT: S_NOP 0, implicit killed renamable $sgpr2_sgpr3
; GCN-NEXT: renamable $sgpr0_sgpr1 = S_MOV_B64_IMM_PSEUDO 3
; GCN-NEXT: S_NOP 0, implicit killed renamable $sgpr0_sgpr1
; GCN-NEXT: S_ENDPGM 0
%0:sgpr_64 = S_MOV_B64_IMM_PSEUDO 1
%1:sgpr_64 = S_MOV_B64_IMM_PSEUDO 2
%2:sgpr_64 = S_MOV_B64_IMM_PSEUDO 3
S_NOP 0, implicit %0
S_NOP 0, implicit %1
S_NOP 0, implicit %2
S_ENDPGM 0
...
78 changes: 34 additions & 44 deletions llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll
Original file line number Diff line number Diff line change
Expand Up @@ -34,10 +34,9 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
; GLOBALNESS1-NEXT: s_mov_b64 s[36:37], s[6:7]
; GLOBALNESS1-NEXT: s_load_dwordx4 s[76:79], s[8:9], 0x0
; GLOBALNESS1-NEXT: s_load_dword s6, s[8:9], 0x14
; GLOBALNESS1-NEXT: v_mov_b32_e32 v41, v0
; GLOBALNESS1-NEXT: v_mov_b32_e32 v42, 0
; GLOBALNESS1-NEXT: v_pk_mov_b32 v[0:1], 0, 0
; GLOBALNESS1-NEXT: global_store_dword v[0:1], v42, off
; GLOBALNESS1-NEXT: v_pk_mov_b32 v[44:45], 0, 0
; GLOBALNESS1-NEXT: global_store_dword v[44:45], v42, off
; GLOBALNESS1-NEXT: s_waitcnt lgkmcnt(0)
; GLOBALNESS1-NEXT: global_load_dword v2, v42, s[76:77]
; GLOBALNESS1-NEXT: s_mov_b64 s[40:41], s[4:5]
Expand All @@ -46,6 +45,7 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
; GLOBALNESS1-NEXT: s_add_u32 flat_scratch_lo, s12, s17
; GLOBALNESS1-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GLOBALNESS1-NEXT: s_add_u32 s0, s0, s17
; GLOBALNESS1-NEXT: v_mov_b32_e32 v41, v0
; GLOBALNESS1-NEXT: v_mov_b32_e32 v0, 0
; GLOBALNESS1-NEXT: s_addc_u32 s1, s1, 0
; GLOBALNESS1-NEXT: v_mov_b32_e32 v1, 0x40994400
Expand Down Expand Up @@ -73,13 +73,15 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[52:53], 1, v0
; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[44:45], 1, v1
; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[46:47], 1, v3
; GLOBALNESS1-NEXT: v_mov_b32_e32 v46, 0x80
; GLOBALNESS1-NEXT: s_mov_b32 s70, s16
; GLOBALNESS1-NEXT: s_mov_b64 s[38:39], s[8:9]
; GLOBALNESS1-NEXT: s_mov_b32 s71, s15
; GLOBALNESS1-NEXT: s_mov_b32 s72, s14
; GLOBALNESS1-NEXT: s_mov_b64 s[34:35], s[10:11]
; GLOBALNESS1-NEXT: v_mov_b32_e32 v47, 0
; GLOBALNESS1-NEXT: s_mov_b32 s32, 0
; GLOBALNESS1-NEXT: ; implicit-def: $vgpr44_vgpr45
; GLOBALNESS1-NEXT: ; implicit-def: $vgpr56_vgpr57
; GLOBALNESS1-NEXT: s_waitcnt vmcnt(0)
; GLOBALNESS1-NEXT: v_cmp_gt_i32_e32 vcc, 0, v2
; GLOBALNESS1-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
Expand All @@ -106,17 +108,15 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
; GLOBALNESS1-NEXT: .LBB1_3: ; %Flow28
; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1
; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[6:7]
; GLOBALNESS1-NEXT: v_pk_mov_b32 v[44:45], v[0:1], v[0:1] op_sel:[0,1]
; GLOBALNESS1-NEXT: v_pk_mov_b32 v[56:57], v[0:1], v[0:1] op_sel:[0,1]
; GLOBALNESS1-NEXT: s_cbranch_vccnz .LBB1_30
; GLOBALNESS1-NEXT: .LBB1_4: ; %bb5
; GLOBALNESS1-NEXT: ; =>This Loop Header: Depth=1
; GLOBALNESS1-NEXT: ; Child Loop BB1_16 Depth 2
; GLOBALNESS1-NEXT: v_mov_b32_e32 v0, 0x80
; GLOBALNESS1-NEXT: v_mov_b32_e32 v1, 0
; GLOBALNESS1-NEXT: flat_load_dword v40, v[0:1]
; GLOBALNESS1-NEXT: flat_load_dword v40, v[46:47]
; GLOBALNESS1-NEXT: s_add_u32 s8, s38, 40
; GLOBALNESS1-NEXT: buffer_store_dword v42, off, s[0:3], 0
; GLOBALNESS1-NEXT: flat_load_dword v46, v[0:1]
; GLOBALNESS1-NEXT: flat_load_dword v58, v[46:47]
; GLOBALNESS1-NEXT: s_addc_u32 s9, s39, 0
; GLOBALNESS1-NEXT: s_getpc_b64 s[4:5]
; GLOBALNESS1-NEXT: s_add_u32 s4, s4, wobble@gotpcrel32@lo+4
Expand Down Expand Up @@ -160,8 +160,7 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
; GLOBALNESS1-NEXT: s_cbranch_vccz .LBB1_24
; GLOBALNESS1-NEXT: ; %bb.10: ; %baz.exit.i
; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1
; GLOBALNESS1-NEXT: v_pk_mov_b32 v[2:3], 0, 0
; GLOBALNESS1-NEXT: flat_load_dword v0, v[2:3]
; GLOBALNESS1-NEXT: flat_load_dword v0, v[44:45]
; GLOBALNESS1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GLOBALNESS1-NEXT: v_cmp_gt_i32_e64 s[62:63], 0, v0
; GLOBALNESS1-NEXT: v_mov_b32_e32 v0, 0
Expand All @@ -170,17 +169,16 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
; GLOBALNESS1-NEXT: s_cbranch_execz .LBB1_26
; GLOBALNESS1-NEXT: ; %bb.11: ; %bb33.i
; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1
; GLOBALNESS1-NEXT: global_load_dwordx2 v[0:1], v[2:3], off
; GLOBALNESS1-NEXT: global_load_dwordx2 v[0:1], v[44:45], off
; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[54:55]
; GLOBALNESS1-NEXT: s_cbranch_vccnz .LBB1_13
; GLOBALNESS1-NEXT: ; %bb.12: ; %bb39.i
; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1
; GLOBALNESS1-NEXT: v_mov_b32_e32 v43, v42
; GLOBALNESS1-NEXT: v_pk_mov_b32 v[2:3], 0, 0
; GLOBALNESS1-NEXT: global_store_dwordx2 v[2:3], v[42:43], off
; GLOBALNESS1-NEXT: global_store_dwordx2 v[44:45], v[42:43], off
; GLOBALNESS1-NEXT: .LBB1_13: ; %bb44.lr.ph.i
; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1
; GLOBALNESS1-NEXT: v_cmp_ne_u32_e32 vcc, 0, v46
; GLOBALNESS1-NEXT: v_cmp_ne_u32_e32 vcc, 0, v58
; GLOBALNESS1-NEXT: v_cndmask_b32_e32 v2, 0, v40, vcc
; GLOBALNESS1-NEXT: s_waitcnt vmcnt(0)
; GLOBALNESS1-NEXT: v_cmp_nlt_f64_e32 vcc, 0, v[0:1]
Expand Down Expand Up @@ -237,7 +235,6 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
; GLOBALNESS1-NEXT: v_mov_b32_e32 v31, v41
; GLOBALNESS1-NEXT: s_waitcnt lgkmcnt(0)
; GLOBALNESS1-NEXT: s_swappc_b64 s[30:31], s[76:77]
; GLOBALNESS1-NEXT: v_pk_mov_b32 v[46:47], 0, 0
; GLOBALNESS1-NEXT: s_mov_b64 s[4:5], s[40:41]
; GLOBALNESS1-NEXT: s_mov_b64 s[6:7], s[36:37]
; GLOBALNESS1-NEXT: s_mov_b64 s[8:9], s[68:69]
Expand All @@ -246,14 +243,14 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
; GLOBALNESS1-NEXT: s_mov_b32 s13, s71
; GLOBALNESS1-NEXT: s_mov_b32 s14, s70
; GLOBALNESS1-NEXT: v_mov_b32_e32 v31, v41
; GLOBALNESS1-NEXT: global_store_dwordx2 v[46:47], v[44:45], off
; GLOBALNESS1-NEXT: global_store_dwordx2 v[44:45], v[56:57], off
; GLOBALNESS1-NEXT: s_swappc_b64 s[30:31], s[76:77]
; GLOBALNESS1-NEXT: s_and_saveexec_b64 s[4:5], s[64:65]
; GLOBALNESS1-NEXT: s_cbranch_execz .LBB1_14
; GLOBALNESS1-NEXT: ; %bb.23: ; %bb62.i
; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_16 Depth=2
; GLOBALNESS1-NEXT: v_mov_b32_e32 v43, v42
; GLOBALNESS1-NEXT: global_store_dwordx2 v[46:47], v[42:43], off
; GLOBALNESS1-NEXT: global_store_dwordx2 v[44:45], v[42:43], off
; GLOBALNESS1-NEXT: s_branch .LBB1_14
; GLOBALNESS1-NEXT: .LBB1_24: ; in Loop: Header=BB1_4 Depth=1
; GLOBALNESS1-NEXT: s_mov_b64 s[6:7], -1
Expand All @@ -274,14 +271,12 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
; GLOBALNESS1-NEXT: ; %bb.28: ; %bb69.i
; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1
; GLOBALNESS1-NEXT: v_mov_b32_e32 v43, v42
; GLOBALNESS1-NEXT: v_pk_mov_b32 v[2:3], 0, 0
; GLOBALNESS1-NEXT: global_store_dwordx2 v[2:3], v[42:43], off
; GLOBALNESS1-NEXT: global_store_dwordx2 v[44:45], v[42:43], off
; GLOBALNESS1-NEXT: s_branch .LBB1_1
; GLOBALNESS1-NEXT: .LBB1_29: ; %bb73.i
; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1
; GLOBALNESS1-NEXT: v_mov_b32_e32 v43, v42
; GLOBALNESS1-NEXT: v_pk_mov_b32 v[2:3], 0, 0
; GLOBALNESS1-NEXT: global_store_dwordx2 v[2:3], v[42:43], off
; GLOBALNESS1-NEXT: global_store_dwordx2 v[44:45], v[42:43], off
; GLOBALNESS1-NEXT: s_branch .LBB1_2
; GLOBALNESS1-NEXT: .LBB1_30: ; %loop.exit.guard
; GLOBALNESS1-NEXT: s_andn2_b64 vcc, exec, s[4:5]
Expand Down Expand Up @@ -326,10 +321,9 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
; GLOBALNESS0-NEXT: s_mov_b64 s[36:37], s[6:7]
; GLOBALNESS0-NEXT: s_load_dwordx4 s[72:75], s[8:9], 0x0
; GLOBALNESS0-NEXT: s_load_dword s6, s[8:9], 0x14
; GLOBALNESS0-NEXT: v_mov_b32_e32 v41, v0
; GLOBALNESS0-NEXT: v_mov_b32_e32 v42, 0
; GLOBALNESS0-NEXT: v_pk_mov_b32 v[0:1], 0, 0
; GLOBALNESS0-NEXT: global_store_dword v[0:1], v42, off
; GLOBALNESS0-NEXT: v_pk_mov_b32 v[44:45], 0, 0
; GLOBALNESS0-NEXT: global_store_dword v[44:45], v42, off
; GLOBALNESS0-NEXT: s_waitcnt lgkmcnt(0)
; GLOBALNESS0-NEXT: global_load_dword v2, v42, s[72:73]
; GLOBALNESS0-NEXT: s_mov_b64 s[40:41], s[4:5]
Expand All @@ -338,6 +332,7 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
; GLOBALNESS0-NEXT: s_add_u32 flat_scratch_lo, s12, s17
; GLOBALNESS0-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GLOBALNESS0-NEXT: s_add_u32 s0, s0, s17
; GLOBALNESS0-NEXT: v_mov_b32_e32 v41, v0
; GLOBALNESS0-NEXT: v_mov_b32_e32 v0, 0
; GLOBALNESS0-NEXT: s_addc_u32 s1, s1, 0
; GLOBALNESS0-NEXT: v_mov_b32_e32 v1, 0x40994400
Expand Down Expand Up @@ -365,13 +360,15 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[52:53], 1, v0
; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[44:45], 1, v1
; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[46:47], 1, v3
; GLOBALNESS0-NEXT: v_mov_b32_e32 v46, 0x80
; GLOBALNESS0-NEXT: s_mov_b32 s68, s16
; GLOBALNESS0-NEXT: s_mov_b64 s[38:39], s[8:9]
; GLOBALNESS0-NEXT: s_mov_b32 s69, s15
; GLOBALNESS0-NEXT: s_mov_b32 s70, s14
; GLOBALNESS0-NEXT: s_mov_b64 s[34:35], s[10:11]
; GLOBALNESS0-NEXT: v_mov_b32_e32 v47, 0
; GLOBALNESS0-NEXT: s_mov_b32 s32, 0
; GLOBALNESS0-NEXT: ; implicit-def: $vgpr44_vgpr45
; GLOBALNESS0-NEXT: ; implicit-def: $vgpr56_vgpr57
; GLOBALNESS0-NEXT: s_waitcnt vmcnt(0)
; GLOBALNESS0-NEXT: v_cmp_gt_i32_e32 vcc, 0, v2
; GLOBALNESS0-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
Expand All @@ -398,17 +395,15 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
; GLOBALNESS0-NEXT: .LBB1_3: ; %Flow28
; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1
; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[6:7]
; GLOBALNESS0-NEXT: v_pk_mov_b32 v[44:45], v[0:1], v[0:1] op_sel:[0,1]
; GLOBALNESS0-NEXT: v_pk_mov_b32 v[56:57], v[0:1], v[0:1] op_sel:[0,1]
; GLOBALNESS0-NEXT: s_cbranch_vccnz .LBB1_30
; GLOBALNESS0-NEXT: .LBB1_4: ; %bb5
; GLOBALNESS0-NEXT: ; =>This Loop Header: Depth=1
; GLOBALNESS0-NEXT: ; Child Loop BB1_16 Depth 2
; GLOBALNESS0-NEXT: v_mov_b32_e32 v0, 0x80
; GLOBALNESS0-NEXT: v_mov_b32_e32 v1, 0
; GLOBALNESS0-NEXT: flat_load_dword v40, v[0:1]
; GLOBALNESS0-NEXT: flat_load_dword v40, v[46:47]
; GLOBALNESS0-NEXT: s_add_u32 s8, s38, 40
; GLOBALNESS0-NEXT: buffer_store_dword v42, off, s[0:3], 0
; GLOBALNESS0-NEXT: flat_load_dword v46, v[0:1]
; GLOBALNESS0-NEXT: flat_load_dword v58, v[46:47]
; GLOBALNESS0-NEXT: s_addc_u32 s9, s39, 0
; GLOBALNESS0-NEXT: s_getpc_b64 s[4:5]
; GLOBALNESS0-NEXT: s_add_u32 s4, s4, wobble@gotpcrel32@lo+4
Expand Down Expand Up @@ -452,8 +447,7 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
; GLOBALNESS0-NEXT: s_cbranch_vccz .LBB1_24
; GLOBALNESS0-NEXT: ; %bb.10: ; %baz.exit.i
; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1
; GLOBALNESS0-NEXT: v_pk_mov_b32 v[2:3], 0, 0
; GLOBALNESS0-NEXT: flat_load_dword v0, v[2:3]
; GLOBALNESS0-NEXT: flat_load_dword v0, v[44:45]
; GLOBALNESS0-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GLOBALNESS0-NEXT: v_cmp_gt_i32_e64 s[62:63], 0, v0
; GLOBALNESS0-NEXT: v_mov_b32_e32 v0, 0
Expand All @@ -462,17 +456,16 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
; GLOBALNESS0-NEXT: s_cbranch_execz .LBB1_26
; GLOBALNESS0-NEXT: ; %bb.11: ; %bb33.i
; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1
; GLOBALNESS0-NEXT: global_load_dwordx2 v[0:1], v[2:3], off
; GLOBALNESS0-NEXT: global_load_dwordx2 v[0:1], v[44:45], off
; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[54:55]
; GLOBALNESS0-NEXT: s_cbranch_vccnz .LBB1_13
; GLOBALNESS0-NEXT: ; %bb.12: ; %bb39.i
; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1
; GLOBALNESS0-NEXT: v_mov_b32_e32 v43, v42
; GLOBALNESS0-NEXT: v_pk_mov_b32 v[2:3], 0, 0
; GLOBALNESS0-NEXT: global_store_dwordx2 v[2:3], v[42:43], off
; GLOBALNESS0-NEXT: global_store_dwordx2 v[44:45], v[42:43], off
; GLOBALNESS0-NEXT: .LBB1_13: ; %bb44.lr.ph.i
; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1
; GLOBALNESS0-NEXT: v_cmp_ne_u32_e32 vcc, 0, v46
; GLOBALNESS0-NEXT: v_cmp_ne_u32_e32 vcc, 0, v58
; GLOBALNESS0-NEXT: v_cndmask_b32_e32 v2, 0, v40, vcc
; GLOBALNESS0-NEXT: s_waitcnt vmcnt(0)
; GLOBALNESS0-NEXT: v_cmp_nlt_f64_e32 vcc, 0, v[0:1]
Expand Down Expand Up @@ -529,7 +522,6 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
; GLOBALNESS0-NEXT: v_mov_b32_e32 v31, v41
; GLOBALNESS0-NEXT: s_waitcnt lgkmcnt(0)
; GLOBALNESS0-NEXT: s_swappc_b64 s[30:31], s[78:79]
; GLOBALNESS0-NEXT: v_pk_mov_b32 v[46:47], 0, 0
; GLOBALNESS0-NEXT: s_mov_b64 s[4:5], s[40:41]
; GLOBALNESS0-NEXT: s_mov_b64 s[6:7], s[36:37]
; GLOBALNESS0-NEXT: s_mov_b64 s[8:9], s[72:73]
Expand All @@ -538,14 +530,14 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
; GLOBALNESS0-NEXT: s_mov_b32 s13, s69
; GLOBALNESS0-NEXT: s_mov_b32 s14, s68
; GLOBALNESS0-NEXT: v_mov_b32_e32 v31, v41
; GLOBALNESS0-NEXT: global_store_dwordx2 v[46:47], v[44:45], off
; GLOBALNESS0-NEXT: global_store_dwordx2 v[44:45], v[56:57], off
; GLOBALNESS0-NEXT: s_swappc_b64 s[30:31], s[78:79]
; GLOBALNESS0-NEXT: s_and_saveexec_b64 s[4:5], s[64:65]
; GLOBALNESS0-NEXT: s_cbranch_execz .LBB1_14
; GLOBALNESS0-NEXT: ; %bb.23: ; %bb62.i
; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_16 Depth=2
; GLOBALNESS0-NEXT: v_mov_b32_e32 v43, v42
; GLOBALNESS0-NEXT: global_store_dwordx2 v[46:47], v[42:43], off
; GLOBALNESS0-NEXT: global_store_dwordx2 v[44:45], v[42:43], off
; GLOBALNESS0-NEXT: s_branch .LBB1_14
; GLOBALNESS0-NEXT: .LBB1_24: ; in Loop: Header=BB1_4 Depth=1
; GLOBALNESS0-NEXT: s_mov_b64 s[6:7], -1
Expand All @@ -566,14 +558,12 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
; GLOBALNESS0-NEXT: ; %bb.28: ; %bb69.i
; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1
; GLOBALNESS0-NEXT: v_mov_b32_e32 v43, v42
; GLOBALNESS0-NEXT: v_pk_mov_b32 v[2:3], 0, 0
; GLOBALNESS0-NEXT: global_store_dwordx2 v[2:3], v[42:43], off
; GLOBALNESS0-NEXT: global_store_dwordx2 v[44:45], v[42:43], off
; GLOBALNESS0-NEXT: s_branch .LBB1_1
; GLOBALNESS0-NEXT: .LBB1_29: ; %bb73.i
; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1
; GLOBALNESS0-NEXT: v_mov_b32_e32 v43, v42
; GLOBALNESS0-NEXT: v_pk_mov_b32 v[2:3], 0, 0
; GLOBALNESS0-NEXT: global_store_dwordx2 v[2:3], v[42:43], off
; GLOBALNESS0-NEXT: global_store_dwordx2 v[44:45], v[42:43], off
; GLOBALNESS0-NEXT: s_branch .LBB1_2
; GLOBALNESS0-NEXT: .LBB1_30: ; %loop.exit.guard
; GLOBALNESS0-NEXT: s_andn2_b64 vcc, exec, s[4:5]
Expand Down
48 changes: 46 additions & 2 deletions llvm/test/CodeGen/AMDGPU/vgpr-remat.mir
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,10 @@
# Check that we get two move-immediates into %1 and %2, instead of a copy from
# %1 to %2, because that would introduce a dependency and maybe a stall.
---
name: f
name: remat_v_mov_b32_e32
tracksRegLiveness: true
body: |
; CHECK-LABEL: name: f
; CHECK-LABEL: name: remat_v_mov_b32_e32
; CHECK: bb.0:
; CHECK-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000)
; CHECK-NEXT: liveins: $sgpr0
Expand Down Expand Up @@ -46,3 +46,47 @@ body: |
%4.sub1:vreg_96 = COPY %2:vgpr_32
S_ENDPGM 0, implicit %4
...

---
name: remat_v_mov_b64_pseudo
tracksRegLiveness: true
body: |
; CHECK-LABEL: name: remat_v_mov_b64_pseudo
; CHECK: bb.0:
; CHECK-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000)
; CHECK-NEXT: liveins: $sgpr0
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: undef [[V_MOV_B:%[0-9]+]].sub0_sub1:vreg_192_align2 = V_MOV_B64_PSEUDO 0, implicit $exec
; CHECK-NEXT: [[V_MOV_B:%[0-9]+]].sub2_sub3:vreg_192_align2 = V_MOV_B64_PSEUDO 0, implicit $exec
; CHECK-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
; CHECK-NEXT: $exec = S_MOV_B64_term [[COPY]]
; CHECK-NEXT: S_CBRANCH_EXECZ %bb.2, implicit $exec
; CHECK-NEXT: S_BRANCH %bb.1
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.1:
; CHECK-NEXT: successors: %bb.2(0x80000000)
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[V_MOV_B:%[0-9]+]].sub0_sub1:vreg_192_align2 = V_MUL_F64_e64 0, [[V_MOV_B]].sub0_sub1, 0, [[V_MOV_B]].sub0_sub1, 0, 0, implicit $mode, implicit $exec
; CHECK-NEXT: [[V_MOV_B:%[0-9]+]].sub2_sub3:vreg_192_align2 = V_MUL_F64_e64 0, [[V_MOV_B]].sub2_sub3, 0, [[V_MOV_B]].sub2_sub3, 0, 0, implicit $mode, implicit $exec
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.2:
; CHECK-NEXT: S_ENDPGM 0, implicit [[V_MOV_B]]
bb.0:
liveins: $sgpr0
%0:vreg_64_align2 = V_MOV_B64_PSEUDO 0, implicit $exec
%1:vreg_64_align2 = COPY %0:vreg_64_align2
%2:vreg_64_align2 = COPY %0:vreg_64_align2
%3:sreg_64 = COPY $sgpr0_sgpr1
$exec = S_MOV_B64_term %3:sreg_64
S_CBRANCH_EXECZ %bb.2, implicit $exec
S_BRANCH %bb.1

bb.1:
%1:vreg_64_align2 = V_MUL_F64_e64 0, %1:vreg_64_align2, 0, %1:vreg_64_align2, 0, 0, implicit $mode, implicit $exec
%2:vreg_64_align2 = V_MUL_F64_e64 0, %2:vreg_64_align2, 0, %2:vreg_64_align2, 0, 0, implicit $mode, implicit $exec

bb.2:
undef %4.sub0_sub1:vreg_192 = COPY %1:vreg_64_align2
%4.sub2_sub3:vreg_192 = COPY %2:vreg_64_align2
S_ENDPGM 0, implicit %4
...