Skip to content

AMDGPU: Mark v_mov_b64_pseudo as a VOP1 instruction #128677

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
Feb 25, 2025

Conversation

arsenm
Copy link
Contributor

@arsenm arsenm commented Feb 25, 2025

This is mostly true, and it tricks the rematerialization
code into handling this without special casing it.

This is mostly true, and it tricks the rematerialization
code into handling this without special casing it.
Copy link
Contributor Author

arsenm commented Feb 25, 2025

This stack of pull requests is managed by Graphite. Learn more about stacking.

@llvmbot
Copy link
Member

llvmbot commented Feb 25, 2025

@llvm/pr-subscribers-backend-amdgpu

Author: Matt Arsenault (arsenm)

Changes

This is mostly true, and it tricks the rematerialization
code into handling this without special casing it.


Full diff: https://github.com/llvm/llvm-project/pull/128677.diff

4 Files Affected:

  • (modified) llvm/lib/Target/AMDGPU/SIInstructions.td (+1)
  • (modified) llvm/test/CodeGen/AMDGPU/remat-sop.mir (+21-1)
  • (modified) llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll (+34-44)
  • (modified) llvm/test/CodeGen/AMDGPU/vgpr-remat.mir (+46-2)
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index cca49ee80a60e..6f80dbcfe5e71 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -136,6 +136,7 @@ def V_MOV_B64_PSEUDO : VPseudoInstSI <(outs VReg_64:$vdst),
   let isMoveImm = 1;
   let SchedRW = [Write64Bit];
   let Size = 4;
+  let VOP1 = 1; // Not entirely correct, but close enough.
   let UseNamedOperandTable = 1;
 }
 
diff --git a/llvm/test/CodeGen/AMDGPU/remat-sop.mir b/llvm/test/CodeGen/AMDGPU/remat-sop.mir
index 81aa3a39de42f..1da55cf535449 100644
--- a/llvm/test/CodeGen/AMDGPU/remat-sop.mir
+++ b/llvm/test/CodeGen/AMDGPU/remat-sop.mir
@@ -653,4 +653,24 @@ body:             |
     S_ENDPGM 0
 ...
 
-
+---
+name:            test_remat_s_mov_b64_imm_pseudo
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    ; GCN-LABEL: name: test_remat_s_mov_b64_imm_pseudo
+    ; GCN: renamable $sgpr0_sgpr1 = S_MOV_B64_IMM_PSEUDO 1
+    ; GCN-NEXT: renamable $sgpr2_sgpr3 = S_MOV_B64_IMM_PSEUDO 2
+    ; GCN-NEXT: S_NOP 0, implicit killed renamable $sgpr0_sgpr1
+    ; GCN-NEXT: S_NOP 0, implicit killed renamable $sgpr2_sgpr3
+    ; GCN-NEXT: renamable $sgpr0_sgpr1 = S_MOV_B64_IMM_PSEUDO 3
+    ; GCN-NEXT: S_NOP 0, implicit killed renamable $sgpr0_sgpr1
+    ; GCN-NEXT: S_ENDPGM 0
+    %0:sgpr_64 = S_MOV_B64_IMM_PSEUDO 1
+    %1:sgpr_64 = S_MOV_B64_IMM_PSEUDO 2
+    %2:sgpr_64 = S_MOV_B64_IMM_PSEUDO 3
+    S_NOP 0, implicit %0
+    S_NOP 0, implicit %1
+    S_NOP 0, implicit %2
+    S_ENDPGM 0
+...
diff --git a/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll b/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll
index dd78c2f46dde8..a6e6341914ed0 100644
--- a/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll
+++ b/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll
@@ -34,10 +34,9 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
 ; GLOBALNESS1-NEXT:    s_mov_b64 s[36:37], s[6:7]
 ; GLOBALNESS1-NEXT:    s_load_dwordx4 s[76:79], s[8:9], 0x0
 ; GLOBALNESS1-NEXT:    s_load_dword s6, s[8:9], 0x14
-; GLOBALNESS1-NEXT:    v_mov_b32_e32 v41, v0
 ; GLOBALNESS1-NEXT:    v_mov_b32_e32 v42, 0
-; GLOBALNESS1-NEXT:    v_pk_mov_b32 v[0:1], 0, 0
-; GLOBALNESS1-NEXT:    global_store_dword v[0:1], v42, off
+; GLOBALNESS1-NEXT:    v_pk_mov_b32 v[44:45], 0, 0
+; GLOBALNESS1-NEXT:    global_store_dword v[44:45], v42, off
 ; GLOBALNESS1-NEXT:    s_waitcnt lgkmcnt(0)
 ; GLOBALNESS1-NEXT:    global_load_dword v2, v42, s[76:77]
 ; GLOBALNESS1-NEXT:    s_mov_b64 s[40:41], s[4:5]
@@ -46,6 +45,7 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
 ; GLOBALNESS1-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
 ; GLOBALNESS1-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
 ; GLOBALNESS1-NEXT:    s_add_u32 s0, s0, s17
+; GLOBALNESS1-NEXT:    v_mov_b32_e32 v41, v0
 ; GLOBALNESS1-NEXT:    v_mov_b32_e32 v0, 0
 ; GLOBALNESS1-NEXT:    s_addc_u32 s1, s1, 0
 ; GLOBALNESS1-NEXT:    v_mov_b32_e32 v1, 0x40994400
@@ -73,13 +73,15 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
 ; GLOBALNESS1-NEXT:    v_cmp_ne_u32_e64 s[52:53], 1, v0
 ; GLOBALNESS1-NEXT:    v_cmp_ne_u32_e64 s[44:45], 1, v1
 ; GLOBALNESS1-NEXT:    v_cmp_ne_u32_e64 s[46:47], 1, v3
+; GLOBALNESS1-NEXT:    v_mov_b32_e32 v46, 0x80
 ; GLOBALNESS1-NEXT:    s_mov_b32 s70, s16
 ; GLOBALNESS1-NEXT:    s_mov_b64 s[38:39], s[8:9]
 ; GLOBALNESS1-NEXT:    s_mov_b32 s71, s15
 ; GLOBALNESS1-NEXT:    s_mov_b32 s72, s14
 ; GLOBALNESS1-NEXT:    s_mov_b64 s[34:35], s[10:11]
+; GLOBALNESS1-NEXT:    v_mov_b32_e32 v47, 0
 ; GLOBALNESS1-NEXT:    s_mov_b32 s32, 0
-; GLOBALNESS1-NEXT:    ; implicit-def: $vgpr44_vgpr45
+; GLOBALNESS1-NEXT:    ; implicit-def: $vgpr56_vgpr57
 ; GLOBALNESS1-NEXT:    s_waitcnt vmcnt(0)
 ; GLOBALNESS1-NEXT:    v_cmp_gt_i32_e32 vcc, 0, v2
 ; GLOBALNESS1-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
@@ -106,17 +108,15 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
 ; GLOBALNESS1-NEXT:  .LBB1_3: ; %Flow28
 ; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_4 Depth=1
 ; GLOBALNESS1-NEXT:    s_and_b64 vcc, exec, s[6:7]
-; GLOBALNESS1-NEXT:    v_pk_mov_b32 v[44:45], v[0:1], v[0:1] op_sel:[0,1]
+; GLOBALNESS1-NEXT:    v_pk_mov_b32 v[56:57], v[0:1], v[0:1] op_sel:[0,1]
 ; GLOBALNESS1-NEXT:    s_cbranch_vccnz .LBB1_30
 ; GLOBALNESS1-NEXT:  .LBB1_4: ; %bb5
 ; GLOBALNESS1-NEXT:    ; =>This Loop Header: Depth=1
 ; GLOBALNESS1-NEXT:    ; Child Loop BB1_16 Depth 2
-; GLOBALNESS1-NEXT:    v_mov_b32_e32 v0, 0x80
-; GLOBALNESS1-NEXT:    v_mov_b32_e32 v1, 0
-; GLOBALNESS1-NEXT:    flat_load_dword v40, v[0:1]
+; GLOBALNESS1-NEXT:    flat_load_dword v40, v[46:47]
 ; GLOBALNESS1-NEXT:    s_add_u32 s8, s38, 40
 ; GLOBALNESS1-NEXT:    buffer_store_dword v42, off, s[0:3], 0
-; GLOBALNESS1-NEXT:    flat_load_dword v46, v[0:1]
+; GLOBALNESS1-NEXT:    flat_load_dword v58, v[46:47]
 ; GLOBALNESS1-NEXT:    s_addc_u32 s9, s39, 0
 ; GLOBALNESS1-NEXT:    s_getpc_b64 s[4:5]
 ; GLOBALNESS1-NEXT:    s_add_u32 s4, s4, wobble@gotpcrel32@lo+4
@@ -160,8 +160,7 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
 ; GLOBALNESS1-NEXT:    s_cbranch_vccz .LBB1_24
 ; GLOBALNESS1-NEXT:  ; %bb.10: ; %baz.exit.i
 ; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_4 Depth=1
-; GLOBALNESS1-NEXT:    v_pk_mov_b32 v[2:3], 0, 0
-; GLOBALNESS1-NEXT:    flat_load_dword v0, v[2:3]
+; GLOBALNESS1-NEXT:    flat_load_dword v0, v[44:45]
 ; GLOBALNESS1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GLOBALNESS1-NEXT:    v_cmp_gt_i32_e64 s[62:63], 0, v0
 ; GLOBALNESS1-NEXT:    v_mov_b32_e32 v0, 0
@@ -170,17 +169,16 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
 ; GLOBALNESS1-NEXT:    s_cbranch_execz .LBB1_26
 ; GLOBALNESS1-NEXT:  ; %bb.11: ; %bb33.i
 ; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_4 Depth=1
-; GLOBALNESS1-NEXT:    global_load_dwordx2 v[0:1], v[2:3], off
+; GLOBALNESS1-NEXT:    global_load_dwordx2 v[0:1], v[44:45], off
 ; GLOBALNESS1-NEXT:    s_and_b64 vcc, exec, s[54:55]
 ; GLOBALNESS1-NEXT:    s_cbranch_vccnz .LBB1_13
 ; GLOBALNESS1-NEXT:  ; %bb.12: ; %bb39.i
 ; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_4 Depth=1
 ; GLOBALNESS1-NEXT:    v_mov_b32_e32 v43, v42
-; GLOBALNESS1-NEXT:    v_pk_mov_b32 v[2:3], 0, 0
-; GLOBALNESS1-NEXT:    global_store_dwordx2 v[2:3], v[42:43], off
+; GLOBALNESS1-NEXT:    global_store_dwordx2 v[44:45], v[42:43], off
 ; GLOBALNESS1-NEXT:  .LBB1_13: ; %bb44.lr.ph.i
 ; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_4 Depth=1
-; GLOBALNESS1-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v46
+; GLOBALNESS1-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v58
 ; GLOBALNESS1-NEXT:    v_cndmask_b32_e32 v2, 0, v40, vcc
 ; GLOBALNESS1-NEXT:    s_waitcnt vmcnt(0)
 ; GLOBALNESS1-NEXT:    v_cmp_nlt_f64_e32 vcc, 0, v[0:1]
@@ -237,7 +235,6 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
 ; GLOBALNESS1-NEXT:    v_mov_b32_e32 v31, v41
 ; GLOBALNESS1-NEXT:    s_waitcnt lgkmcnt(0)
 ; GLOBALNESS1-NEXT:    s_swappc_b64 s[30:31], s[76:77]
-; GLOBALNESS1-NEXT:    v_pk_mov_b32 v[46:47], 0, 0
 ; GLOBALNESS1-NEXT:    s_mov_b64 s[4:5], s[40:41]
 ; GLOBALNESS1-NEXT:    s_mov_b64 s[6:7], s[36:37]
 ; GLOBALNESS1-NEXT:    s_mov_b64 s[8:9], s[68:69]
@@ -246,14 +243,14 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
 ; GLOBALNESS1-NEXT:    s_mov_b32 s13, s71
 ; GLOBALNESS1-NEXT:    s_mov_b32 s14, s70
 ; GLOBALNESS1-NEXT:    v_mov_b32_e32 v31, v41
-; GLOBALNESS1-NEXT:    global_store_dwordx2 v[46:47], v[44:45], off
+; GLOBALNESS1-NEXT:    global_store_dwordx2 v[44:45], v[56:57], off
 ; GLOBALNESS1-NEXT:    s_swappc_b64 s[30:31], s[76:77]
 ; GLOBALNESS1-NEXT:    s_and_saveexec_b64 s[4:5], s[64:65]
 ; GLOBALNESS1-NEXT:    s_cbranch_execz .LBB1_14
 ; GLOBALNESS1-NEXT:  ; %bb.23: ; %bb62.i
 ; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_16 Depth=2
 ; GLOBALNESS1-NEXT:    v_mov_b32_e32 v43, v42
-; GLOBALNESS1-NEXT:    global_store_dwordx2 v[46:47], v[42:43], off
+; GLOBALNESS1-NEXT:    global_store_dwordx2 v[44:45], v[42:43], off
 ; GLOBALNESS1-NEXT:    s_branch .LBB1_14
 ; GLOBALNESS1-NEXT:  .LBB1_24: ; in Loop: Header=BB1_4 Depth=1
 ; GLOBALNESS1-NEXT:    s_mov_b64 s[6:7], -1
@@ -274,14 +271,12 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
 ; GLOBALNESS1-NEXT:  ; %bb.28: ; %bb69.i
 ; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_4 Depth=1
 ; GLOBALNESS1-NEXT:    v_mov_b32_e32 v43, v42
-; GLOBALNESS1-NEXT:    v_pk_mov_b32 v[2:3], 0, 0
-; GLOBALNESS1-NEXT:    global_store_dwordx2 v[2:3], v[42:43], off
+; GLOBALNESS1-NEXT:    global_store_dwordx2 v[44:45], v[42:43], off
 ; GLOBALNESS1-NEXT:    s_branch .LBB1_1
 ; GLOBALNESS1-NEXT:  .LBB1_29: ; %bb73.i
 ; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_4 Depth=1
 ; GLOBALNESS1-NEXT:    v_mov_b32_e32 v43, v42
-; GLOBALNESS1-NEXT:    v_pk_mov_b32 v[2:3], 0, 0
-; GLOBALNESS1-NEXT:    global_store_dwordx2 v[2:3], v[42:43], off
+; GLOBALNESS1-NEXT:    global_store_dwordx2 v[44:45], v[42:43], off
 ; GLOBALNESS1-NEXT:    s_branch .LBB1_2
 ; GLOBALNESS1-NEXT:  .LBB1_30: ; %loop.exit.guard
 ; GLOBALNESS1-NEXT:    s_andn2_b64 vcc, exec, s[4:5]
@@ -326,10 +321,9 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
 ; GLOBALNESS0-NEXT:    s_mov_b64 s[36:37], s[6:7]
 ; GLOBALNESS0-NEXT:    s_load_dwordx4 s[72:75], s[8:9], 0x0
 ; GLOBALNESS0-NEXT:    s_load_dword s6, s[8:9], 0x14
-; GLOBALNESS0-NEXT:    v_mov_b32_e32 v41, v0
 ; GLOBALNESS0-NEXT:    v_mov_b32_e32 v42, 0
-; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[0:1], 0, 0
-; GLOBALNESS0-NEXT:    global_store_dword v[0:1], v42, off
+; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[44:45], 0, 0
+; GLOBALNESS0-NEXT:    global_store_dword v[44:45], v42, off
 ; GLOBALNESS0-NEXT:    s_waitcnt lgkmcnt(0)
 ; GLOBALNESS0-NEXT:    global_load_dword v2, v42, s[72:73]
 ; GLOBALNESS0-NEXT:    s_mov_b64 s[40:41], s[4:5]
@@ -338,6 +332,7 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
 ; GLOBALNESS0-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
 ; GLOBALNESS0-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
 ; GLOBALNESS0-NEXT:    s_add_u32 s0, s0, s17
+; GLOBALNESS0-NEXT:    v_mov_b32_e32 v41, v0
 ; GLOBALNESS0-NEXT:    v_mov_b32_e32 v0, 0
 ; GLOBALNESS0-NEXT:    s_addc_u32 s1, s1, 0
 ; GLOBALNESS0-NEXT:    v_mov_b32_e32 v1, 0x40994400
@@ -365,13 +360,15 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
 ; GLOBALNESS0-NEXT:    v_cmp_ne_u32_e64 s[52:53], 1, v0
 ; GLOBALNESS0-NEXT:    v_cmp_ne_u32_e64 s[44:45], 1, v1
 ; GLOBALNESS0-NEXT:    v_cmp_ne_u32_e64 s[46:47], 1, v3
+; GLOBALNESS0-NEXT:    v_mov_b32_e32 v46, 0x80
 ; GLOBALNESS0-NEXT:    s_mov_b32 s68, s16
 ; GLOBALNESS0-NEXT:    s_mov_b64 s[38:39], s[8:9]
 ; GLOBALNESS0-NEXT:    s_mov_b32 s69, s15
 ; GLOBALNESS0-NEXT:    s_mov_b32 s70, s14
 ; GLOBALNESS0-NEXT:    s_mov_b64 s[34:35], s[10:11]
+; GLOBALNESS0-NEXT:    v_mov_b32_e32 v47, 0
 ; GLOBALNESS0-NEXT:    s_mov_b32 s32, 0
-; GLOBALNESS0-NEXT:    ; implicit-def: $vgpr44_vgpr45
+; GLOBALNESS0-NEXT:    ; implicit-def: $vgpr56_vgpr57
 ; GLOBALNESS0-NEXT:    s_waitcnt vmcnt(0)
 ; GLOBALNESS0-NEXT:    v_cmp_gt_i32_e32 vcc, 0, v2
 ; GLOBALNESS0-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
@@ -398,17 +395,15 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
 ; GLOBALNESS0-NEXT:  .LBB1_3: ; %Flow28
 ; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_4 Depth=1
 ; GLOBALNESS0-NEXT:    s_and_b64 vcc, exec, s[6:7]
-; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[44:45], v[0:1], v[0:1] op_sel:[0,1]
+; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[56:57], v[0:1], v[0:1] op_sel:[0,1]
 ; GLOBALNESS0-NEXT:    s_cbranch_vccnz .LBB1_30
 ; GLOBALNESS0-NEXT:  .LBB1_4: ; %bb5
 ; GLOBALNESS0-NEXT:    ; =>This Loop Header: Depth=1
 ; GLOBALNESS0-NEXT:    ; Child Loop BB1_16 Depth 2
-; GLOBALNESS0-NEXT:    v_mov_b32_e32 v0, 0x80
-; GLOBALNESS0-NEXT:    v_mov_b32_e32 v1, 0
-; GLOBALNESS0-NEXT:    flat_load_dword v40, v[0:1]
+; GLOBALNESS0-NEXT:    flat_load_dword v40, v[46:47]
 ; GLOBALNESS0-NEXT:    s_add_u32 s8, s38, 40
 ; GLOBALNESS0-NEXT:    buffer_store_dword v42, off, s[0:3], 0
-; GLOBALNESS0-NEXT:    flat_load_dword v46, v[0:1]
+; GLOBALNESS0-NEXT:    flat_load_dword v58, v[46:47]
 ; GLOBALNESS0-NEXT:    s_addc_u32 s9, s39, 0
 ; GLOBALNESS0-NEXT:    s_getpc_b64 s[4:5]
 ; GLOBALNESS0-NEXT:    s_add_u32 s4, s4, wobble@gotpcrel32@lo+4
@@ -452,8 +447,7 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
 ; GLOBALNESS0-NEXT:    s_cbranch_vccz .LBB1_24
 ; GLOBALNESS0-NEXT:  ; %bb.10: ; %baz.exit.i
 ; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_4 Depth=1
-; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[2:3], 0, 0
-; GLOBALNESS0-NEXT:    flat_load_dword v0, v[2:3]
+; GLOBALNESS0-NEXT:    flat_load_dword v0, v[44:45]
 ; GLOBALNESS0-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GLOBALNESS0-NEXT:    v_cmp_gt_i32_e64 s[62:63], 0, v0
 ; GLOBALNESS0-NEXT:    v_mov_b32_e32 v0, 0
@@ -462,17 +456,16 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
 ; GLOBALNESS0-NEXT:    s_cbranch_execz .LBB1_26
 ; GLOBALNESS0-NEXT:  ; %bb.11: ; %bb33.i
 ; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_4 Depth=1
-; GLOBALNESS0-NEXT:    global_load_dwordx2 v[0:1], v[2:3], off
+; GLOBALNESS0-NEXT:    global_load_dwordx2 v[0:1], v[44:45], off
 ; GLOBALNESS0-NEXT:    s_and_b64 vcc, exec, s[54:55]
 ; GLOBALNESS0-NEXT:    s_cbranch_vccnz .LBB1_13
 ; GLOBALNESS0-NEXT:  ; %bb.12: ; %bb39.i
 ; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_4 Depth=1
 ; GLOBALNESS0-NEXT:    v_mov_b32_e32 v43, v42
-; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[2:3], 0, 0
-; GLOBALNESS0-NEXT:    global_store_dwordx2 v[2:3], v[42:43], off
+; GLOBALNESS0-NEXT:    global_store_dwordx2 v[44:45], v[42:43], off
 ; GLOBALNESS0-NEXT:  .LBB1_13: ; %bb44.lr.ph.i
 ; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_4 Depth=1
-; GLOBALNESS0-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v46
+; GLOBALNESS0-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v58
 ; GLOBALNESS0-NEXT:    v_cndmask_b32_e32 v2, 0, v40, vcc
 ; GLOBALNESS0-NEXT:    s_waitcnt vmcnt(0)
 ; GLOBALNESS0-NEXT:    v_cmp_nlt_f64_e32 vcc, 0, v[0:1]
@@ -529,7 +522,6 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
 ; GLOBALNESS0-NEXT:    v_mov_b32_e32 v31, v41
 ; GLOBALNESS0-NEXT:    s_waitcnt lgkmcnt(0)
 ; GLOBALNESS0-NEXT:    s_swappc_b64 s[30:31], s[78:79]
-; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[46:47], 0, 0
 ; GLOBALNESS0-NEXT:    s_mov_b64 s[4:5], s[40:41]
 ; GLOBALNESS0-NEXT:    s_mov_b64 s[6:7], s[36:37]
 ; GLOBALNESS0-NEXT:    s_mov_b64 s[8:9], s[72:73]
@@ -538,14 +530,14 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
 ; GLOBALNESS0-NEXT:    s_mov_b32 s13, s69
 ; GLOBALNESS0-NEXT:    s_mov_b32 s14, s68
 ; GLOBALNESS0-NEXT:    v_mov_b32_e32 v31, v41
-; GLOBALNESS0-NEXT:    global_store_dwordx2 v[46:47], v[44:45], off
+; GLOBALNESS0-NEXT:    global_store_dwordx2 v[44:45], v[56:57], off
 ; GLOBALNESS0-NEXT:    s_swappc_b64 s[30:31], s[78:79]
 ; GLOBALNESS0-NEXT:    s_and_saveexec_b64 s[4:5], s[64:65]
 ; GLOBALNESS0-NEXT:    s_cbranch_execz .LBB1_14
 ; GLOBALNESS0-NEXT:  ; %bb.23: ; %bb62.i
 ; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_16 Depth=2
 ; GLOBALNESS0-NEXT:    v_mov_b32_e32 v43, v42
-; GLOBALNESS0-NEXT:    global_store_dwordx2 v[46:47], v[42:43], off
+; GLOBALNESS0-NEXT:    global_store_dwordx2 v[44:45], v[42:43], off
 ; GLOBALNESS0-NEXT:    s_branch .LBB1_14
 ; GLOBALNESS0-NEXT:  .LBB1_24: ; in Loop: Header=BB1_4 Depth=1
 ; GLOBALNESS0-NEXT:    s_mov_b64 s[6:7], -1
@@ -566,14 +558,12 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
 ; GLOBALNESS0-NEXT:  ; %bb.28: ; %bb69.i
 ; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_4 Depth=1
 ; GLOBALNESS0-NEXT:    v_mov_b32_e32 v43, v42
-; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[2:3], 0, 0
-; GLOBALNESS0-NEXT:    global_store_dwordx2 v[2:3], v[42:43], off
+; GLOBALNESS0-NEXT:    global_store_dwordx2 v[44:45], v[42:43], off
 ; GLOBALNESS0-NEXT:    s_branch .LBB1_1
 ; GLOBALNESS0-NEXT:  .LBB1_29: ; %bb73.i
 ; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_4 Depth=1
 ; GLOBALNESS0-NEXT:    v_mov_b32_e32 v43, v42
-; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[2:3], 0, 0
-; GLOBALNESS0-NEXT:    global_store_dwordx2 v[2:3], v[42:43], off
+; GLOBALNESS0-NEXT:    global_store_dwordx2 v[44:45], v[42:43], off
 ; GLOBALNESS0-NEXT:    s_branch .LBB1_2
 ; GLOBALNESS0-NEXT:  .LBB1_30: ; %loop.exit.guard
 ; GLOBALNESS0-NEXT:    s_andn2_b64 vcc, exec, s[4:5]
diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-remat.mir b/llvm/test/CodeGen/AMDGPU/vgpr-remat.mir
index 08f5550f3b08a..4b967969366f4 100644
--- a/llvm/test/CodeGen/AMDGPU/vgpr-remat.mir
+++ b/llvm/test/CodeGen/AMDGPU/vgpr-remat.mir
@@ -4,10 +4,10 @@
 # Check that we get two move-immediates into %1 and %2, instead of a copy from
 # %1 to %2, because that would introduce a dependency and maybe a stall.
 ---
-name: f
+name: remat_v_mov_b32_e32
 tracksRegLiveness: true
 body: |
-  ; CHECK-LABEL: name: f
+  ; CHECK-LABEL: name: remat_v_mov_b32_e32
   ; CHECK: bb.0:
   ; CHECK-NEXT:   successors: %bb.2(0x40000000), %bb.1(0x40000000)
   ; CHECK-NEXT:   liveins: $sgpr0
@@ -46,3 +46,47 @@ body: |
     %4.sub1:vreg_96 = COPY %2:vgpr_32
     S_ENDPGM 0, implicit %4
 ...
+
+---
+name: remat_v_mov_b64_pseduo
+tracksRegLiveness: true
+body: |
+  ; CHECK-LABEL: name: remat_v_mov_b64_pseduo
+  ; CHECK: bb.0:
+  ; CHECK-NEXT:   successors: %bb.2(0x40000000), %bb.1(0x40000000)
+  ; CHECK-NEXT:   liveins: $sgpr0
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   undef [[V_MOV_B:%[0-9]+]].sub0_sub1:vreg_192_align2 = V_MOV_B64_PSEUDO 0, implicit $exec
+  ; CHECK-NEXT:   [[V_MOV_B:%[0-9]+]].sub2_sub3:vreg_192_align2 = V_MOV_B64_PSEUDO 0, implicit $exec
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
+  ; CHECK-NEXT:   $exec = S_MOV_B64_term [[COPY]]
+  ; CHECK-NEXT:   S_CBRANCH_EXECZ %bb.2, implicit $exec
+  ; CHECK-NEXT:   S_BRANCH %bb.1
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT:   successors: %bb.2(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[V_MOV_B:%[0-9]+]].sub0_sub1:vreg_192_align2 = V_MUL_F64_e64 0, [[V_MOV_B]].sub0_sub1, 0, [[V_MOV_B]].sub0_sub1, 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MOV_B:%[0-9]+]].sub2_sub3:vreg_192_align2 = V_MUL_F64_e64 0, [[V_MOV_B]].sub2_sub3, 0, [[V_MOV_B]].sub2_sub3, 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.2:
+  ; CHECK-NEXT:   S_ENDPGM 0, implicit [[V_MOV_B]]
+  bb.0:
+    liveins: $sgpr0
+    %0:vreg_64_align2 = V_MOV_B64_PSEUDO 0, implicit $exec
+    %1:vreg_64_align2 = COPY %0:vreg_64_align2
+    %2:vreg_64_align2 = COPY %0:vreg_64_align2
+    %3:sreg_64 = COPY $sgpr0_sgpr1
+    $exec = S_MOV_B64_term %3:sreg_64
+    S_CBRANCH_EXECZ %bb.2, implicit $exec
+    S_BRANCH %bb.1
+
+  bb.1:
+    %1:vreg_64_align2 = V_MUL_F64_e64 0, %1:vreg_64_align2, 0, %1:vreg_64_align2, 0, 0, implicit $mode, implicit $exec
+    %2:vreg_64_align2 = V_MUL_F64_e64 0, %2:vreg_64_align2, 0, %2:vreg_64_align2, 0, 0, implicit $mode, implicit $exec
+
+  bb.2:
+    undef %4.sub0_sub1:vreg_192 = COPY %1:vreg_64_align2
+    %4.sub2_sub3:vreg_192 = COPY %2:vreg_64_align2
+    S_ENDPGM 0, implicit %4
+...

@arsenm arsenm marked this pull request as ready for review February 25, 2025 08:55
Copy link
Contributor

@jayfoad jayfoad left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Seems reasonable. Which part of the remat implementation cares about the VOP1 flag?

@@ -46,3 +46,47 @@ body: |
%4.sub1:vreg_96 = COPY %2:vgpr_32
S_ENDPGM 0, implicit %4
...

---
name: remat_v_mov_b64_pseduo
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Typo "pseduo"

@arsenm
Copy link
Contributor Author

arsenm commented Feb 25, 2025

Seems reasonable. Which part of the remat implementation cares about the VOP1 flag?

canRemat, which I think is just hacking around the default skipping rematerialize due to the exec physreg use

@jayfoad
Copy link
Contributor

jayfoad commented Feb 25, 2025

Seems reasonable. Which part of the remat implementation cares about the VOP1 flag?

canRemat, which I think is just hacking around the default skipping rematerialize due to the exec physreg use

Couldn't it check isVALU instead of isVOP1 || isVOP2 || isVOP3?

@arsenm
Copy link
Contributor Author

arsenm commented Feb 25, 2025

Couldn't it check isVALU instead of isVOP1 || isVOP2 || isVOP3?

That would also cover memory instructions

@jayfoad
Copy link
Contributor

jayfoad commented Feb 25, 2025

Couldn't it check isVALU instead of isVOP1 || isVOP2 || isVOP3?

That would also cover memory instructions

I only see a couple of weird BUF and FLAT instructions that also set VALU = 1. isVALU && !isVMEM still seems neater than listing all the different kinds of VOP.

@arsenm arsenm merged commit f95ad44 into main Feb 25, 2025
9 of 10 checks passed
@arsenm arsenm deleted the users/arsenm/amdgpu/mark-v-mov-b64-pseudo-vop1 branch February 25, 2025 11:22
dmcdougall pushed a commit to dmcdougall/llvm-project that referenced this pull request Jun 23, 2025
This is mostly true, and it tricks the rematerialization
code into handling this without special casing it.
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Projects
None yet
Development

Successfully merging this pull request may close these issues.

3 participants