AMDGPU: Mark v_mov_b64_pseudo as a VOP1 instruction #128677

arsenm · 2025-02-25T08:55:04Z

This is mostly true, and it tricks the rematerialization
code into handling this without special casing it.

This is mostly true, and it tricks the rematerialization code into handling this without special casing it.

arsenm · 2025-02-25T08:55:25Z

AMDGPU: Mark v_mov_b64_pseudo as a VOP1 instruction #128677 👈 (View in Graphite)
main

This stack of pull requests is managed by Graphite. Learn more about stacking.

llvmbot · 2025-02-25T08:55:37Z

@llvm/pr-subscribers-backend-amdgpu

Author: Matt Arsenault (arsenm)

Changes

This is mostly true, and it tricks the rematerialization
code into handling this without special casing it.

Full diff: https://github.com/llvm/llvm-project/pull/128677.diff

4 Files Affected:

(modified) llvm/lib/Target/AMDGPU/SIInstructions.td (+1)
(modified) llvm/test/CodeGen/AMDGPU/remat-sop.mir (+21-1)
(modified) llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll (+34-44)
(modified) llvm/test/CodeGen/AMDGPU/vgpr-remat.mir (+46-2)

diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index cca49ee80a60e..6f80dbcfe5e71 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -136,6 +136,7 @@ def V_MOV_B64_PSEUDO : VPseudoInstSI <(outs VReg_64:$vdst),
   let isMoveImm = 1;
   let SchedRW = [Write64Bit];
   let Size = 4;
+  let VOP1 = 1; // Not entirely correct, but close enough.
   let UseNamedOperandTable = 1;
 }
 
diff --git a/llvm/test/CodeGen/AMDGPU/remat-sop.mir b/llvm/test/CodeGen/AMDGPU/remat-sop.mir
index 81aa3a39de42f..1da55cf535449 100644
--- a/llvm/test/CodeGen/AMDGPU/remat-sop.mir
+++ b/llvm/test/CodeGen/AMDGPU/remat-sop.mir
@@ -653,4 +653,24 @@ body:             |
     S_ENDPGM 0
 ...
 
-
+---
+name:            test_remat_s_mov_b64_imm_pseudo
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    ; GCN-LABEL: name: test_remat_s_mov_b64_imm_pseudo
+    ; GCN: renamable $sgpr0_sgpr1 = S_MOV_B64_IMM_PSEUDO 1
+    ; GCN-NEXT: renamable $sgpr2_sgpr3 = S_MOV_B64_IMM_PSEUDO 2
+    ; GCN-NEXT: S_NOP 0, implicit killed renamable $sgpr0_sgpr1
+    ; GCN-NEXT: S_NOP 0, implicit killed renamable $sgpr2_sgpr3
+    ; GCN-NEXT: renamable $sgpr0_sgpr1 = S_MOV_B64_IMM_PSEUDO 3
+    ; GCN-NEXT: S_NOP 0, implicit killed renamable $sgpr0_sgpr1
+    ; GCN-NEXT: S_ENDPGM 0
+    %0:sgpr_64 = S_MOV_B64_IMM_PSEUDO 1
+    %1:sgpr_64 = S_MOV_B64_IMM_PSEUDO 2
+    %2:sgpr_64 = S_MOV_B64_IMM_PSEUDO 3
+    S_NOP 0, implicit %0
+    S_NOP 0, implicit %1
+    S_NOP 0, implicit %2
+    S_ENDPGM 0
+...
diff --git a/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll b/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll
index dd78c2f46dde8..a6e6341914ed0 100644
--- a/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll
+++ b/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll
@@ -34,10 +34,9 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
 ; GLOBALNESS1-NEXT:    s_mov_b64 s[36:37], s[6:7]
 ; GLOBALNESS1-NEXT:    s_load_dwordx4 s[76:79], s[8:9], 0x0
 ; GLOBALNESS1-NEXT:    s_load_dword s6, s[8:9], 0x14
-; GLOBALNESS1-NEXT:    v_mov_b32_e32 v41, v0
 ; GLOBALNESS1-NEXT:    v_mov_b32_e32 v42, 0
-; GLOBALNESS1-NEXT:    v_pk_mov_b32 v[0:1], 0, 0
-; GLOBALNESS1-NEXT:    global_store_dword v[0:1], v42, off
+; GLOBALNESS1-NEXT:    v_pk_mov_b32 v[44:45], 0, 0
+; GLOBALNESS1-NEXT:    global_store_dword v[44:45], v42, off
 ; GLOBALNESS1-NEXT:    s_waitcnt lgkmcnt(0)
 ; GLOBALNESS1-NEXT:    global_load_dword v2, v42, s[76:77]
 ; GLOBALNESS1-NEXT:    s_mov_b64 s[40:41], s[4:5]
@@ -46,6 +45,7 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
 ; GLOBALNESS1-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
 ; GLOBALNESS1-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
 ; GLOBALNESS1-NEXT:    s_add_u32 s0, s0, s17
+; GLOBALNESS1-NEXT:    v_mov_b32_e32 v41, v0
 ; GLOBALNESS1-NEXT:    v_mov_b32_e32 v0, 0
 ; GLOBALNESS1-NEXT:    s_addc_u32 s1, s1, 0
 ; GLOBALNESS1-NEXT:    v_mov_b32_e32 v1, 0x40994400
@@ -73,13 +73,15 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
 ; GLOBALNESS1-NEXT:    v_cmp_ne_u32_e64 s[52:53], 1, v0
 ; GLOBALNESS1-NEXT:    v_cmp_ne_u32_e64 s[44:45], 1, v1
 ; GLOBALNESS1-NEXT:    v_cmp_ne_u32_e64 s[46:47], 1, v3
+; GLOBALNESS1-NEXT:    v_mov_b32_e32 v46, 0x80
 ; GLOBALNESS1-NEXT:    s_mov_b32 s70, s16
 ; GLOBALNESS1-NEXT:    s_mov_b64 s[38:39], s[8:9]
 ; GLOBALNESS1-NEXT:    s_mov_b32 s71, s15
 ; GLOBALNESS1-NEXT:    s_mov_b32 s72, s14
 ; GLOBALNESS1-NEXT:    s_mov_b64 s[34:35], s[10:11]
+; GLOBALNESS1-NEXT:    v_mov_b32_e32 v47, 0
 ; GLOBALNESS1-NEXT:    s_mov_b32 s32, 0
-; GLOBALNESS1-NEXT:    ; implicit-def: $vgpr44_vgpr45
+; GLOBALNESS1-NEXT:    ; implicit-def: $vgpr56_vgpr57
 ; GLOBALNESS1-NEXT:    s_waitcnt vmcnt(0)
 ; GLOBALNESS1-NEXT:    v_cmp_gt_i32_e32 vcc, 0, v2
 ; GLOBALNESS1-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
@@ -106,17 +108,15 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
 ; GLOBALNESS1-NEXT:  .LBB1_3: ; %Flow28
 ; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_4 Depth=1
 ; GLOBALNESS1-NEXT:    s_and_b64 vcc, exec, s[6:7]
-; GLOBALNESS1-NEXT:    v_pk_mov_b32 v[44:45], v[0:1], v[0:1] op_sel:[0,1]
+; GLOBALNESS1-NEXT:    v_pk_mov_b32 v[56:57], v[0:1], v[0:1] op_sel:[0,1]
 ; GLOBALNESS1-NEXT:    s_cbranch_vccnz .LBB1_30
 ; GLOBALNESS1-NEXT:  .LBB1_4: ; %bb5
 ; GLOBALNESS1-NEXT:    ; =>This Loop Header: Depth=1
 ; GLOBALNESS1-NEXT:    ; Child Loop BB1_16 Depth 2
-; GLOBALNESS1-NEXT:    v_mov_b32_e32 v0, 0x80
-; GLOBALNESS1-NEXT:    v_mov_b32_e32 v1, 0
-; GLOBALNESS1-NEXT:    flat_load_dword v40, v[0:1]
+; GLOBALNESS1-NEXT:    flat_load_dword v40, v[46:47]
 ; GLOBALNESS1-NEXT:    s_add_u32 s8, s38, 40
 ; GLOBALNESS1-NEXT:    buffer_store_dword v42, off, s[0:3], 0
-; GLOBALNESS1-NEXT:    flat_load_dword v46, v[0:1]
+; GLOBALNESS1-NEXT:    flat_load_dword v58, v[46:47]
 ; GLOBALNESS1-NEXT:    s_addc_u32 s9, s39, 0
 ; GLOBALNESS1-NEXT:    s_getpc_b64 s[4:5]
 ; GLOBALNESS1-NEXT:    s_add_u32 s4, s4, wobble@gotpcrel32@lo+4
@@ -160,8 +160,7 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
 ; GLOBALNESS1-NEXT:    s_cbranch_vccz .LBB1_24
 ; GLOBALNESS1-NEXT:  ; %bb.10: ; %baz.exit.i
 ; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_4 Depth=1
-; GLOBALNESS1-NEXT:    v_pk_mov_b32 v[2:3], 0, 0
-; GLOBALNESS1-NEXT:    flat_load_dword v0, v[2:3]
+; GLOBALNESS1-NEXT:    flat_load_dword v0, v[44:45]
 ; GLOBALNESS1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GLOBALNESS1-NEXT:    v_cmp_gt_i32_e64 s[62:63], 0, v0
 ; GLOBALNESS1-NEXT:    v_mov_b32_e32 v0, 0
@@ -170,17 +169,16 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
 ; GLOBALNESS1-NEXT:    s_cbranch_execz .LBB1_26
 ; GLOBALNESS1-NEXT:  ; %bb.11: ; %bb33.i
 ; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_4 Depth=1
-; GLOBALNESS1-NEXT:    global_load_dwordx2 v[0:1], v[2:3], off
+; GLOBALNESS1-NEXT:    global_load_dwordx2 v[0:1], v[44:45], off
 ; GLOBALNESS1-NEXT:    s_and_b64 vcc, exec, s[54:55]
 ; GLOBALNESS1-NEXT:    s_cbranch_vccnz .LBB1_13
 ; GLOBALNESS1-NEXT:  ; %bb.12: ; %bb39.i
 ; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_4 Depth=1
 ; GLOBALNESS1-NEXT:    v_mov_b32_e32 v43, v42
-; GLOBALNESS1-NEXT:    v_pk_mov_b32 v[2:3], 0, 0
-; GLOBALNESS1-NEXT:    global_store_dwordx2 v[2:3], v[42:43], off
+; GLOBALNESS1-NEXT:    global_store_dwordx2 v[44:45], v[42:43], off
 ; GLOBALNESS1-NEXT:  .LBB1_13: ; %bb44.lr.ph.i
 ; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_4 Depth=1
-; GLOBALNESS1-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v46
+; GLOBALNESS1-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v58
 ; GLOBALNESS1-NEXT:    v_cndmask_b32_e32 v2, 0, v40, vcc
 ; GLOBALNESS1-NEXT:    s_waitcnt vmcnt(0)
 ; GLOBALNESS1-NEXT:    v_cmp_nlt_f64_e32 vcc, 0, v[0:1]
@@ -237,7 +235,6 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
 ; GLOBALNESS1-NEXT:    v_mov_b32_e32 v31, v41
 ; GLOBALNESS1-NEXT:    s_waitcnt lgkmcnt(0)
 ; GLOBALNESS1-NEXT:    s_swappc_b64 s[30:31], s[76:77]
-; GLOBALNESS1-NEXT:    v_pk_mov_b32 v[46:47], 0, 0
 ; GLOBALNESS1-NEXT:    s_mov_b64 s[4:5], s[40:41]
 ; GLOBALNESS1-NEXT:    s_mov_b64 s[6:7], s[36:37]
 ; GLOBALNESS1-NEXT:    s_mov_b64 s[8:9], s[68:69]
@@ -246,14 +243,14 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
 ; GLOBALNESS1-NEXT:    s_mov_b32 s13, s71
 ; GLOBALNESS1-NEXT:    s_mov_b32 s14, s70
 ; GLOBALNESS1-NEXT:    v_mov_b32_e32 v31, v41
-; GLOBALNESS1-NEXT:    global_store_dwordx2 v[46:47], v[44:45], off
+; GLOBALNESS1-NEXT:    global_store_dwordx2 v[44:45], v[56:57], off
 ; GLOBALNESS1-NEXT:    s_swappc_b64 s[30:31], s[76:77]
 ; GLOBALNESS1-NEXT:    s_and_saveexec_b64 s[4:5], s[64:65]
 ; GLOBALNESS1-NEXT:    s_cbranch_execz .LBB1_14
 ; GLOBALNESS1-NEXT:  ; %bb.23: ; %bb62.i
 ; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_16 Depth=2
 ; GLOBALNESS1-NEXT:    v_mov_b32_e32 v43, v42
-; GLOBALNESS1-NEXT:    global_store_dwordx2 v[46:47], v[42:43], off
+; GLOBALNESS1-NEXT:    global_store_dwordx2 v[44:45], v[42:43], off
 ; GLOBALNESS1-NEXT:    s_branch .LBB1_14
 ; GLOBALNESS1-NEXT:  .LBB1_24: ; in Loop: Header=BB1_4 Depth=1
 ; GLOBALNESS1-NEXT:    s_mov_b64 s[6:7], -1
@@ -274,14 +271,12 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
 ; GLOBALNESS1-NEXT:  ; %bb.28: ; %bb69.i
 ; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_4 Depth=1
 ; GLOBALNESS1-NEXT:    v_mov_b32_e32 v43, v42
-; GLOBALNESS1-NEXT:    v_pk_mov_b32 v[2:3], 0, 0
-; GLOBALNESS1-NEXT:    global_store_dwordx2 v[2:3], v[42:43], off
+; GLOBALNESS1-NEXT:    global_store_dwordx2 v[44:45], v[42:43], off
 ; GLOBALNESS1-NEXT:    s_branch .LBB1_1
 ; GLOBALNESS1-NEXT:  .LBB1_29: ; %bb73.i
 ; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_4 Depth=1
 ; GLOBALNESS1-NEXT:    v_mov_b32_e32 v43, v42
-; GLOBALNESS1-NEXT:    v_pk_mov_b32 v[2:3], 0, 0
-; GLOBALNESS1-NEXT:    global_store_dwordx2 v[2:3], v[42:43], off
+; GLOBALNESS1-NEXT:    global_store_dwordx2 v[44:45], v[42:43], off
 ; GLOBALNESS1-NEXT:    s_branch .LBB1_2
 ; GLOBALNESS1-NEXT:  .LBB1_30: ; %loop.exit.guard
 ; GLOBALNESS1-NEXT:    s_andn2_b64 vcc, exec, s[4:5]
@@ -326,10 +321,9 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
 ; GLOBALNESS0-NEXT:    s_mov_b64 s[36:37], s[6:7]
 ; GLOBALNESS0-NEXT:    s_load_dwordx4 s[72:75], s[8:9], 0x0
 ; GLOBALNESS0-NEXT:    s_load_dword s6, s[8:9], 0x14
-; GLOBALNESS0-NEXT:    v_mov_b32_e32 v41, v0
 ; GLOBALNESS0-NEXT:    v_mov_b32_e32 v42, 0
-; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[0:1], 0, 0
-; GLOBALNESS0-NEXT:    global_store_dword v[0:1], v42, off
+; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[44:45], 0, 0
+; GLOBALNESS0-NEXT:    global_store_dword v[44:45], v42, off
 ; GLOBALNESS0-NEXT:    s_waitcnt lgkmcnt(0)
 ; GLOBALNESS0-NEXT:    global_load_dword v2, v42, s[72:73]
 ; GLOBALNESS0-NEXT:    s_mov_b64 s[40:41], s[4:5]
@@ -338,6 +332,7 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
 ; GLOBALNESS0-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
 ; GLOBALNESS0-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
 ; GLOBALNESS0-NEXT:    s_add_u32 s0, s0, s17
+; GLOBALNESS0-NEXT:    v_mov_b32_e32 v41, v0
 ; GLOBALNESS0-NEXT:    v_mov_b32_e32 v0, 0
 ; GLOBALNESS0-NEXT:    s_addc_u32 s1, s1, 0
 ; GLOBALNESS0-NEXT:    v_mov_b32_e32 v1, 0x40994400
@@ -365,13 +360,15 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
 ; GLOBALNESS0-NEXT:    v_cmp_ne_u32_e64 s[52:53], 1, v0
 ; GLOBALNESS0-NEXT:    v_cmp_ne_u32_e64 s[44:45], 1, v1
 ; GLOBALNESS0-NEXT:    v_cmp_ne_u32_e64 s[46:47], 1, v3
+; GLOBALNESS0-NEXT:    v_mov_b32_e32 v46, 0x80
 ; GLOBALNESS0-NEXT:    s_mov_b32 s68, s16
 ; GLOBALNESS0-NEXT:    s_mov_b64 s[38:39], s[8:9]
 ; GLOBALNESS0-NEXT:    s_mov_b32 s69, s15
 ; GLOBALNESS0-NEXT:    s_mov_b32 s70, s14
 ; GLOBALNESS0-NEXT:    s_mov_b64 s[34:35], s[10:11]
+; GLOBALNESS0-NEXT:    v_mov_b32_e32 v47, 0
 ; GLOBALNESS0-NEXT:    s_mov_b32 s32, 0
-; GLOBALNESS0-NEXT:    ; implicit-def: $vgpr44_vgpr45
+; GLOBALNESS0-NEXT:    ; implicit-def: $vgpr56_vgpr57
 ; GLOBALNESS0-NEXT:    s_waitcnt vmcnt(0)
 ; GLOBALNESS0-NEXT:    v_cmp_gt_i32_e32 vcc, 0, v2
 ; GLOBALNESS0-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
@@ -398,17 +395,15 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
 ; GLOBALNESS0-NEXT:  .LBB1_3: ; %Flow28
 ; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_4 Depth=1
 ; GLOBALNESS0-NEXT:    s_and_b64 vcc, exec, s[6:7]
-; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[44:45], v[0:1], v[0:1] op_sel:[0,1]
+; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[56:57], v[0:1], v[0:1] op_sel:[0,1]
 ; GLOBALNESS0-NEXT:    s_cbranch_vccnz .LBB1_30
 ; GLOBALNESS0-NEXT:  .LBB1_4: ; %bb5
 ; GLOBALNESS0-NEXT:    ; =>This Loop Header: Depth=1
 ; GLOBALNESS0-NEXT:    ; Child Loop BB1_16 Depth 2
-; GLOBALNESS0-NEXT:    v_mov_b32_e32 v0, 0x80
-; GLOBALNESS0-NEXT:    v_mov_b32_e32 v1, 0
-; GLOBALNESS0-NEXT:    flat_load_dword v40, v[0:1]
+; GLOBALNESS0-NEXT:    flat_load_dword v40, v[46:47]
 ; GLOBALNESS0-NEXT:    s_add_u32 s8, s38, 40
 ; GLOBALNESS0-NEXT:    buffer_store_dword v42, off, s[0:3], 0
-; GLOBALNESS0-NEXT:    flat_load_dword v46, v[0:1]
+; GLOBALNESS0-NEXT:    flat_load_dword v58, v[46:47]
 ; GLOBALNESS0-NEXT:    s_addc_u32 s9, s39, 0
 ; GLOBALNESS0-NEXT:    s_getpc_b64 s[4:5]
 ; GLOBALNESS0-NEXT:    s_add_u32 s4, s4, wobble@gotpcrel32@lo+4
@@ -452,8 +447,7 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
 ; GLOBALNESS0-NEXT:    s_cbranch_vccz .LBB1_24
 ; GLOBALNESS0-NEXT:  ; %bb.10: ; %baz.exit.i
 ; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_4 Depth=1
-; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[2:3], 0, 0
-; GLOBALNESS0-NEXT:    flat_load_dword v0, v[2:3]
+; GLOBALNESS0-NEXT:    flat_load_dword v0, v[44:45]
 ; GLOBALNESS0-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GLOBALNESS0-NEXT:    v_cmp_gt_i32_e64 s[62:63], 0, v0
 ; GLOBALNESS0-NEXT:    v_mov_b32_e32 v0, 0
@@ -462,17 +456,16 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
 ; GLOBALNESS0-NEXT:    s_cbranch_execz .LBB1_26
 ; GLOBALNESS0-NEXT:  ; %bb.11: ; %bb33.i
 ; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_4 Depth=1
-; GLOBALNESS0-NEXT:    global_load_dwordx2 v[0:1], v[2:3], off
+; GLOBALNESS0-NEXT:    global_load_dwordx2 v[0:1], v[44:45], off
 ; GLOBALNESS0-NEXT:    s_and_b64 vcc, exec, s[54:55]
 ; GLOBALNESS0-NEXT:    s_cbranch_vccnz .LBB1_13
 ; GLOBALNESS0-NEXT:  ; %bb.12: ; %bb39.i
 ; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_4 Depth=1
 ; GLOBALNESS0-NEXT:    v_mov_b32_e32 v43, v42
-; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[2:3], 0, 0
-; GLOBALNESS0-NEXT:    global_store_dwordx2 v[2:3], v[42:43], off
+; GLOBALNESS0-NEXT:    global_store_dwordx2 v[44:45], v[42:43], off
 ; GLOBALNESS0-NEXT:  .LBB1_13: ; %bb44.lr.ph.i
 ; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_4 Depth=1
-; GLOBALNESS0-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v46
+; GLOBALNESS0-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v58
 ; GLOBALNESS0-NEXT:    v_cndmask_b32_e32 v2, 0, v40, vcc
 ; GLOBALNESS0-NEXT:    s_waitcnt vmcnt(0)
 ; GLOBALNESS0-NEXT:    v_cmp_nlt_f64_e32 vcc, 0, v[0:1]
@@ -529,7 +522,6 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
 ; GLOBALNESS0-NEXT:    v_mov_b32_e32 v31, v41
 ; GLOBALNESS0-NEXT:    s_waitcnt lgkmcnt(0)
 ; GLOBALNESS0-NEXT:    s_swappc_b64 s[30:31], s[78:79]
-; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[46:47], 0, 0
 ; GLOBALNESS0-NEXT:    s_mov_b64 s[4:5], s[40:41]
 ; GLOBALNESS0-NEXT:    s_mov_b64 s[6:7], s[36:37]
 ; GLOBALNESS0-NEXT:    s_mov_b64 s[8:9], s[72:73]
@@ -538,14 +530,14 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
 ; GLOBALNESS0-NEXT:    s_mov_b32 s13, s69
 ; GLOBALNESS0-NEXT:    s_mov_b32 s14, s68
 ; GLOBALNESS0-NEXT:    v_mov_b32_e32 v31, v41
-; GLOBALNESS0-NEXT:    global_store_dwordx2 v[46:47], v[44:45], off
+; GLOBALNESS0-NEXT:    global_store_dwordx2 v[44:45], v[56:57], off
 ; GLOBALNESS0-NEXT:    s_swappc_b64 s[30:31], s[78:79]
 ; GLOBALNESS0-NEXT:    s_and_saveexec_b64 s[4:5], s[64:65]
 ; GLOBALNESS0-NEXT:    s_cbranch_execz .LBB1_14
 ; GLOBALNESS0-NEXT:  ; %bb.23: ; %bb62.i
 ; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_16 Depth=2
 ; GLOBALNESS0-NEXT:    v_mov_b32_e32 v43, v42
-; GLOBALNESS0-NEXT:    global_store_dwordx2 v[46:47], v[42:43], off
+; GLOBALNESS0-NEXT:    global_store_dwordx2 v[44:45], v[42:43], off
 ; GLOBALNESS0-NEXT:    s_branch .LBB1_14
 ; GLOBALNESS0-NEXT:  .LBB1_24: ; in Loop: Header=BB1_4 Depth=1
 ; GLOBALNESS0-NEXT:    s_mov_b64 s[6:7], -1
@@ -566,14 +558,12 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
 ; GLOBALNESS0-NEXT:  ; %bb.28: ; %bb69.i
 ; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_4 Depth=1
 ; GLOBALNESS0-NEXT:    v_mov_b32_e32 v43, v42
-; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[2:3], 0, 0
-; GLOBALNESS0-NEXT:    global_store_dwordx2 v[2:3], v[42:43], off
+; GLOBALNESS0-NEXT:    global_store_dwordx2 v[44:45], v[42:43], off
 ; GLOBALNESS0-NEXT:    s_branch .LBB1_1
 ; GLOBALNESS0-NEXT:  .LBB1_29: ; %bb73.i
 ; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_4 Depth=1
 ; GLOBALNESS0-NEXT:    v_mov_b32_e32 v43, v42
-; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[2:3], 0, 0
-; GLOBALNESS0-NEXT:    global_store_dwordx2 v[2:3], v[42:43], off
+; GLOBALNESS0-NEXT:    global_store_dwordx2 v[44:45], v[42:43], off
 ; GLOBALNESS0-NEXT:    s_branch .LBB1_2
 ; GLOBALNESS0-NEXT:  .LBB1_30: ; %loop.exit.guard
 ; GLOBALNESS0-NEXT:    s_andn2_b64 vcc, exec, s[4:5]
diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-remat.mir b/llvm/test/CodeGen/AMDGPU/vgpr-remat.mir
index 08f5550f3b08a..4b967969366f4 100644
--- a/llvm/test/CodeGen/AMDGPU/vgpr-remat.mir
+++ b/llvm/test/CodeGen/AMDGPU/vgpr-remat.mir
@@ -4,10 +4,10 @@
 # Check that we get two move-immediates into %1 and %2, instead of a copy from
 # %1 to %2, because that would introduce a dependency and maybe a stall.
 ---
-name: f
+name: remat_v_mov_b32_e32
 tracksRegLiveness: true
 body: |
-  ; CHECK-LABEL: name: f
+  ; CHECK-LABEL: name: remat_v_mov_b32_e32
   ; CHECK: bb.0:
   ; CHECK-NEXT:   successors: %bb.2(0x40000000), %bb.1(0x40000000)
   ; CHECK-NEXT:   liveins: $sgpr0
@@ -46,3 +46,47 @@ body: |
     %4.sub1:vreg_96 = COPY %2:vgpr_32
     S_ENDPGM 0, implicit %4
 ...
+
+---
+name: remat_v_mov_b64_pseduo
+tracksRegLiveness: true
+body: |
+  ; CHECK-LABEL: name: remat_v_mov_b64_pseduo
+  ; CHECK: bb.0:
+  ; CHECK-NEXT:   successors: %bb.2(0x40000000), %bb.1(0x40000000)
+  ; CHECK-NEXT:   liveins: $sgpr0
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   undef [[V_MOV_B:%[0-9]+]].sub0_sub1:vreg_192_align2 = V_MOV_B64_PSEUDO 0, implicit $exec
+  ; CHECK-NEXT:   [[V_MOV_B:%[0-9]+]].sub2_sub3:vreg_192_align2 = V_MOV_B64_PSEUDO 0, implicit $exec
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
+  ; CHECK-NEXT:   $exec = S_MOV_B64_term [[COPY]]
+  ; CHECK-NEXT:   S_CBRANCH_EXECZ %bb.2, implicit $exec
+  ; CHECK-NEXT:   S_BRANCH %bb.1
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT:   successors: %bb.2(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[V_MOV_B:%[0-9]+]].sub0_sub1:vreg_192_align2 = V_MUL_F64_e64 0, [[V_MOV_B]].sub0_sub1, 0, [[V_MOV_B]].sub0_sub1, 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MOV_B:%[0-9]+]].sub2_sub3:vreg_192_align2 = V_MUL_F64_e64 0, [[V_MOV_B]].sub2_sub3, 0, [[V_MOV_B]].sub2_sub3, 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.2:
+  ; CHECK-NEXT:   S_ENDPGM 0, implicit [[V_MOV_B]]
+  bb.0:
+    liveins: $sgpr0
+    %0:vreg_64_align2 = V_MOV_B64_PSEUDO 0, implicit $exec
+    %1:vreg_64_align2 = COPY %0:vreg_64_align2
+    %2:vreg_64_align2 = COPY %0:vreg_64_align2
+    %3:sreg_64 = COPY $sgpr0_sgpr1
+    $exec = S_MOV_B64_term %3:sreg_64
+    S_CBRANCH_EXECZ %bb.2, implicit $exec
+    S_BRANCH %bb.1
+
+  bb.1:
+    %1:vreg_64_align2 = V_MUL_F64_e64 0, %1:vreg_64_align2, 0, %1:vreg_64_align2, 0, 0, implicit $mode, implicit $exec
+    %2:vreg_64_align2 = V_MUL_F64_e64 0, %2:vreg_64_align2, 0, %2:vreg_64_align2, 0, 0, implicit $mode, implicit $exec
+
+  bb.2:
+    undef %4.sub0_sub1:vreg_192 = COPY %1:vreg_64_align2
+    %4.sub2_sub3:vreg_192 = COPY %2:vreg_64_align2
+    S_ENDPGM 0, implicit %4
+...

jayfoad

Seems reasonable. Which part of the remat implementation cares about the VOP1 flag?

jayfoad · 2025-02-25T10:21:18Z

llvm/test/CodeGen/AMDGPU/vgpr-remat.mir

@@ -46,3 +46,47 @@ body: |
    %4.sub1:vreg_96 = COPY %2:vgpr_32
    S_ENDPGM 0, implicit %4
 ...
+
+---
+name: remat_v_mov_b64_pseduo


Typo "pseduo"

arsenm · 2025-02-25T10:23:00Z

Seems reasonable. Which part of the remat implementation cares about the VOP1 flag?

canRemat, which I think is just hacking around the default skipping rematerialize due to the exec physreg use

llvm/test/CodeGen/AMDGPU/vgpr-remat.mir

jayfoad · 2025-02-25T10:25:55Z

Seems reasonable. Which part of the remat implementation cares about the VOP1 flag?

canRemat, which I think is just hacking around the default skipping rematerialize due to the exec physreg use

Couldn't it check isVALU instead of isVOP1 || isVOP2 || isVOP3?

arsenm · 2025-02-25T10:53:14Z

Couldn't it check isVALU instead of isVOP1 || isVOP2 || isVOP3?

That would also cover memory instructions

jayfoad · 2025-02-25T11:13:25Z

Couldn't it check isVALU instead of isVOP1 || isVOP2 || isVOP3?

That would also cover memory instructions

I only see a couple of weird BUF and FLAT instructions that also set VALU = 1. isVALU && !isVMEM still seems neater than listing all the different kinds of VOP.

This is mostly true, and it tricks the rematerialization code into handling this without special casing it.

AMDGPU: Mark v_mov_b64_pseudo as a VOP1 instruction

01f93ba

This is mostly true, and it tricks the rematerialization code into handling this without special casing it.

arsenm added the backend:AMDGPU label Feb 25, 2025 — with Graphite App

arsenm requested review from cdevadas, jayfoad, Pierre-vh, rampitec, shiltian and Sisyph February 25, 2025 08:55

arsenm marked this pull request as ready for review February 25, 2025 08:55

jayfoad approved these changes Feb 25, 2025

View reviewed changes

arsenm commented Feb 25, 2025

View reviewed changes

llvm/test/CodeGen/AMDGPU/vgpr-remat.mir Outdated Show resolved Hide resolved

Update llvm/test/CodeGen/AMDGPU/vgpr-remat.mir

cfb36e5

arsenm merged commit f95ad44 into main Feb 25, 2025
9 of 10 checks passed

arsenm deleted the users/arsenm/amdgpu/mark-v-mov-b64-pseudo-vop1 branch February 25, 2025 11:22

dmcdougall pushed a commit to dmcdougall/llvm-project that referenced this pull request Jun 23, 2025

AMDGPU: Mark v_mov_b64_pseudo as a VOP1 instruction (llvm#128677)

aeb0225

This is mostly true, and it tricks the rematerialization code into handling this without special casing it.

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

AMDGPU: Mark v_mov_b64_pseudo as a VOP1 instruction #128677

AMDGPU: Mark v_mov_b64_pseudo as a VOP1 instruction #128677

arsenm commented Feb 25, 2025

Uh oh!

arsenm commented Feb 25, 2025

Uh oh!

llvmbot commented Feb 25, 2025

Uh oh!

jayfoad left a comment

Uh oh!

jayfoad Feb 25, 2025

Uh oh!

arsenm commented Feb 25, 2025

Uh oh!

Uh oh!

jayfoad commented Feb 25, 2025

Uh oh!

arsenm commented Feb 25, 2025

Uh oh!

jayfoad commented Feb 25, 2025

Uh oh!

Uh oh!

Uh oh!

AMDGPU: Mark v_mov_b64_pseudo as a VOP1 instruction #128677

AMDGPU: Mark v_mov_b64_pseudo as a VOP1 instruction #128677

Conversation

arsenm commented Feb 25, 2025

Uh oh!

arsenm commented Feb 25, 2025

Uh oh!

llvmbot commented Feb 25, 2025

Uh oh!

jayfoad left a comment

Choose a reason for hiding this comment

Uh oh!

jayfoad Feb 25, 2025

Choose a reason for hiding this comment

Uh oh!

arsenm commented Feb 25, 2025

Uh oh!

Uh oh!

jayfoad commented Feb 25, 2025

Uh oh!

arsenm commented Feb 25, 2025

Uh oh!

jayfoad commented Feb 25, 2025

Uh oh!

Uh oh!

Uh oh!