-
Notifications
You must be signed in to change notification settings - Fork 14.3k
AMDGPU: Mark v_mov_b64_pseudo as a VOP1 instruction #128677
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
This is mostly true, and it tricks the rematerialization code into handling this without special casing it.
@llvm/pr-subscribers-backend-amdgpu Author: Matt Arsenault (arsenm) Changes: This is mostly true, and it tricks the rematerialization code into handling this without special casing it. Full diff: https://github.com/llvm/llvm-project/pull/128677.diff 4 Files Affected:
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index cca49ee80a60e..6f80dbcfe5e71 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -136,6 +136,7 @@ def V_MOV_B64_PSEUDO : VPseudoInstSI <(outs VReg_64:$vdst),
let isMoveImm = 1;
let SchedRW = [Write64Bit];
let Size = 4;
+ let VOP1 = 1; // Not entirely correct, but close enough.
let UseNamedOperandTable = 1;
}
diff --git a/llvm/test/CodeGen/AMDGPU/remat-sop.mir b/llvm/test/CodeGen/AMDGPU/remat-sop.mir
index 81aa3a39de42f..1da55cf535449 100644
--- a/llvm/test/CodeGen/AMDGPU/remat-sop.mir
+++ b/llvm/test/CodeGen/AMDGPU/remat-sop.mir
@@ -653,4 +653,24 @@ body: |
S_ENDPGM 0
...
-
+---
+name: test_remat_s_mov_b64_imm_pseudo
+tracksRegLiveness: true
+body: |
+ bb.0:
+ ; GCN-LABEL: name: test_remat_s_mov_b64_imm_pseudo
+ ; GCN: renamable $sgpr0_sgpr1 = S_MOV_B64_IMM_PSEUDO 1
+ ; GCN-NEXT: renamable $sgpr2_sgpr3 = S_MOV_B64_IMM_PSEUDO 2
+ ; GCN-NEXT: S_NOP 0, implicit killed renamable $sgpr0_sgpr1
+ ; GCN-NEXT: S_NOP 0, implicit killed renamable $sgpr2_sgpr3
+ ; GCN-NEXT: renamable $sgpr0_sgpr1 = S_MOV_B64_IMM_PSEUDO 3
+ ; GCN-NEXT: S_NOP 0, implicit killed renamable $sgpr0_sgpr1
+ ; GCN-NEXT: S_ENDPGM 0
+ %0:sgpr_64 = S_MOV_B64_IMM_PSEUDO 1
+ %1:sgpr_64 = S_MOV_B64_IMM_PSEUDO 2
+ %2:sgpr_64 = S_MOV_B64_IMM_PSEUDO 3
+ S_NOP 0, implicit %0
+ S_NOP 0, implicit %1
+ S_NOP 0, implicit %2
+ S_ENDPGM 0
+...
diff --git a/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll b/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll
index dd78c2f46dde8..a6e6341914ed0 100644
--- a/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll
+++ b/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll
@@ -34,10 +34,9 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
; GLOBALNESS1-NEXT: s_mov_b64 s[36:37], s[6:7]
; GLOBALNESS1-NEXT: s_load_dwordx4 s[76:79], s[8:9], 0x0
; GLOBALNESS1-NEXT: s_load_dword s6, s[8:9], 0x14
-; GLOBALNESS1-NEXT: v_mov_b32_e32 v41, v0
; GLOBALNESS1-NEXT: v_mov_b32_e32 v42, 0
-; GLOBALNESS1-NEXT: v_pk_mov_b32 v[0:1], 0, 0
-; GLOBALNESS1-NEXT: global_store_dword v[0:1], v42, off
+; GLOBALNESS1-NEXT: v_pk_mov_b32 v[44:45], 0, 0
+; GLOBALNESS1-NEXT: global_store_dword v[44:45], v42, off
; GLOBALNESS1-NEXT: s_waitcnt lgkmcnt(0)
; GLOBALNESS1-NEXT: global_load_dword v2, v42, s[76:77]
; GLOBALNESS1-NEXT: s_mov_b64 s[40:41], s[4:5]
@@ -46,6 +45,7 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
; GLOBALNESS1-NEXT: s_add_u32 flat_scratch_lo, s12, s17
; GLOBALNESS1-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GLOBALNESS1-NEXT: s_add_u32 s0, s0, s17
+; GLOBALNESS1-NEXT: v_mov_b32_e32 v41, v0
; GLOBALNESS1-NEXT: v_mov_b32_e32 v0, 0
; GLOBALNESS1-NEXT: s_addc_u32 s1, s1, 0
; GLOBALNESS1-NEXT: v_mov_b32_e32 v1, 0x40994400
@@ -73,13 +73,15 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[52:53], 1, v0
; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[44:45], 1, v1
; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[46:47], 1, v3
+; GLOBALNESS1-NEXT: v_mov_b32_e32 v46, 0x80
; GLOBALNESS1-NEXT: s_mov_b32 s70, s16
; GLOBALNESS1-NEXT: s_mov_b64 s[38:39], s[8:9]
; GLOBALNESS1-NEXT: s_mov_b32 s71, s15
; GLOBALNESS1-NEXT: s_mov_b32 s72, s14
; GLOBALNESS1-NEXT: s_mov_b64 s[34:35], s[10:11]
+; GLOBALNESS1-NEXT: v_mov_b32_e32 v47, 0
; GLOBALNESS1-NEXT: s_mov_b32 s32, 0
-; GLOBALNESS1-NEXT: ; implicit-def: $vgpr44_vgpr45
+; GLOBALNESS1-NEXT: ; implicit-def: $vgpr56_vgpr57
; GLOBALNESS1-NEXT: s_waitcnt vmcnt(0)
; GLOBALNESS1-NEXT: v_cmp_gt_i32_e32 vcc, 0, v2
; GLOBALNESS1-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
@@ -106,17 +108,15 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
; GLOBALNESS1-NEXT: .LBB1_3: ; %Flow28
; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1
; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[6:7]
-; GLOBALNESS1-NEXT: v_pk_mov_b32 v[44:45], v[0:1], v[0:1] op_sel:[0,1]
+; GLOBALNESS1-NEXT: v_pk_mov_b32 v[56:57], v[0:1], v[0:1] op_sel:[0,1]
; GLOBALNESS1-NEXT: s_cbranch_vccnz .LBB1_30
; GLOBALNESS1-NEXT: .LBB1_4: ; %bb5
; GLOBALNESS1-NEXT: ; =>This Loop Header: Depth=1
; GLOBALNESS1-NEXT: ; Child Loop BB1_16 Depth 2
-; GLOBALNESS1-NEXT: v_mov_b32_e32 v0, 0x80
-; GLOBALNESS1-NEXT: v_mov_b32_e32 v1, 0
-; GLOBALNESS1-NEXT: flat_load_dword v40, v[0:1]
+; GLOBALNESS1-NEXT: flat_load_dword v40, v[46:47]
; GLOBALNESS1-NEXT: s_add_u32 s8, s38, 40
; GLOBALNESS1-NEXT: buffer_store_dword v42, off, s[0:3], 0
-; GLOBALNESS1-NEXT: flat_load_dword v46, v[0:1]
+; GLOBALNESS1-NEXT: flat_load_dword v58, v[46:47]
; GLOBALNESS1-NEXT: s_addc_u32 s9, s39, 0
; GLOBALNESS1-NEXT: s_getpc_b64 s[4:5]
; GLOBALNESS1-NEXT: s_add_u32 s4, s4, wobble@gotpcrel32@lo+4
@@ -160,8 +160,7 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
; GLOBALNESS1-NEXT: s_cbranch_vccz .LBB1_24
; GLOBALNESS1-NEXT: ; %bb.10: ; %baz.exit.i
; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1
-; GLOBALNESS1-NEXT: v_pk_mov_b32 v[2:3], 0, 0
-; GLOBALNESS1-NEXT: flat_load_dword v0, v[2:3]
+; GLOBALNESS1-NEXT: flat_load_dword v0, v[44:45]
; GLOBALNESS1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GLOBALNESS1-NEXT: v_cmp_gt_i32_e64 s[62:63], 0, v0
; GLOBALNESS1-NEXT: v_mov_b32_e32 v0, 0
@@ -170,17 +169,16 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
; GLOBALNESS1-NEXT: s_cbranch_execz .LBB1_26
; GLOBALNESS1-NEXT: ; %bb.11: ; %bb33.i
; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1
-; GLOBALNESS1-NEXT: global_load_dwordx2 v[0:1], v[2:3], off
+; GLOBALNESS1-NEXT: global_load_dwordx2 v[0:1], v[44:45], off
; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[54:55]
; GLOBALNESS1-NEXT: s_cbranch_vccnz .LBB1_13
; GLOBALNESS1-NEXT: ; %bb.12: ; %bb39.i
; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1
; GLOBALNESS1-NEXT: v_mov_b32_e32 v43, v42
-; GLOBALNESS1-NEXT: v_pk_mov_b32 v[2:3], 0, 0
-; GLOBALNESS1-NEXT: global_store_dwordx2 v[2:3], v[42:43], off
+; GLOBALNESS1-NEXT: global_store_dwordx2 v[44:45], v[42:43], off
; GLOBALNESS1-NEXT: .LBB1_13: ; %bb44.lr.ph.i
; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1
-; GLOBALNESS1-NEXT: v_cmp_ne_u32_e32 vcc, 0, v46
+; GLOBALNESS1-NEXT: v_cmp_ne_u32_e32 vcc, 0, v58
; GLOBALNESS1-NEXT: v_cndmask_b32_e32 v2, 0, v40, vcc
; GLOBALNESS1-NEXT: s_waitcnt vmcnt(0)
; GLOBALNESS1-NEXT: v_cmp_nlt_f64_e32 vcc, 0, v[0:1]
@@ -237,7 +235,6 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
; GLOBALNESS1-NEXT: v_mov_b32_e32 v31, v41
; GLOBALNESS1-NEXT: s_waitcnt lgkmcnt(0)
; GLOBALNESS1-NEXT: s_swappc_b64 s[30:31], s[76:77]
-; GLOBALNESS1-NEXT: v_pk_mov_b32 v[46:47], 0, 0
; GLOBALNESS1-NEXT: s_mov_b64 s[4:5], s[40:41]
; GLOBALNESS1-NEXT: s_mov_b64 s[6:7], s[36:37]
; GLOBALNESS1-NEXT: s_mov_b64 s[8:9], s[68:69]
@@ -246,14 +243,14 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
; GLOBALNESS1-NEXT: s_mov_b32 s13, s71
; GLOBALNESS1-NEXT: s_mov_b32 s14, s70
; GLOBALNESS1-NEXT: v_mov_b32_e32 v31, v41
-; GLOBALNESS1-NEXT: global_store_dwordx2 v[46:47], v[44:45], off
+; GLOBALNESS1-NEXT: global_store_dwordx2 v[44:45], v[56:57], off
; GLOBALNESS1-NEXT: s_swappc_b64 s[30:31], s[76:77]
; GLOBALNESS1-NEXT: s_and_saveexec_b64 s[4:5], s[64:65]
; GLOBALNESS1-NEXT: s_cbranch_execz .LBB1_14
; GLOBALNESS1-NEXT: ; %bb.23: ; %bb62.i
; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_16 Depth=2
; GLOBALNESS1-NEXT: v_mov_b32_e32 v43, v42
-; GLOBALNESS1-NEXT: global_store_dwordx2 v[46:47], v[42:43], off
+; GLOBALNESS1-NEXT: global_store_dwordx2 v[44:45], v[42:43], off
; GLOBALNESS1-NEXT: s_branch .LBB1_14
; GLOBALNESS1-NEXT: .LBB1_24: ; in Loop: Header=BB1_4 Depth=1
; GLOBALNESS1-NEXT: s_mov_b64 s[6:7], -1
@@ -274,14 +271,12 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
; GLOBALNESS1-NEXT: ; %bb.28: ; %bb69.i
; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1
; GLOBALNESS1-NEXT: v_mov_b32_e32 v43, v42
-; GLOBALNESS1-NEXT: v_pk_mov_b32 v[2:3], 0, 0
-; GLOBALNESS1-NEXT: global_store_dwordx2 v[2:3], v[42:43], off
+; GLOBALNESS1-NEXT: global_store_dwordx2 v[44:45], v[42:43], off
; GLOBALNESS1-NEXT: s_branch .LBB1_1
; GLOBALNESS1-NEXT: .LBB1_29: ; %bb73.i
; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1
; GLOBALNESS1-NEXT: v_mov_b32_e32 v43, v42
-; GLOBALNESS1-NEXT: v_pk_mov_b32 v[2:3], 0, 0
-; GLOBALNESS1-NEXT: global_store_dwordx2 v[2:3], v[42:43], off
+; GLOBALNESS1-NEXT: global_store_dwordx2 v[44:45], v[42:43], off
; GLOBALNESS1-NEXT: s_branch .LBB1_2
; GLOBALNESS1-NEXT: .LBB1_30: ; %loop.exit.guard
; GLOBALNESS1-NEXT: s_andn2_b64 vcc, exec, s[4:5]
@@ -326,10 +321,9 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
; GLOBALNESS0-NEXT: s_mov_b64 s[36:37], s[6:7]
; GLOBALNESS0-NEXT: s_load_dwordx4 s[72:75], s[8:9], 0x0
; GLOBALNESS0-NEXT: s_load_dword s6, s[8:9], 0x14
-; GLOBALNESS0-NEXT: v_mov_b32_e32 v41, v0
; GLOBALNESS0-NEXT: v_mov_b32_e32 v42, 0
-; GLOBALNESS0-NEXT: v_pk_mov_b32 v[0:1], 0, 0
-; GLOBALNESS0-NEXT: global_store_dword v[0:1], v42, off
+; GLOBALNESS0-NEXT: v_pk_mov_b32 v[44:45], 0, 0
+; GLOBALNESS0-NEXT: global_store_dword v[44:45], v42, off
; GLOBALNESS0-NEXT: s_waitcnt lgkmcnt(0)
; GLOBALNESS0-NEXT: global_load_dword v2, v42, s[72:73]
; GLOBALNESS0-NEXT: s_mov_b64 s[40:41], s[4:5]
@@ -338,6 +332,7 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
; GLOBALNESS0-NEXT: s_add_u32 flat_scratch_lo, s12, s17
; GLOBALNESS0-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GLOBALNESS0-NEXT: s_add_u32 s0, s0, s17
+; GLOBALNESS0-NEXT: v_mov_b32_e32 v41, v0
; GLOBALNESS0-NEXT: v_mov_b32_e32 v0, 0
; GLOBALNESS0-NEXT: s_addc_u32 s1, s1, 0
; GLOBALNESS0-NEXT: v_mov_b32_e32 v1, 0x40994400
@@ -365,13 +360,15 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[52:53], 1, v0
; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[44:45], 1, v1
; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[46:47], 1, v3
+; GLOBALNESS0-NEXT: v_mov_b32_e32 v46, 0x80
; GLOBALNESS0-NEXT: s_mov_b32 s68, s16
; GLOBALNESS0-NEXT: s_mov_b64 s[38:39], s[8:9]
; GLOBALNESS0-NEXT: s_mov_b32 s69, s15
; GLOBALNESS0-NEXT: s_mov_b32 s70, s14
; GLOBALNESS0-NEXT: s_mov_b64 s[34:35], s[10:11]
+; GLOBALNESS0-NEXT: v_mov_b32_e32 v47, 0
; GLOBALNESS0-NEXT: s_mov_b32 s32, 0
-; GLOBALNESS0-NEXT: ; implicit-def: $vgpr44_vgpr45
+; GLOBALNESS0-NEXT: ; implicit-def: $vgpr56_vgpr57
; GLOBALNESS0-NEXT: s_waitcnt vmcnt(0)
; GLOBALNESS0-NEXT: v_cmp_gt_i32_e32 vcc, 0, v2
; GLOBALNESS0-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
@@ -398,17 +395,15 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
; GLOBALNESS0-NEXT: .LBB1_3: ; %Flow28
; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1
; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[6:7]
-; GLOBALNESS0-NEXT: v_pk_mov_b32 v[44:45], v[0:1], v[0:1] op_sel:[0,1]
+; GLOBALNESS0-NEXT: v_pk_mov_b32 v[56:57], v[0:1], v[0:1] op_sel:[0,1]
; GLOBALNESS0-NEXT: s_cbranch_vccnz .LBB1_30
; GLOBALNESS0-NEXT: .LBB1_4: ; %bb5
; GLOBALNESS0-NEXT: ; =>This Loop Header: Depth=1
; GLOBALNESS0-NEXT: ; Child Loop BB1_16 Depth 2
-; GLOBALNESS0-NEXT: v_mov_b32_e32 v0, 0x80
-; GLOBALNESS0-NEXT: v_mov_b32_e32 v1, 0
-; GLOBALNESS0-NEXT: flat_load_dword v40, v[0:1]
+; GLOBALNESS0-NEXT: flat_load_dword v40, v[46:47]
; GLOBALNESS0-NEXT: s_add_u32 s8, s38, 40
; GLOBALNESS0-NEXT: buffer_store_dword v42, off, s[0:3], 0
-; GLOBALNESS0-NEXT: flat_load_dword v46, v[0:1]
+; GLOBALNESS0-NEXT: flat_load_dword v58, v[46:47]
; GLOBALNESS0-NEXT: s_addc_u32 s9, s39, 0
; GLOBALNESS0-NEXT: s_getpc_b64 s[4:5]
; GLOBALNESS0-NEXT: s_add_u32 s4, s4, wobble@gotpcrel32@lo+4
@@ -452,8 +447,7 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
; GLOBALNESS0-NEXT: s_cbranch_vccz .LBB1_24
; GLOBALNESS0-NEXT: ; %bb.10: ; %baz.exit.i
; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1
-; GLOBALNESS0-NEXT: v_pk_mov_b32 v[2:3], 0, 0
-; GLOBALNESS0-NEXT: flat_load_dword v0, v[2:3]
+; GLOBALNESS0-NEXT: flat_load_dword v0, v[44:45]
; GLOBALNESS0-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GLOBALNESS0-NEXT: v_cmp_gt_i32_e64 s[62:63], 0, v0
; GLOBALNESS0-NEXT: v_mov_b32_e32 v0, 0
@@ -462,17 +456,16 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
; GLOBALNESS0-NEXT: s_cbranch_execz .LBB1_26
; GLOBALNESS0-NEXT: ; %bb.11: ; %bb33.i
; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1
-; GLOBALNESS0-NEXT: global_load_dwordx2 v[0:1], v[2:3], off
+; GLOBALNESS0-NEXT: global_load_dwordx2 v[0:1], v[44:45], off
; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[54:55]
; GLOBALNESS0-NEXT: s_cbranch_vccnz .LBB1_13
; GLOBALNESS0-NEXT: ; %bb.12: ; %bb39.i
; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1
; GLOBALNESS0-NEXT: v_mov_b32_e32 v43, v42
-; GLOBALNESS0-NEXT: v_pk_mov_b32 v[2:3], 0, 0
-; GLOBALNESS0-NEXT: global_store_dwordx2 v[2:3], v[42:43], off
+; GLOBALNESS0-NEXT: global_store_dwordx2 v[44:45], v[42:43], off
; GLOBALNESS0-NEXT: .LBB1_13: ; %bb44.lr.ph.i
; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1
-; GLOBALNESS0-NEXT: v_cmp_ne_u32_e32 vcc, 0, v46
+; GLOBALNESS0-NEXT: v_cmp_ne_u32_e32 vcc, 0, v58
; GLOBALNESS0-NEXT: v_cndmask_b32_e32 v2, 0, v40, vcc
; GLOBALNESS0-NEXT: s_waitcnt vmcnt(0)
; GLOBALNESS0-NEXT: v_cmp_nlt_f64_e32 vcc, 0, v[0:1]
@@ -529,7 +522,6 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
; GLOBALNESS0-NEXT: v_mov_b32_e32 v31, v41
; GLOBALNESS0-NEXT: s_waitcnt lgkmcnt(0)
; GLOBALNESS0-NEXT: s_swappc_b64 s[30:31], s[78:79]
-; GLOBALNESS0-NEXT: v_pk_mov_b32 v[46:47], 0, 0
; GLOBALNESS0-NEXT: s_mov_b64 s[4:5], s[40:41]
; GLOBALNESS0-NEXT: s_mov_b64 s[6:7], s[36:37]
; GLOBALNESS0-NEXT: s_mov_b64 s[8:9], s[72:73]
@@ -538,14 +530,14 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
; GLOBALNESS0-NEXT: s_mov_b32 s13, s69
; GLOBALNESS0-NEXT: s_mov_b32 s14, s68
; GLOBALNESS0-NEXT: v_mov_b32_e32 v31, v41
-; GLOBALNESS0-NEXT: global_store_dwordx2 v[46:47], v[44:45], off
+; GLOBALNESS0-NEXT: global_store_dwordx2 v[44:45], v[56:57], off
; GLOBALNESS0-NEXT: s_swappc_b64 s[30:31], s[78:79]
; GLOBALNESS0-NEXT: s_and_saveexec_b64 s[4:5], s[64:65]
; GLOBALNESS0-NEXT: s_cbranch_execz .LBB1_14
; GLOBALNESS0-NEXT: ; %bb.23: ; %bb62.i
; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_16 Depth=2
; GLOBALNESS0-NEXT: v_mov_b32_e32 v43, v42
-; GLOBALNESS0-NEXT: global_store_dwordx2 v[46:47], v[42:43], off
+; GLOBALNESS0-NEXT: global_store_dwordx2 v[44:45], v[42:43], off
; GLOBALNESS0-NEXT: s_branch .LBB1_14
; GLOBALNESS0-NEXT: .LBB1_24: ; in Loop: Header=BB1_4 Depth=1
; GLOBALNESS0-NEXT: s_mov_b64 s[6:7], -1
@@ -566,14 +558,12 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
; GLOBALNESS0-NEXT: ; %bb.28: ; %bb69.i
; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1
; GLOBALNESS0-NEXT: v_mov_b32_e32 v43, v42
-; GLOBALNESS0-NEXT: v_pk_mov_b32 v[2:3], 0, 0
-; GLOBALNESS0-NEXT: global_store_dwordx2 v[2:3], v[42:43], off
+; GLOBALNESS0-NEXT: global_store_dwordx2 v[44:45], v[42:43], off
; GLOBALNESS0-NEXT: s_branch .LBB1_1
; GLOBALNESS0-NEXT: .LBB1_29: ; %bb73.i
; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1
; GLOBALNESS0-NEXT: v_mov_b32_e32 v43, v42
-; GLOBALNESS0-NEXT: v_pk_mov_b32 v[2:3], 0, 0
-; GLOBALNESS0-NEXT: global_store_dwordx2 v[2:3], v[42:43], off
+; GLOBALNESS0-NEXT: global_store_dwordx2 v[44:45], v[42:43], off
; GLOBALNESS0-NEXT: s_branch .LBB1_2
; GLOBALNESS0-NEXT: .LBB1_30: ; %loop.exit.guard
; GLOBALNESS0-NEXT: s_andn2_b64 vcc, exec, s[4:5]
diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-remat.mir b/llvm/test/CodeGen/AMDGPU/vgpr-remat.mir
index 08f5550f3b08a..4b967969366f4 100644
--- a/llvm/test/CodeGen/AMDGPU/vgpr-remat.mir
+++ b/llvm/test/CodeGen/AMDGPU/vgpr-remat.mir
@@ -4,10 +4,10 @@
# Check that we get two move-immediates into %1 and %2, instead of a copy from
# %1 to %2, because that would introduce a dependency and maybe a stall.
---
-name: f
+name: remat_v_mov_b32_e32
tracksRegLiveness: true
body: |
- ; CHECK-LABEL: name: f
+ ; CHECK-LABEL: name: remat_v_mov_b32_e32
; CHECK: bb.0:
; CHECK-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000)
; CHECK-NEXT: liveins: $sgpr0
@@ -46,3 +46,47 @@ body: |
%4.sub1:vreg_96 = COPY %2:vgpr_32
S_ENDPGM 0, implicit %4
...
+
+---
+name: remat_v_mov_b64_pseduo
+tracksRegLiveness: true
+body: |
+ ; CHECK-LABEL: name: remat_v_mov_b64_pseduo
+ ; CHECK: bb.0:
+ ; CHECK-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000)
+ ; CHECK-NEXT: liveins: $sgpr0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: undef [[V_MOV_B:%[0-9]+]].sub0_sub1:vreg_192_align2 = V_MOV_B64_PSEUDO 0, implicit $exec
+ ; CHECK-NEXT: [[V_MOV_B:%[0-9]+]].sub2_sub3:vreg_192_align2 = V_MOV_B64_PSEUDO 0, implicit $exec
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
+ ; CHECK-NEXT: $exec = S_MOV_B64_term [[COPY]]
+ ; CHECK-NEXT: S_CBRANCH_EXECZ %bb.2, implicit $exec
+ ; CHECK-NEXT: S_BRANCH %bb.1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.1:
+ ; CHECK-NEXT: successors: %bb.2(0x80000000)
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[V_MOV_B:%[0-9]+]].sub0_sub1:vreg_192_align2 = V_MUL_F64_e64 0, [[V_MOV_B]].sub0_sub1, 0, [[V_MOV_B]].sub0_sub1, 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: [[V_MOV_B:%[0-9]+]].sub2_sub3:vreg_192_align2 = V_MUL_F64_e64 0, [[V_MOV_B]].sub2_sub3, 0, [[V_MOV_B]].sub2_sub3, 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.2:
+ ; CHECK-NEXT: S_ENDPGM 0, implicit [[V_MOV_B]]
+ bb.0:
+ liveins: $sgpr0
+ %0:vreg_64_align2 = V_MOV_B64_PSEUDO 0, implicit $exec
+ %1:vreg_64_align2 = COPY %0:vreg_64_align2
+ %2:vreg_64_align2 = COPY %0:vreg_64_align2
+ %3:sreg_64 = COPY $sgpr0_sgpr1
+ $exec = S_MOV_B64_term %3:sreg_64
+ S_CBRANCH_EXECZ %bb.2, implicit $exec
+ S_BRANCH %bb.1
+
+ bb.1:
+ %1:vreg_64_align2 = V_MUL_F64_e64 0, %1:vreg_64_align2, 0, %1:vreg_64_align2, 0, 0, implicit $mode, implicit $exec
+ %2:vreg_64_align2 = V_MUL_F64_e64 0, %2:vreg_64_align2, 0, %2:vreg_64_align2, 0, 0, implicit $mode, implicit $exec
+
+ bb.2:
+ undef %4.sub0_sub1:vreg_192 = COPY %1:vreg_64_align2
+ %4.sub2_sub3:vreg_192 = COPY %2:vreg_64_align2
+ S_ENDPGM 0, implicit %4
+...
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Seems reasonable. Which part of the remat implementation cares about the VOP1 flag?
@@ -46,3 +46,47 @@ body: |
%4.sub1:vreg_96 = COPY %2:vgpr_32
S_ENDPGM 0, implicit %4
...

---
name: remat_v_mov_b64_pseduo
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Typo "pseduo"
canRemat, which I think is just hacking around the default behavior of skipping rematerialization because of the exec physreg use.
Couldn't it check |
That would also cover memory instructions |
I only see a couple of weird BUF and FLAT instructions that also set |
This is mostly true, and it tricks the rematerialization code into handling this without special casing it.
This is mostly true, and it tricks the rematerialization
code into handling this without special casing it.