Skip to content

Commit f95ad44

Browse files
authored
AMDGPU: Mark v_mov_b64_pseudo as a VOP1 instruction (#128677)
This is mostly true, and it lets the rematerialization code handle V_MOV_B64_PSEUDO without special-casing it.
1 parent 820aa43 commit f95ad44

File tree

4 files changed: 102 additions and 47 deletions.

llvm/lib/Target/AMDGPU/SIInstructions.td

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -136,6 +136,7 @@ def V_MOV_B64_PSEUDO : VPseudoInstSI <(outs VReg_64:$vdst),
136136
let isMoveImm = 1;
137137
let SchedRW = [Write64Bit];
138138
let Size = 4;
139+
let VOP1 = 1; // Not entirely correct, but close enough.
139140
let UseNamedOperandTable = 1;
140141
}
141142

llvm/test/CodeGen/AMDGPU/remat-sop.mir

Lines changed: 21 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -653,4 +653,24 @@ body: |
653653
S_ENDPGM 0
654654
...
655655

656-
656+
---
657+
name: test_remat_s_mov_b64_imm_pseudo
658+
tracksRegLiveness: true
659+
body: |
660+
bb.0:
661+
; GCN-LABEL: name: test_remat_s_mov_b64_imm_pseudo
662+
; GCN: renamable $sgpr0_sgpr1 = S_MOV_B64_IMM_PSEUDO 1
663+
; GCN-NEXT: renamable $sgpr2_sgpr3 = S_MOV_B64_IMM_PSEUDO 2
664+
; GCN-NEXT: S_NOP 0, implicit killed renamable $sgpr0_sgpr1
665+
; GCN-NEXT: S_NOP 0, implicit killed renamable $sgpr2_sgpr3
666+
; GCN-NEXT: renamable $sgpr0_sgpr1 = S_MOV_B64_IMM_PSEUDO 3
667+
; GCN-NEXT: S_NOP 0, implicit killed renamable $sgpr0_sgpr1
668+
; GCN-NEXT: S_ENDPGM 0
669+
%0:sgpr_64 = S_MOV_B64_IMM_PSEUDO 1
670+
%1:sgpr_64 = S_MOV_B64_IMM_PSEUDO 2
671+
%2:sgpr_64 = S_MOV_B64_IMM_PSEUDO 3
672+
S_NOP 0, implicit %0
673+
S_NOP 0, implicit %1
674+
S_NOP 0, implicit %2
675+
S_ENDPGM 0
676+
...

llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll

Lines changed: 34 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -34,10 +34,9 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
3434
; GLOBALNESS1-NEXT: s_mov_b64 s[36:37], s[6:7]
3535
; GLOBALNESS1-NEXT: s_load_dwordx4 s[76:79], s[8:9], 0x0
3636
; GLOBALNESS1-NEXT: s_load_dword s6, s[8:9], 0x14
37-
; GLOBALNESS1-NEXT: v_mov_b32_e32 v41, v0
3837
; GLOBALNESS1-NEXT: v_mov_b32_e32 v42, 0
39-
; GLOBALNESS1-NEXT: v_pk_mov_b32 v[0:1], 0, 0
40-
; GLOBALNESS1-NEXT: global_store_dword v[0:1], v42, off
38+
; GLOBALNESS1-NEXT: v_pk_mov_b32 v[44:45], 0, 0
39+
; GLOBALNESS1-NEXT: global_store_dword v[44:45], v42, off
4140
; GLOBALNESS1-NEXT: s_waitcnt lgkmcnt(0)
4241
; GLOBALNESS1-NEXT: global_load_dword v2, v42, s[76:77]
4342
; GLOBALNESS1-NEXT: s_mov_b64 s[40:41], s[4:5]
@@ -46,6 +45,7 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
4645
; GLOBALNESS1-NEXT: s_add_u32 flat_scratch_lo, s12, s17
4746
; GLOBALNESS1-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
4847
; GLOBALNESS1-NEXT: s_add_u32 s0, s0, s17
48+
; GLOBALNESS1-NEXT: v_mov_b32_e32 v41, v0
4949
; GLOBALNESS1-NEXT: v_mov_b32_e32 v0, 0
5050
; GLOBALNESS1-NEXT: s_addc_u32 s1, s1, 0
5151
; GLOBALNESS1-NEXT: v_mov_b32_e32 v1, 0x40994400
@@ -73,13 +73,15 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
7373
; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[52:53], 1, v0
7474
; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[44:45], 1, v1
7575
; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[46:47], 1, v3
76+
; GLOBALNESS1-NEXT: v_mov_b32_e32 v46, 0x80
7677
; GLOBALNESS1-NEXT: s_mov_b32 s70, s16
7778
; GLOBALNESS1-NEXT: s_mov_b64 s[38:39], s[8:9]
7879
; GLOBALNESS1-NEXT: s_mov_b32 s71, s15
7980
; GLOBALNESS1-NEXT: s_mov_b32 s72, s14
8081
; GLOBALNESS1-NEXT: s_mov_b64 s[34:35], s[10:11]
82+
; GLOBALNESS1-NEXT: v_mov_b32_e32 v47, 0
8183
; GLOBALNESS1-NEXT: s_mov_b32 s32, 0
82-
; GLOBALNESS1-NEXT: ; implicit-def: $vgpr44_vgpr45
84+
; GLOBALNESS1-NEXT: ; implicit-def: $vgpr56_vgpr57
8385
; GLOBALNESS1-NEXT: s_waitcnt vmcnt(0)
8486
; GLOBALNESS1-NEXT: v_cmp_gt_i32_e32 vcc, 0, v2
8587
; GLOBALNESS1-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
@@ -106,17 +108,15 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
106108
; GLOBALNESS1-NEXT: .LBB1_3: ; %Flow28
107109
; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1
108110
; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[6:7]
109-
; GLOBALNESS1-NEXT: v_pk_mov_b32 v[44:45], v[0:1], v[0:1] op_sel:[0,1]
111+
; GLOBALNESS1-NEXT: v_pk_mov_b32 v[56:57], v[0:1], v[0:1] op_sel:[0,1]
110112
; GLOBALNESS1-NEXT: s_cbranch_vccnz .LBB1_30
111113
; GLOBALNESS1-NEXT: .LBB1_4: ; %bb5
112114
; GLOBALNESS1-NEXT: ; =>This Loop Header: Depth=1
113115
; GLOBALNESS1-NEXT: ; Child Loop BB1_16 Depth 2
114-
; GLOBALNESS1-NEXT: v_mov_b32_e32 v0, 0x80
115-
; GLOBALNESS1-NEXT: v_mov_b32_e32 v1, 0
116-
; GLOBALNESS1-NEXT: flat_load_dword v40, v[0:1]
116+
; GLOBALNESS1-NEXT: flat_load_dword v40, v[46:47]
117117
; GLOBALNESS1-NEXT: s_add_u32 s8, s38, 40
118118
; GLOBALNESS1-NEXT: buffer_store_dword v42, off, s[0:3], 0
119-
; GLOBALNESS1-NEXT: flat_load_dword v46, v[0:1]
119+
; GLOBALNESS1-NEXT: flat_load_dword v58, v[46:47]
120120
; GLOBALNESS1-NEXT: s_addc_u32 s9, s39, 0
121121
; GLOBALNESS1-NEXT: s_getpc_b64 s[4:5]
122122
; GLOBALNESS1-NEXT: s_add_u32 s4, s4, wobble@gotpcrel32@lo+4
@@ -160,8 +160,7 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
160160
; GLOBALNESS1-NEXT: s_cbranch_vccz .LBB1_24
161161
; GLOBALNESS1-NEXT: ; %bb.10: ; %baz.exit.i
162162
; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1
163-
; GLOBALNESS1-NEXT: v_pk_mov_b32 v[2:3], 0, 0
164-
; GLOBALNESS1-NEXT: flat_load_dword v0, v[2:3]
163+
; GLOBALNESS1-NEXT: flat_load_dword v0, v[44:45]
165164
; GLOBALNESS1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
166165
; GLOBALNESS1-NEXT: v_cmp_gt_i32_e64 s[62:63], 0, v0
167166
; GLOBALNESS1-NEXT: v_mov_b32_e32 v0, 0
@@ -170,17 +169,16 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
170169
; GLOBALNESS1-NEXT: s_cbranch_execz .LBB1_26
171170
; GLOBALNESS1-NEXT: ; %bb.11: ; %bb33.i
172171
; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1
173-
; GLOBALNESS1-NEXT: global_load_dwordx2 v[0:1], v[2:3], off
172+
; GLOBALNESS1-NEXT: global_load_dwordx2 v[0:1], v[44:45], off
174173
; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[54:55]
175174
; GLOBALNESS1-NEXT: s_cbranch_vccnz .LBB1_13
176175
; GLOBALNESS1-NEXT: ; %bb.12: ; %bb39.i
177176
; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1
178177
; GLOBALNESS1-NEXT: v_mov_b32_e32 v43, v42
179-
; GLOBALNESS1-NEXT: v_pk_mov_b32 v[2:3], 0, 0
180-
; GLOBALNESS1-NEXT: global_store_dwordx2 v[2:3], v[42:43], off
178+
; GLOBALNESS1-NEXT: global_store_dwordx2 v[44:45], v[42:43], off
181179
; GLOBALNESS1-NEXT: .LBB1_13: ; %bb44.lr.ph.i
182180
; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1
183-
; GLOBALNESS1-NEXT: v_cmp_ne_u32_e32 vcc, 0, v46
181+
; GLOBALNESS1-NEXT: v_cmp_ne_u32_e32 vcc, 0, v58
184182
; GLOBALNESS1-NEXT: v_cndmask_b32_e32 v2, 0, v40, vcc
185183
; GLOBALNESS1-NEXT: s_waitcnt vmcnt(0)
186184
; GLOBALNESS1-NEXT: v_cmp_nlt_f64_e32 vcc, 0, v[0:1]
@@ -237,7 +235,6 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
237235
; GLOBALNESS1-NEXT: v_mov_b32_e32 v31, v41
238236
; GLOBALNESS1-NEXT: s_waitcnt lgkmcnt(0)
239237
; GLOBALNESS1-NEXT: s_swappc_b64 s[30:31], s[76:77]
240-
; GLOBALNESS1-NEXT: v_pk_mov_b32 v[46:47], 0, 0
241238
; GLOBALNESS1-NEXT: s_mov_b64 s[4:5], s[40:41]
242239
; GLOBALNESS1-NEXT: s_mov_b64 s[6:7], s[36:37]
243240
; GLOBALNESS1-NEXT: s_mov_b64 s[8:9], s[68:69]
@@ -246,14 +243,14 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
246243
; GLOBALNESS1-NEXT: s_mov_b32 s13, s71
247244
; GLOBALNESS1-NEXT: s_mov_b32 s14, s70
248245
; GLOBALNESS1-NEXT: v_mov_b32_e32 v31, v41
249-
; GLOBALNESS1-NEXT: global_store_dwordx2 v[46:47], v[44:45], off
246+
; GLOBALNESS1-NEXT: global_store_dwordx2 v[44:45], v[56:57], off
250247
; GLOBALNESS1-NEXT: s_swappc_b64 s[30:31], s[76:77]
251248
; GLOBALNESS1-NEXT: s_and_saveexec_b64 s[4:5], s[64:65]
252249
; GLOBALNESS1-NEXT: s_cbranch_execz .LBB1_14
253250
; GLOBALNESS1-NEXT: ; %bb.23: ; %bb62.i
254251
; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_16 Depth=2
255252
; GLOBALNESS1-NEXT: v_mov_b32_e32 v43, v42
256-
; GLOBALNESS1-NEXT: global_store_dwordx2 v[46:47], v[42:43], off
253+
; GLOBALNESS1-NEXT: global_store_dwordx2 v[44:45], v[42:43], off
257254
; GLOBALNESS1-NEXT: s_branch .LBB1_14
258255
; GLOBALNESS1-NEXT: .LBB1_24: ; in Loop: Header=BB1_4 Depth=1
259256
; GLOBALNESS1-NEXT: s_mov_b64 s[6:7], -1
@@ -274,14 +271,12 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
274271
; GLOBALNESS1-NEXT: ; %bb.28: ; %bb69.i
275272
; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1
276273
; GLOBALNESS1-NEXT: v_mov_b32_e32 v43, v42
277-
; GLOBALNESS1-NEXT: v_pk_mov_b32 v[2:3], 0, 0
278-
; GLOBALNESS1-NEXT: global_store_dwordx2 v[2:3], v[42:43], off
274+
; GLOBALNESS1-NEXT: global_store_dwordx2 v[44:45], v[42:43], off
279275
; GLOBALNESS1-NEXT: s_branch .LBB1_1
280276
; GLOBALNESS1-NEXT: .LBB1_29: ; %bb73.i
281277
; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1
282278
; GLOBALNESS1-NEXT: v_mov_b32_e32 v43, v42
283-
; GLOBALNESS1-NEXT: v_pk_mov_b32 v[2:3], 0, 0
284-
; GLOBALNESS1-NEXT: global_store_dwordx2 v[2:3], v[42:43], off
279+
; GLOBALNESS1-NEXT: global_store_dwordx2 v[44:45], v[42:43], off
285280
; GLOBALNESS1-NEXT: s_branch .LBB1_2
286281
; GLOBALNESS1-NEXT: .LBB1_30: ; %loop.exit.guard
287282
; GLOBALNESS1-NEXT: s_andn2_b64 vcc, exec, s[4:5]
@@ -326,10 +321,9 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
326321
; GLOBALNESS0-NEXT: s_mov_b64 s[36:37], s[6:7]
327322
; GLOBALNESS0-NEXT: s_load_dwordx4 s[72:75], s[8:9], 0x0
328323
; GLOBALNESS0-NEXT: s_load_dword s6, s[8:9], 0x14
329-
; GLOBALNESS0-NEXT: v_mov_b32_e32 v41, v0
330324
; GLOBALNESS0-NEXT: v_mov_b32_e32 v42, 0
331-
; GLOBALNESS0-NEXT: v_pk_mov_b32 v[0:1], 0, 0
332-
; GLOBALNESS0-NEXT: global_store_dword v[0:1], v42, off
325+
; GLOBALNESS0-NEXT: v_pk_mov_b32 v[44:45], 0, 0
326+
; GLOBALNESS0-NEXT: global_store_dword v[44:45], v42, off
333327
; GLOBALNESS0-NEXT: s_waitcnt lgkmcnt(0)
334328
; GLOBALNESS0-NEXT: global_load_dword v2, v42, s[72:73]
335329
; GLOBALNESS0-NEXT: s_mov_b64 s[40:41], s[4:5]
@@ -338,6 +332,7 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
338332
; GLOBALNESS0-NEXT: s_add_u32 flat_scratch_lo, s12, s17
339333
; GLOBALNESS0-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
340334
; GLOBALNESS0-NEXT: s_add_u32 s0, s0, s17
335+
; GLOBALNESS0-NEXT: v_mov_b32_e32 v41, v0
341336
; GLOBALNESS0-NEXT: v_mov_b32_e32 v0, 0
342337
; GLOBALNESS0-NEXT: s_addc_u32 s1, s1, 0
343338
; GLOBALNESS0-NEXT: v_mov_b32_e32 v1, 0x40994400
@@ -365,13 +360,15 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
365360
; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[52:53], 1, v0
366361
; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[44:45], 1, v1
367362
; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[46:47], 1, v3
363+
; GLOBALNESS0-NEXT: v_mov_b32_e32 v46, 0x80
368364
; GLOBALNESS0-NEXT: s_mov_b32 s68, s16
369365
; GLOBALNESS0-NEXT: s_mov_b64 s[38:39], s[8:9]
370366
; GLOBALNESS0-NEXT: s_mov_b32 s69, s15
371367
; GLOBALNESS0-NEXT: s_mov_b32 s70, s14
372368
; GLOBALNESS0-NEXT: s_mov_b64 s[34:35], s[10:11]
369+
; GLOBALNESS0-NEXT: v_mov_b32_e32 v47, 0
373370
; GLOBALNESS0-NEXT: s_mov_b32 s32, 0
374-
; GLOBALNESS0-NEXT: ; implicit-def: $vgpr44_vgpr45
371+
; GLOBALNESS0-NEXT: ; implicit-def: $vgpr56_vgpr57
375372
; GLOBALNESS0-NEXT: s_waitcnt vmcnt(0)
376373
; GLOBALNESS0-NEXT: v_cmp_gt_i32_e32 vcc, 0, v2
377374
; GLOBALNESS0-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
@@ -398,17 +395,15 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
398395
; GLOBALNESS0-NEXT: .LBB1_3: ; %Flow28
399396
; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1
400397
; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[6:7]
401-
; GLOBALNESS0-NEXT: v_pk_mov_b32 v[44:45], v[0:1], v[0:1] op_sel:[0,1]
398+
; GLOBALNESS0-NEXT: v_pk_mov_b32 v[56:57], v[0:1], v[0:1] op_sel:[0,1]
402399
; GLOBALNESS0-NEXT: s_cbranch_vccnz .LBB1_30
403400
; GLOBALNESS0-NEXT: .LBB1_4: ; %bb5
404401
; GLOBALNESS0-NEXT: ; =>This Loop Header: Depth=1
405402
; GLOBALNESS0-NEXT: ; Child Loop BB1_16 Depth 2
406-
; GLOBALNESS0-NEXT: v_mov_b32_e32 v0, 0x80
407-
; GLOBALNESS0-NEXT: v_mov_b32_e32 v1, 0
408-
; GLOBALNESS0-NEXT: flat_load_dword v40, v[0:1]
403+
; GLOBALNESS0-NEXT: flat_load_dword v40, v[46:47]
409404
; GLOBALNESS0-NEXT: s_add_u32 s8, s38, 40
410405
; GLOBALNESS0-NEXT: buffer_store_dword v42, off, s[0:3], 0
411-
; GLOBALNESS0-NEXT: flat_load_dword v46, v[0:1]
406+
; GLOBALNESS0-NEXT: flat_load_dword v58, v[46:47]
412407
; GLOBALNESS0-NEXT: s_addc_u32 s9, s39, 0
413408
; GLOBALNESS0-NEXT: s_getpc_b64 s[4:5]
414409
; GLOBALNESS0-NEXT: s_add_u32 s4, s4, wobble@gotpcrel32@lo+4
@@ -452,8 +447,7 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
452447
; GLOBALNESS0-NEXT: s_cbranch_vccz .LBB1_24
453448
; GLOBALNESS0-NEXT: ; %bb.10: ; %baz.exit.i
454449
; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1
455-
; GLOBALNESS0-NEXT: v_pk_mov_b32 v[2:3], 0, 0
456-
; GLOBALNESS0-NEXT: flat_load_dword v0, v[2:3]
450+
; GLOBALNESS0-NEXT: flat_load_dword v0, v[44:45]
457451
; GLOBALNESS0-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
458452
; GLOBALNESS0-NEXT: v_cmp_gt_i32_e64 s[62:63], 0, v0
459453
; GLOBALNESS0-NEXT: v_mov_b32_e32 v0, 0
@@ -462,17 +456,16 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
462456
; GLOBALNESS0-NEXT: s_cbranch_execz .LBB1_26
463457
; GLOBALNESS0-NEXT: ; %bb.11: ; %bb33.i
464458
; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1
465-
; GLOBALNESS0-NEXT: global_load_dwordx2 v[0:1], v[2:3], off
459+
; GLOBALNESS0-NEXT: global_load_dwordx2 v[0:1], v[44:45], off
466460
; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[54:55]
467461
; GLOBALNESS0-NEXT: s_cbranch_vccnz .LBB1_13
468462
; GLOBALNESS0-NEXT: ; %bb.12: ; %bb39.i
469463
; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1
470464
; GLOBALNESS0-NEXT: v_mov_b32_e32 v43, v42
471-
; GLOBALNESS0-NEXT: v_pk_mov_b32 v[2:3], 0, 0
472-
; GLOBALNESS0-NEXT: global_store_dwordx2 v[2:3], v[42:43], off
465+
; GLOBALNESS0-NEXT: global_store_dwordx2 v[44:45], v[42:43], off
473466
; GLOBALNESS0-NEXT: .LBB1_13: ; %bb44.lr.ph.i
474467
; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1
475-
; GLOBALNESS0-NEXT: v_cmp_ne_u32_e32 vcc, 0, v46
468+
; GLOBALNESS0-NEXT: v_cmp_ne_u32_e32 vcc, 0, v58
476469
; GLOBALNESS0-NEXT: v_cndmask_b32_e32 v2, 0, v40, vcc
477470
; GLOBALNESS0-NEXT: s_waitcnt vmcnt(0)
478471
; GLOBALNESS0-NEXT: v_cmp_nlt_f64_e32 vcc, 0, v[0:1]
@@ -529,7 +522,6 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
529522
; GLOBALNESS0-NEXT: v_mov_b32_e32 v31, v41
530523
; GLOBALNESS0-NEXT: s_waitcnt lgkmcnt(0)
531524
; GLOBALNESS0-NEXT: s_swappc_b64 s[30:31], s[78:79]
532-
; GLOBALNESS0-NEXT: v_pk_mov_b32 v[46:47], 0, 0
533525
; GLOBALNESS0-NEXT: s_mov_b64 s[4:5], s[40:41]
534526
; GLOBALNESS0-NEXT: s_mov_b64 s[6:7], s[36:37]
535527
; GLOBALNESS0-NEXT: s_mov_b64 s[8:9], s[72:73]
@@ -538,14 +530,14 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
538530
; GLOBALNESS0-NEXT: s_mov_b32 s13, s69
539531
; GLOBALNESS0-NEXT: s_mov_b32 s14, s68
540532
; GLOBALNESS0-NEXT: v_mov_b32_e32 v31, v41
541-
; GLOBALNESS0-NEXT: global_store_dwordx2 v[46:47], v[44:45], off
533+
; GLOBALNESS0-NEXT: global_store_dwordx2 v[44:45], v[56:57], off
542534
; GLOBALNESS0-NEXT: s_swappc_b64 s[30:31], s[78:79]
543535
; GLOBALNESS0-NEXT: s_and_saveexec_b64 s[4:5], s[64:65]
544536
; GLOBALNESS0-NEXT: s_cbranch_execz .LBB1_14
545537
; GLOBALNESS0-NEXT: ; %bb.23: ; %bb62.i
546538
; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_16 Depth=2
547539
; GLOBALNESS0-NEXT: v_mov_b32_e32 v43, v42
548-
; GLOBALNESS0-NEXT: global_store_dwordx2 v[46:47], v[42:43], off
540+
; GLOBALNESS0-NEXT: global_store_dwordx2 v[44:45], v[42:43], off
549541
; GLOBALNESS0-NEXT: s_branch .LBB1_14
550542
; GLOBALNESS0-NEXT: .LBB1_24: ; in Loop: Header=BB1_4 Depth=1
551543
; GLOBALNESS0-NEXT: s_mov_b64 s[6:7], -1
@@ -566,14 +558,12 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
566558
; GLOBALNESS0-NEXT: ; %bb.28: ; %bb69.i
567559
; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1
568560
; GLOBALNESS0-NEXT: v_mov_b32_e32 v43, v42
569-
; GLOBALNESS0-NEXT: v_pk_mov_b32 v[2:3], 0, 0
570-
; GLOBALNESS0-NEXT: global_store_dwordx2 v[2:3], v[42:43], off
561+
; GLOBALNESS0-NEXT: global_store_dwordx2 v[44:45], v[42:43], off
571562
; GLOBALNESS0-NEXT: s_branch .LBB1_1
572563
; GLOBALNESS0-NEXT: .LBB1_29: ; %bb73.i
573564
; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1
574565
; GLOBALNESS0-NEXT: v_mov_b32_e32 v43, v42
575-
; GLOBALNESS0-NEXT: v_pk_mov_b32 v[2:3], 0, 0
576-
; GLOBALNESS0-NEXT: global_store_dwordx2 v[2:3], v[42:43], off
566+
; GLOBALNESS0-NEXT: global_store_dwordx2 v[44:45], v[42:43], off
577567
; GLOBALNESS0-NEXT: s_branch .LBB1_2
578568
; GLOBALNESS0-NEXT: .LBB1_30: ; %loop.exit.guard
579569
; GLOBALNESS0-NEXT: s_andn2_b64 vcc, exec, s[4:5]

llvm/test/CodeGen/AMDGPU/vgpr-remat.mir

Lines changed: 46 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,10 +4,10 @@
44
# Check that we get two move-immediates into %1 and %2, instead of a copy from
55
# %1 to %2, because that would introduce a dependency and maybe a stall.
66
---
7-
name: f
7+
name: remat_v_mov_b32_e32
88
tracksRegLiveness: true
99
body: |
10-
; CHECK-LABEL: name: f
10+
; CHECK-LABEL: name: remat_v_mov_b32_e32
1111
; CHECK: bb.0:
1212
; CHECK-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000)
1313
; CHECK-NEXT: liveins: $sgpr0
@@ -46,3 +46,47 @@ body: |
4646
%4.sub1:vreg_96 = COPY %2:vgpr_32
4747
S_ENDPGM 0, implicit %4
4848
...
49+
50+
---
51+
name: remat_v_mov_b64_pseudo
52+
tracksRegLiveness: true
53+
body: |
54+
; CHECK-LABEL: name: remat_v_mov_b64_pseudo
55+
; CHECK: bb.0:
56+
; CHECK-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000)
57+
; CHECK-NEXT: liveins: $sgpr0
58+
; CHECK-NEXT: {{ $}}
59+
; CHECK-NEXT: undef [[V_MOV_B:%[0-9]+]].sub0_sub1:vreg_192_align2 = V_MOV_B64_PSEUDO 0, implicit $exec
60+
; CHECK-NEXT: [[V_MOV_B:%[0-9]+]].sub2_sub3:vreg_192_align2 = V_MOV_B64_PSEUDO 0, implicit $exec
61+
; CHECK-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
62+
; CHECK-NEXT: $exec = S_MOV_B64_term [[COPY]]
63+
; CHECK-NEXT: S_CBRANCH_EXECZ %bb.2, implicit $exec
64+
; CHECK-NEXT: S_BRANCH %bb.1
65+
; CHECK-NEXT: {{ $}}
66+
; CHECK-NEXT: bb.1:
67+
; CHECK-NEXT: successors: %bb.2(0x80000000)
68+
; CHECK-NEXT: {{ $}}
69+
; CHECK-NEXT: [[V_MOV_B:%[0-9]+]].sub0_sub1:vreg_192_align2 = V_MUL_F64_e64 0, [[V_MOV_B]].sub0_sub1, 0, [[V_MOV_B]].sub0_sub1, 0, 0, implicit $mode, implicit $exec
70+
; CHECK-NEXT: [[V_MOV_B:%[0-9]+]].sub2_sub3:vreg_192_align2 = V_MUL_F64_e64 0, [[V_MOV_B]].sub2_sub3, 0, [[V_MOV_B]].sub2_sub3, 0, 0, implicit $mode, implicit $exec
71+
; CHECK-NEXT: {{ $}}
72+
; CHECK-NEXT: bb.2:
73+
; CHECK-NEXT: S_ENDPGM 0, implicit [[V_MOV_B]]
74+
bb.0:
75+
liveins: $sgpr0
76+
%0:vreg_64_align2 = V_MOV_B64_PSEUDO 0, implicit $exec
77+
%1:vreg_64_align2 = COPY %0:vreg_64_align2
78+
%2:vreg_64_align2 = COPY %0:vreg_64_align2
79+
%3:sreg_64 = COPY $sgpr0_sgpr1
80+
$exec = S_MOV_B64_term %3:sreg_64
81+
S_CBRANCH_EXECZ %bb.2, implicit $exec
82+
S_BRANCH %bb.1
83+
84+
bb.1:
85+
%1:vreg_64_align2 = V_MUL_F64_e64 0, %1:vreg_64_align2, 0, %1:vreg_64_align2, 0, 0, implicit $mode, implicit $exec
86+
%2:vreg_64_align2 = V_MUL_F64_e64 0, %2:vreg_64_align2, 0, %2:vreg_64_align2, 0, 0, implicit $mode, implicit $exec
87+
88+
bb.2:
89+
undef %4.sub0_sub1:vreg_192 = COPY %1:vreg_64_align2
90+
%4.sub2_sub3:vreg_192 = COPY %2:vreg_64_align2
91+
S_ENDPGM 0, implicit %4
92+
...

0 commit comments

Comments
 (0)