Skip to content

Commit 9ca1323

Browse files
authored
[AMDGPU] Fix crash due to missing check for FLAT instructions that don't use vector registers when computing VALU hazard (#123627)
1 parent f233a54 commit 9ca1323

File tree

3 files changed

+93
-3
lines changed

3 files changed

+93
-3
lines changed

llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -858,9 +858,12 @@ int GCNHazardRecognizer::createsVALUHazard(const MachineInstr &MI) {
858858
}
859859

860860
if (TII->isFLAT(MI)) {
861-
int DataIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdata);
862-
if (AMDGPU::getRegBitWidth(Desc.operands()[DataIdx].RegClass) > 64)
863-
return DataIdx;
861+
// There is no hazard if the instruction does not use vector regs
862+
if (VDataIdx == -1)
863+
return -1;
864+
865+
if (AMDGPU::getRegBitWidth(VDataRCID) > 64)
866+
return VDataIdx;
864867
}
865868

866869
return -1;
Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,67 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
2+
; RUN: llc -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck -check-prefix=GFX942 %s
3+
; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -O0 < %s | FileCheck -check-prefix=GFX90A %s
4+
5+
@G = addrspace(1) global <2 x i32> splat (i32 5)
6+
7+
define amdgpu_ps void @global_load_lds_dword_saddr(ptr addrspace(1) inreg nocapture %gptr, ptr addrspace(3) nocapture %lptr) {
8+
; GFX942-LABEL: global_load_lds_dword_saddr:
9+
; GFX942: ; %bb.0: ; %main_body
10+
; GFX942-NEXT: v_readfirstlane_b32 s2, v0
11+
; GFX942-NEXT: v_mov_b32_e32 v2, 0
12+
; GFX942-NEXT: s_mov_b32 m0, s2
13+
; GFX942-NEXT: s_nop 0
14+
; GFX942-NEXT: global_load_lds_dword v2, s[0:1] offset:32 nt
15+
; GFX942-NEXT: s_getpc_b64 s[0:1]
16+
; GFX942-NEXT: s_add_u32 s0, s0, G@gotpcrel32@lo+4
17+
; GFX942-NEXT: s_addc_u32 s1, s1, G@gotpcrel32@hi+12
18+
; GFX942-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
19+
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
20+
; GFX942-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
21+
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
22+
; GFX942-NEXT: s_mul_i32 s3, s3, 10
23+
; GFX942-NEXT: s_mul_i32 s2, s2, 10
24+
; GFX942-NEXT: v_mov_b32_e32 v0, s2
25+
; GFX942-NEXT: v_mov_b32_e32 v1, s3
26+
; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
27+
; GFX942-NEXT: s_endpgm
28+
;
29+
; GFX90A-LABEL: global_load_lds_dword_saddr:
30+
; GFX90A: ; %bb.0: ; %main_body
31+
; GFX90A-NEXT: v_mov_b32_e32 v1, v0
32+
; GFX90A-NEXT: s_mov_b32 s2, s0
33+
; GFX90A-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
34+
; GFX90A-NEXT: s_mov_b32 s3, s1
35+
; GFX90A-NEXT: ; kill: def $sgpr0_sgpr1 killed $sgpr2_sgpr3
36+
; GFX90A-NEXT: s_getpc_b64 s[0:1]
37+
; GFX90A-NEXT: s_add_u32 s0, s0, G@gotpcrel32@lo+4
38+
; GFX90A-NEXT: s_addc_u32 s1, s1, G@gotpcrel32@hi+12
39+
; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
40+
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
41+
; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x0
42+
; GFX90A-NEXT: v_mov_b32_e32 v0, 0
43+
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
44+
; GFX90A-NEXT: s_mov_b32 s4, s9
45+
; GFX90A-NEXT: s_mov_b32 s6, 10
46+
; GFX90A-NEXT: s_mul_i32 s4, s4, s6
47+
; GFX90A-NEXT: s_mov_b32 s5, s8
48+
; GFX90A-NEXT: s_mul_i32 s5, s5, s6
49+
; GFX90A-NEXT: v_mov_b32_e32 v2, s5
50+
; GFX90A-NEXT: v_mov_b32_e32 v4, s4
51+
; GFX90A-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
52+
; GFX90A-NEXT: v_mov_b32_e32 v3, v4
53+
; GFX90A-NEXT: ; implicit-def: $sgpr4
54+
; GFX90A-NEXT: v_readfirstlane_b32 s4, v1
55+
; GFX90A-NEXT: s_mov_b32 m0, s4
56+
; GFX90A-NEXT: s_nop 0
57+
; GFX90A-NEXT: global_load_dword v0, s[2:3] offset:32 slc lds
58+
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
59+
; GFX90A-NEXT: global_store_dwordx2 v0, v[2:3], s[0:1]
60+
; GFX90A-NEXT: s_endpgm
61+
main_body:
62+
%LGV = load <2 x i32>, ptr addrspace(1) @G, align 8
63+
%B = mul <2 x i32> %LGV, splat (i32 10)
64+
call void @llvm.amdgcn.global.load.lds(ptr addrspace(1) %gptr, ptr addrspace(3) %lptr, i32 4, i32 32, i32 2)
65+
store <2 x i32> %B, ptr addrspace(1) @G, align 8
66+
ret void
67+
}
Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
2+
# RUN: llc -mtriple=amdgcn -mcpu=gfx942 -verify-machineinstrs -run-pass=post-RA-hazard-rec %s -o - | FileCheck -check-prefix=GCN %s
3+
4+
---
5+
name: test_flat_valu_hazard
6+
noVRegs: true
7+
body: |
8+
bb.0:
9+
liveins: $vgpr0, $vgpr1
10+
11+
; GCN-LABEL: name: test_flat_valu_hazard
12+
; GCN: liveins: $vgpr0, $vgpr1
13+
; GCN-NEXT: {{ $}}
14+
; GCN-NEXT: GLOBAL_LOAD_LDS_DWORD_SADDR killed $sgpr0_sgpr1, killed $vgpr0, 32, 2, implicit $m0, implicit $exec
15+
; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $vgpr1, implicit $exec, implicit $exec
16+
; GCN-NEXT: FLAT_STORE_DWORDX2 killed renamable $vgpr2_vgpr3, killed renamable $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
17+
GLOBAL_LOAD_LDS_DWORD_SADDR killed $sgpr0_sgpr1, killed $vgpr0, 32, 2, implicit $m0, implicit $exec
18+
$vgpr0 = V_MOV_B32_e32 killed $vgpr1, implicit $exec, implicit $exec
19+
FLAT_STORE_DWORDX2 killed renamable $vgpr2_vgpr3, killed renamable $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
20+
...

0 commit comments

Comments
 (0)