Skip to content

Commit 15bdb37

Browse files
committed
[AMDGPU] Fix crash due to assertion failure
Add a check for FLAT instructions that don't use vector registers when computing the VALU hazard. Change-Id: I558aed17b109047bdc64f8a7e5f419d4d37577cf
1 parent d70f54f commit 15bdb37

File tree

3 files changed

+160
-3
lines changed

3 files changed

+160
-3
lines changed

llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -858,9 +858,12 @@ int GCNHazardRecognizer::createsVALUHazard(const MachineInstr &MI) {
858858
}
859859

860860
if (TII->isFLAT(MI)) {
861-
int DataIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdata);
862-
if (AMDGPU::getRegBitWidth(Desc.operands()[DataIdx].RegClass) > 64)
863-
return DataIdx;
861+
// There is no hazard if the instruction does not use vector regs
862+
if (VDataIdx == -1)
863+
return -1;
864+
865+
if (AMDGPU::getRegBitWidth(VDataRCID) > 64)
866+
return VDataIdx;
864867
}
865868

866869
return -1;
Lines changed: 100 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,100 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
2+
; RUN: llc -mtriple=amdgcn -mcpu=gfx942 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX942 %s
3+
; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -O0 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX90A %s
4+
; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -O0 -global-isel=true -verify-machineinstrs < %s | FileCheck -check-prefix=GFX90A-GISEL %s
5+
6+
@G = global <2 x i32> splat (i32 5)
7+
8+
define amdgpu_ps void @global_load_lds_dword_saddr(ptr addrspace(1) inreg nocapture %gptr, ptr addrspace(3) nocapture %lptr) {
9+
; GFX942-LABEL: global_load_lds_dword_saddr:
10+
; GFX942: ; %bb.0: ; %main_body
11+
; GFX942-NEXT: s_getpc_b64 s[2:3]
12+
; GFX942-NEXT: s_add_u32 s2, s2, G@gotpcrel32@lo+4
13+
; GFX942-NEXT: s_addc_u32 s3, s3, G@gotpcrel32@hi+12
14+
; GFX942-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
15+
; GFX942-NEXT: v_mov_b32_e32 v1, 0
16+
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
17+
; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
18+
; GFX942-NEXT: flat_load_dwordx2 v[4:5], v[2:3]
19+
; GFX942-NEXT: v_readfirstlane_b32 s2, v0
20+
; GFX942-NEXT: s_mov_b32 m0, s2
21+
; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
22+
; GFX942-NEXT: v_mul_lo_u32 v0, v4, 10
23+
; GFX942-NEXT: global_load_lds_dword v1, s[0:1] offset:32 nt
24+
; GFX942-NEXT: v_mul_lo_u32 v1, v5, 10
25+
; GFX942-NEXT: s_waitcnt vmcnt(0)
26+
; GFX942-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
27+
; GFX942-NEXT: s_endpgm
28+
;
29+
; GFX90A-LABEL: global_load_lds_dword_saddr:
30+
; GFX90A: ; %bb.0: ; %main_body
31+
; GFX90A-NEXT: v_mov_b32_e32 v1, v0
32+
; GFX90A-NEXT: s_mov_b32 s2, s0
33+
; GFX90A-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
34+
; GFX90A-NEXT: s_mov_b32 s3, s1
35+
; GFX90A-NEXT: ; kill: def $sgpr0_sgpr1 killed $sgpr2_sgpr3
36+
; GFX90A-NEXT: s_getpc_b64 s[0:1]
37+
; GFX90A-NEXT: s_add_u32 s0, s0, G@gotpcrel32@lo+4
38+
; GFX90A-NEXT: s_addc_u32 s1, s1, G@gotpcrel32@hi+12
39+
; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
40+
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
41+
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[0:1], s[0:1] op_sel:[0,1]
42+
; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[2:3]
43+
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
44+
; GFX90A-NEXT: v_mov_b32_e32 v0, v3
45+
; GFX90A-NEXT: s_mov_b32 s4, 10
46+
; GFX90A-NEXT: v_mul_lo_u32 v0, v0, s4
47+
; GFX90A-NEXT: ; kill: def $vgpr2 killed $vgpr2 killed $vgpr2_vgpr3 killed $exec
48+
; GFX90A-NEXT: v_mul_lo_u32 v2, v2, s4
49+
; GFX90A-NEXT: ; implicit-def: $sgpr4
50+
; GFX90A-NEXT: ; implicit-def: $sgpr4
51+
; GFX90A-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
52+
; GFX90A-NEXT: v_mov_b32_e32 v3, v0
53+
; GFX90A-NEXT: v_mov_b32_e32 v0, 0
54+
; GFX90A-NEXT: ; implicit-def: $sgpr4
55+
; GFX90A-NEXT: v_readfirstlane_b32 s4, v1
56+
; GFX90A-NEXT: s_mov_b32 m0, s4
57+
; GFX90A-NEXT: s_nop 0
58+
; GFX90A-NEXT: global_load_dword v0, s[2:3] offset:32 slc lds
59+
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
60+
; GFX90A-NEXT: s_waitcnt vmcnt(0)
61+
; GFX90A-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
62+
; GFX90A-NEXT: s_endpgm
63+
;
64+
; GFX90A-GISEL-LABEL: global_load_lds_dword_saddr:
65+
; GFX90A-GISEL: ; %bb.0: ; %main_body
66+
; GFX90A-GISEL-NEXT: s_mov_b32 s2, s0
67+
; GFX90A-GISEL-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
68+
; GFX90A-GISEL-NEXT: s_mov_b32 s3, s1
69+
; GFX90A-GISEL-NEXT: s_mov_b32 s4, 10
70+
; GFX90A-GISEL-NEXT: s_getpc_b64 s[0:1]
71+
; GFX90A-GISEL-NEXT: s_add_u32 s0, s0, G@gotpcrel32@lo+4
72+
; GFX90A-GISEL-NEXT: s_addc_u32 s1, s1, G@gotpcrel32@hi+12
73+
; GFX90A-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
74+
; GFX90A-GISEL-NEXT: s_waitcnt lgkmcnt(0)
75+
; GFX90A-GISEL-NEXT: v_pk_mov_b32 v[2:3], s[0:1], s[0:1] op_sel:[0,1]
76+
; GFX90A-GISEL-NEXT: flat_load_dwordx2 v[4:5], v[2:3]
77+
; GFX90A-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
78+
; GFX90A-GISEL-NEXT: v_mov_b32_e32 v2, v4
79+
; GFX90A-GISEL-NEXT: v_mov_b32_e32 v1, v5
80+
; GFX90A-GISEL-NEXT: v_mov_b32_e32 v3, s4
81+
; GFX90A-GISEL-NEXT: v_mul_lo_u32 v2, v2, v3
82+
; GFX90A-GISEL-NEXT: v_mov_b32_e32 v3, s4
83+
; GFX90A-GISEL-NEXT: v_mul_lo_u32 v1, v1, v3
84+
; GFX90A-GISEL-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
85+
; GFX90A-GISEL-NEXT: v_mov_b32_e32 v3, v1
86+
; GFX90A-GISEL-NEXT: v_readfirstlane_b32 s4, v0
87+
; GFX90A-GISEL-NEXT: s_mov_b32 m0, s4
88+
; GFX90A-GISEL-NEXT: v_mov_b32_e32 v0, 0
89+
; GFX90A-GISEL-NEXT: global_load_dword v0, s[2:3] offset:32 slc lds
90+
; GFX90A-GISEL-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
91+
; GFX90A-GISEL-NEXT: s_waitcnt vmcnt(0)
92+
; GFX90A-GISEL-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
93+
; GFX90A-GISEL-NEXT: s_endpgm
94+
main_body:
95+
%LGV = load <2 x i32>, ptr @G, align 8
96+
%B = mul <2 x i32> %LGV, splat (i32 10)
97+
call void @llvm.amdgcn.global.load.lds(ptr addrspace(1) %gptr, ptr addrspace(3) %lptr, i32 4, i32 32, i32 2)
98+
store <2 x i32> %B, ptr @G, align 8
99+
ret void
100+
}
Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
1+
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
2+
# RUN: llc -mtriple=amdgcn -mcpu=gfx942 -verify-machineinstrs -run-pass post-RA-hazard-rec -o - %s | FileCheck %s
3+
4+
--- |
5+
@G = global <2 x i32> splat (i32 5)
6+
7+
define amdgpu_ps void @global_load_lds_dword_saddr(ptr addrspace(1) inreg nocapture %gptr, ptr addrspace(3) nocapture %lptr) #0 {
8+
main_body:
9+
%LGV = load <2 x i32>, ptr @G, align 8
10+
%B = mul <2 x i32> %LGV, splat (i32 10)
11+
call void @llvm.amdgcn.global.load.lds(ptr addrspace(1) %gptr, ptr addrspace(3) %lptr, i32 4, i32 32, i32 2)
12+
store <2 x i32> %B, ptr @G, align 8
13+
ret void
14+
}
15+
16+
attributes #0 = { "amdgpu-memory-bound"="true" "amdgpu-wave-limiter"="true" "target-cpu"="gfx942" }
17+
18+
...
19+
---
20+
name: global_load_lds_dword_saddr
21+
noVRegs: true
22+
body: |
23+
bb.0.main_body:
24+
liveins: $sgpr0, $sgpr1, $vgpr0
25+
26+
; CHECK-LABEL: name: global_load_lds_dword_saddr
27+
; CHECK: liveins: $sgpr0, $sgpr1, $vgpr0
28+
; CHECK-NEXT: {{ $}}
29+
; CHECK-NEXT: renamable $sgpr2_sgpr3 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @G, target-flags(amdgpu-gotprel32-hi) @G, implicit-def dead $scc
30+
; CHECK-NEXT: renamable $sgpr2_sgpr3 = S_LOAD_DWORDX2_IMM killed renamable $sgpr2_sgpr3, 0, 0 :: (dereferenceable invariant load (s64) from got, addrspace 4)
31+
; CHECK-NEXT: renamable $vgpr1 = V_MOV_B32_e32 0, implicit $exec
32+
; CHECK-NEXT: renamable $vgpr2_vgpr3 = COPY killed renamable $sgpr2_sgpr3
33+
; CHECK-NEXT: renamable $vgpr4_vgpr5 = FLAT_LOAD_DWORDX2 renamable $vgpr2_vgpr3, 0, 0, implicit $exec, implicit $flat_scr :: (dereferenceable load (s64) from @G)
34+
; CHECK-NEXT: renamable $sgpr2 = V_READFIRSTLANE_B32 killed $vgpr0, implicit $exec
35+
; CHECK-NEXT: $m0 = S_MOV_B32 killed renamable $sgpr2
36+
; CHECK-NEXT: S_NOP 0
37+
; CHECK-NEXT: GLOBAL_LOAD_LDS_DWORD_SADDR killed $sgpr0_sgpr1, killed $vgpr1, 32, 2, implicit $m0, implicit $exec :: (load (s32) from `ptr addrspace(1) poison` + 32, align 1, addrspace 1), (store (s32) into %ir.lptr + 32, addrspace 3)
38+
; CHECK-NEXT: renamable $vgpr1 = V_MUL_LO_U32_e64 $vgpr5, 10, implicit $exec
39+
; CHECK-NEXT: renamable $vgpr0 = V_MUL_LO_U32_e64 killed $vgpr4, 10, implicit $exec
40+
; CHECK-NEXT: FLAT_STORE_DWORDX2 killed renamable $vgpr2_vgpr3, killed renamable $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr :: (store (s64) into @G)
41+
; CHECK-NEXT: S_ENDPGM 0
42+
renamable $sgpr2_sgpr3 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @G, target-flags(amdgpu-gotprel32-hi) @G, implicit-def dead $scc
43+
renamable $sgpr2_sgpr3 = S_LOAD_DWORDX2_IMM killed renamable $sgpr2_sgpr3, 0, 0 :: (dereferenceable invariant load (s64) from got, addrspace 4)
44+
renamable $vgpr1 = V_MOV_B32_e32 0, implicit $exec
45+
renamable $vgpr2_vgpr3 = COPY killed renamable $sgpr2_sgpr3
46+
renamable $vgpr4_vgpr5 = FLAT_LOAD_DWORDX2 renamable $vgpr2_vgpr3, 0, 0, implicit $exec, implicit $flat_scr :: (dereferenceable load (s64) from @G)
47+
renamable $sgpr2 = V_READFIRSTLANE_B32 killed $vgpr0, implicit $exec
48+
$m0 = S_MOV_B32 killed renamable $sgpr2
49+
GLOBAL_LOAD_LDS_DWORD_SADDR killed $sgpr0_sgpr1, killed $vgpr1, 32, 2, implicit $m0, implicit $exec :: (load (s32) from `ptr addrspace(1) poison` + 32, align 1, addrspace 1), (store (s32) into %ir.lptr + 32, addrspace 3)
50+
renamable $vgpr1 = V_MUL_LO_U32_e64 $vgpr5, 10, implicit $exec
51+
renamable $vgpr0 = V_MUL_LO_U32_e64 killed $vgpr4, 10, implicit $exec
52+
FLAT_STORE_DWORDX2 killed renamable $vgpr2_vgpr3, killed renamable $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr :: (store (s64) into @G)
53+
S_ENDPGM 0
54+
...

0 commit comments

Comments
 (0)