Skip to content

Commit 0e43f42

Browse files
committed
Fixes for SWDEV-508901
Cherry-pick of the remaining stack from llvm#126148 plus a downstream test. This is a combination of 2 commits. [AMDGPU] Remove dead function metadata after amdgpu-lower-kernel-arguments: The verifier ensures function !dbg metadata is unique across the module, so ensure the old nameless function we leave behind doesn't violate this invariant. Removing the function via e.g. eraseFromParent seems like a better option, but doesn't seem to be legal from a FunctionPass. [AMDGPU] Push amdgpu-preload-kern-arg-prolog after livedebugvalues: This is effectively a workaround for a bug in livedebugvalues, but it also seems to be a potential general improvement, as BB sections seem like they could ruin the special 256-byte prelude scheme that amdgpu-preload-kern-arg-prolog requires anyway. Moving it even later doesn't seem to have any material impact, and just adds livedebugvalues to the list of things which no longer have to deal with pseudo multiple-entry functions.
1 parent 808bf02 commit 0e43f42

File tree

5 files changed

+316
-8
lines changed

5 files changed

+316
-8
lines changed

llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -132,6 +132,7 @@ class PreloadKernelArgInfo {
132132
NF->setAttributes(AL);
133133
F.replaceAllUsesWith(NF);
134134
F.setCallingConv(CallingConv::C);
135+
F.clearMetadata();
135136

136137
return NF;
137138
}

llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1189,6 +1189,7 @@ class GCNPassConfig final : public AMDGPUPassConfig {
11891189
void addPostRegAlloc() override;
11901190
void addPreSched2() override;
11911191
void addPreEmitPass() override;
1192+
void addPostBBSections() override;
11921193
};
11931194

11941195
} // end anonymous namespace
@@ -1731,6 +1732,11 @@ void GCNPassConfig::addPreEmitPass() {
17311732
addPass(&AMDGPUInsertDelayAluID);
17321733

17331734
addPass(&BranchRelaxationPassID);
1735+
}
1736+
1737+
void GCNPassConfig::addPostBBSections() {
1738+
// We run this later to avoid passes like livedebugvalues and BBSections
1739+
// having to deal with the apparent multi-entry functions we may generate.
17341740
addPass(createAMDGPUPreloadKernArgPrologLegacyPass());
17351741
}
17361742

llvm/test/CodeGen/AMDGPU/llc-pipeline.ll

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -147,11 +147,11 @@
147147
; GCN-O0-NEXT: Post RA hazard recognizer
148148
; GCN-O0-NEXT: AMDGPU Insert waits for SGPR read hazards
149149
; GCN-O0-NEXT: Branch relaxation pass
150-
; GCN-O0-NEXT: AMDGPU Preload Kernel Arguments Prolog
151150
; GCN-O0-NEXT: Register Usage Information Collector Pass
152151
; GCN-O0-NEXT: Remove Loads Into Fake Uses
153152
; GCN-O0-NEXT: Live DEBUG_VALUE analysis
154153
; GCN-O0-NEXT: Machine Sanitizer Binary Metadata
154+
; GCN-O0-NEXT: AMDGPU Preload Kernel Arguments Prolog
155155
; GCN-O0-NEXT: Lazy Machine Block Frequency Analysis
156156
; GCN-O0-NEXT: Machine Optimization Remark Emitter
157157
; GCN-O0-NEXT: Stack Frame Layout Analysis
@@ -434,11 +434,11 @@
434434
; GCN-O1-NEXT: AMDGPU Insert waits for SGPR read hazards
435435
; GCN-O1-NEXT: AMDGPU Insert Delay ALU
436436
; GCN-O1-NEXT: Branch relaxation pass
437-
; GCN-O1-NEXT: AMDGPU Preload Kernel Arguments Prolog
438437
; GCN-O1-NEXT: Register Usage Information Collector Pass
439438
; GCN-O1-NEXT: Remove Loads Into Fake Uses
440439
; GCN-O1-NEXT: Live DEBUG_VALUE analysis
441440
; GCN-O1-NEXT: Machine Sanitizer Binary Metadata
441+
; GCN-O1-NEXT: AMDGPU Preload Kernel Arguments Prolog
442442
; GCN-O1-NEXT: Lazy Machine Block Frequency Analysis
443443
; GCN-O1-NEXT: Machine Optimization Remark Emitter
444444
; GCN-O1-NEXT: Stack Frame Layout Analysis
@@ -749,11 +749,11 @@
749749
; GCN-O1-OPTS-NEXT: AMDGPU Insert waits for SGPR read hazards
750750
; GCN-O1-OPTS-NEXT: AMDGPU Insert Delay ALU
751751
; GCN-O1-OPTS-NEXT: Branch relaxation pass
752-
; GCN-O1-OPTS-NEXT: AMDGPU Preload Kernel Arguments Prolog
753752
; GCN-O1-OPTS-NEXT: Register Usage Information Collector Pass
754753
; GCN-O1-OPTS-NEXT: Remove Loads Into Fake Uses
755754
; GCN-O1-OPTS-NEXT: Live DEBUG_VALUE analysis
756755
; GCN-O1-OPTS-NEXT: Machine Sanitizer Binary Metadata
756+
; GCN-O1-OPTS-NEXT: AMDGPU Preload Kernel Arguments Prolog
757757
; GCN-O1-OPTS-NEXT: Lazy Machine Block Frequency Analysis
758758
; GCN-O1-OPTS-NEXT: Machine Optimization Remark Emitter
759759
; GCN-O1-OPTS-NEXT: Stack Frame Layout Analysis
@@ -1070,11 +1070,11 @@
10701070
; GCN-O2-NEXT: AMDGPU Insert waits for SGPR read hazards
10711071
; GCN-O2-NEXT: AMDGPU Insert Delay ALU
10721072
; GCN-O2-NEXT: Branch relaxation pass
1073-
; GCN-O2-NEXT: AMDGPU Preload Kernel Arguments Prolog
10741073
; GCN-O2-NEXT: Register Usage Information Collector Pass
10751074
; GCN-O2-NEXT: Remove Loads Into Fake Uses
10761075
; GCN-O2-NEXT: Live DEBUG_VALUE analysis
10771076
; GCN-O2-NEXT: Machine Sanitizer Binary Metadata
1077+
; GCN-O2-NEXT: AMDGPU Preload Kernel Arguments Prolog
10781078
; GCN-O2-NEXT: Lazy Machine Block Frequency Analysis
10791079
; GCN-O2-NEXT: Machine Optimization Remark Emitter
10801080
; GCN-O2-NEXT: Stack Frame Layout Analysis
@@ -1404,11 +1404,11 @@
14041404
; GCN-O3-NEXT: AMDGPU Insert waits for SGPR read hazards
14051405
; GCN-O3-NEXT: AMDGPU Insert Delay ALU
14061406
; GCN-O3-NEXT: Branch relaxation pass
1407-
; GCN-O3-NEXT: AMDGPU Preload Kernel Arguments Prolog
14081407
; GCN-O3-NEXT: Register Usage Information Collector Pass
14091408
; GCN-O3-NEXT: Remove Loads Into Fake Uses
14101409
; GCN-O3-NEXT: Live DEBUG_VALUE analysis
14111410
; GCN-O3-NEXT: Machine Sanitizer Binary Metadata
1411+
; GCN-O3-NEXT: AMDGPU Preload Kernel Arguments Prolog
14121412
; GCN-O3-NEXT: Lazy Machine Block Frequency Analysis
14131413
; GCN-O3-NEXT: Machine Optimization Remark Emitter
14141414
; GCN-O3-NEXT: Stack Frame Layout Analysis
Lines changed: 295 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,295 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
2+
; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx940 < %s | FileCheck -check-prefixes=GFX940 %s
3+
4+
define amdgpu_kernel void @preload_block_count_x(ptr addrspace(1) inreg noundef %dst.coerce, ptr addrspace(1) inreg noundef %src.coerce, i64 inreg noundef %nElts, i64 inreg noundef %redOpArg, i1 inreg noundef %redOpArgIsPtr) #0 !dbg !4 {
5+
; GFX940-LABEL: preload_block_count_x:
6+
; GFX940: .Lfunc_begin0:
7+
; GFX940-NEXT: .file 0 "/" "<stdin>"
8+
; GFX940-NEXT: .cfi_sections .debug_frame
9+
; GFX940-NEXT: .cfi_startproc
10+
; GFX940-NEXT: ; %bb.5:
11+
; GFX940-NEXT: .loc 0 1 0 prologue_end ; <stdin>:1:0
12+
; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
13+
; GFX940-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x8
14+
; GFX940-NEXT: s_load_dword s12, s[0:1], 0x28
15+
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
16+
; GFX940-NEXT: s_branch .LBB0_0
17+
; GFX940-NEXT: .loc 0 0 0 is_stmt 0 ; :0:0
18+
; GFX940-NEXT: .Ltmp0:
19+
; GFX940-NEXT: .p2align 8
20+
; GFX940-NEXT: ; %bb.6:
21+
; GFX940-NEXT: .LBB0_0: ; %entry
22+
; GFX940-NEXT: .cfi_escape 0x0f, 0x04, 0x30, 0x36, 0xe9, 0x02 ;
23+
; GFX940-NEXT: .cfi_undefined 16
24+
; GFX940-NEXT: s_mov_b32 s0, s13
25+
; GFX940-NEXT: .Ltmp1:
26+
; GFX940-NEXT: ;DEBUG_VALUE: test:var <- [DW_OP_LLVM_poisoned] $sgpr2_sgpr3
27+
; GFX940-NEXT: .loc 0 1 0 is_stmt 1 ; <stdin>:1:0
28+
; GFX940-NEXT: s_ashr_i32 s13, s12, 31
29+
; GFX940-NEXT: s_or_b64 s[8:9], s[6:7], s[12:13]
30+
; GFX940-NEXT: s_mov_b32 s8, 0
31+
; GFX940-NEXT: s_cmp_lg_u64 s[8:9], 0
32+
; GFX940-NEXT: s_cbranch_scc0 .LBB0_4
33+
; GFX940-NEXT: .Ltmp2:
34+
; GFX940-NEXT: ; %bb.1:
35+
; GFX940-NEXT: ;DEBUG_VALUE: test:var <- [DW_OP_LLVM_poisoned] $sgpr2_sgpr3
36+
; GFX940-NEXT: v_cvt_f32_u32_e32 v0, s12
37+
; GFX940-NEXT: v_cvt_f32_u32_e32 v1, s13
38+
; GFX940-NEXT: s_sub_u32 s1, 0, s12
39+
; GFX940-NEXT: s_subb_u32 s3, 0, s13
40+
; GFX940-NEXT: .Ltmp3:
41+
; GFX940-NEXT: v_fmamk_f32 v0, v1, 0x4f800000, v0
42+
; GFX940-NEXT: v_rcp_f32_e32 v0, v0
43+
; GFX940-NEXT: s_nop 0
44+
; GFX940-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
45+
; GFX940-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0
46+
; GFX940-NEXT: v_trunc_f32_e32 v1, v1
47+
; GFX940-NEXT: v_fmamk_f32 v0, v1, 0xcf800000, v0
48+
; GFX940-NEXT: v_cvt_u32_f32_e32 v1, v1
49+
; GFX940-NEXT: v_cvt_u32_f32_e32 v0, v0
50+
; GFX940-NEXT: v_readfirstlane_b32 s5, v1
51+
; GFX940-NEXT: v_readfirstlane_b32 s8, v0
52+
; GFX940-NEXT: s_mul_i32 s9, s1, s5
53+
; GFX940-NEXT: s_mul_hi_u32 s15, s1, s8
54+
; GFX940-NEXT: s_mul_i32 s14, s3, s8
55+
; GFX940-NEXT: s_add_i32 s9, s15, s9
56+
; GFX940-NEXT: s_add_i32 s9, s9, s14
57+
; GFX940-NEXT: s_mul_i32 s16, s1, s8
58+
; GFX940-NEXT: s_mul_hi_u32 s14, s8, s9
59+
; GFX940-NEXT: s_mul_i32 s15, s8, s9
60+
; GFX940-NEXT: s_mul_hi_u32 s8, s8, s16
61+
; GFX940-NEXT: s_add_u32 s8, s8, s15
62+
; GFX940-NEXT: s_addc_u32 s14, 0, s14
63+
; GFX940-NEXT: s_mul_hi_u32 s17, s5, s16
64+
; GFX940-NEXT: s_mul_i32 s16, s5, s16
65+
; GFX940-NEXT: s_add_u32 s8, s8, s16
66+
; GFX940-NEXT: s_mul_hi_u32 s15, s5, s9
67+
; GFX940-NEXT: s_addc_u32 s8, s14, s17
68+
; GFX940-NEXT: s_addc_u32 s14, s15, 0
69+
; GFX940-NEXT: s_mul_i32 s9, s5, s9
70+
; GFX940-NEXT: s_add_u32 s8, s8, s9
71+
; GFX940-NEXT: s_addc_u32 s9, 0, s14
72+
; GFX940-NEXT: v_add_co_u32_e32 v0, vcc, s8, v0
73+
; GFX940-NEXT: s_cmp_lg_u64 vcc, 0
74+
; GFX940-NEXT: s_addc_u32 s5, s5, s9
75+
; GFX940-NEXT: v_readfirstlane_b32 s9, v0
76+
; GFX940-NEXT: s_mul_i32 s8, s1, s5
77+
; GFX940-NEXT: s_mul_hi_u32 s14, s1, s9
78+
; GFX940-NEXT: s_add_i32 s8, s14, s8
79+
; GFX940-NEXT: s_mul_i32 s3, s3, s9
80+
; GFX940-NEXT: s_add_i32 s8, s8, s3
81+
; GFX940-NEXT: s_mul_i32 s1, s1, s9
82+
; GFX940-NEXT: s_mul_hi_u32 s14, s5, s1
83+
; GFX940-NEXT: s_mul_i32 s15, s5, s1
84+
; GFX940-NEXT: s_mul_i32 s17, s9, s8
85+
; GFX940-NEXT: s_mul_hi_u32 s1, s9, s1
86+
; GFX940-NEXT: s_mul_hi_u32 s16, s9, s8
87+
; GFX940-NEXT: s_add_u32 s1, s1, s17
88+
; GFX940-NEXT: s_addc_u32 s9, 0, s16
89+
; GFX940-NEXT: s_add_u32 s1, s1, s15
90+
; GFX940-NEXT: s_mul_hi_u32 s3, s5, s8
91+
; GFX940-NEXT: s_addc_u32 s1, s9, s14
92+
; GFX940-NEXT: s_addc_u32 s3, s3, 0
93+
; GFX940-NEXT: s_mul_i32 s8, s5, s8
94+
; GFX940-NEXT: s_add_u32 s1, s1, s8
95+
; GFX940-NEXT: s_addc_u32 s3, 0, s3
96+
; GFX940-NEXT: v_add_co_u32_e32 v0, vcc, s1, v0
97+
; GFX940-NEXT: s_cmp_lg_u64 vcc, 0
98+
; GFX940-NEXT: s_addc_u32 s1, s5, s3
99+
; GFX940-NEXT: v_readfirstlane_b32 s8, v0
100+
; GFX940-NEXT: s_mul_i32 s5, s6, s1
101+
; GFX940-NEXT: s_mul_hi_u32 s9, s6, s8
102+
; GFX940-NEXT: s_mul_hi_u32 s3, s6, s1
103+
; GFX940-NEXT: s_add_u32 s5, s9, s5
104+
; GFX940-NEXT: s_addc_u32 s3, 0, s3
105+
; GFX940-NEXT: s_mul_hi_u32 s14, s7, s8
106+
; GFX940-NEXT: s_mul_i32 s8, s7, s8
107+
; GFX940-NEXT: s_add_u32 s5, s5, s8
108+
; GFX940-NEXT: s_mul_hi_u32 s9, s7, s1
109+
; GFX940-NEXT: s_addc_u32 s3, s3, s14
110+
; GFX940-NEXT: s_addc_u32 s5, s9, 0
111+
; GFX940-NEXT: s_mul_i32 s1, s7, s1
112+
; GFX940-NEXT: s_add_u32 s1, s3, s1
113+
; GFX940-NEXT: s_addc_u32 s3, 0, s5
114+
; GFX940-NEXT: s_mul_i32 s5, s12, s3
115+
; GFX940-NEXT: s_mul_hi_u32 s8, s12, s1
116+
; GFX940-NEXT: s_add_i32 s5, s8, s5
117+
; GFX940-NEXT: s_mul_i32 s8, s13, s1
118+
; GFX940-NEXT: s_mul_i32 s9, s12, s1
119+
; GFX940-NEXT: s_add_i32 s5, s5, s8
120+
; GFX940-NEXT: v_mov_b32_e32 v0, s9
121+
; GFX940-NEXT: s_sub_i32 s8, s7, s5
122+
; GFX940-NEXT: v_sub_co_u32_e32 v0, vcc, s6, v0
123+
; GFX940-NEXT: s_cmp_lg_u64 vcc, 0
124+
; GFX940-NEXT: s_subb_u32 s14, s8, s13
125+
; GFX940-NEXT: v_subrev_co_u32_e64 v1, s[8:9], s12, v0
126+
; GFX940-NEXT: s_cmp_lg_u64 s[8:9], 0
127+
; GFX940-NEXT: s_subb_u32 s8, s14, 0
128+
; GFX940-NEXT: s_cmp_ge_u32 s8, s13
129+
; GFX940-NEXT: v_readfirstlane_b32 s14, v1
130+
; GFX940-NEXT: s_cselect_b32 s9, -1, 0
131+
; GFX940-NEXT: s_cmp_ge_u32 s14, s12
132+
; GFX940-NEXT: s_cselect_b32 s14, -1, 0
133+
; GFX940-NEXT: s_cmp_eq_u32 s8, s13
134+
; GFX940-NEXT: s_cselect_b32 s8, s14, s9
135+
; GFX940-NEXT: s_add_u32 s9, s1, 1
136+
; GFX940-NEXT: s_addc_u32 s14, s3, 0
137+
; GFX940-NEXT: s_add_u32 s15, s1, 2
138+
; GFX940-NEXT: s_addc_u32 s16, s3, 0
139+
; GFX940-NEXT: s_cmp_lg_u32 s8, 0
140+
; GFX940-NEXT: s_cselect_b32 s8, s15, s9
141+
; GFX940-NEXT: s_cselect_b32 s9, s16, s14
142+
; GFX940-NEXT: s_cmp_lg_u64 vcc, 0
143+
; GFX940-NEXT: s_subb_u32 s5, s7, s5
144+
; GFX940-NEXT: s_cmp_ge_u32 s5, s13
145+
; GFX940-NEXT: v_readfirstlane_b32 s15, v0
146+
; GFX940-NEXT: s_cselect_b32 s14, -1, 0
147+
; GFX940-NEXT: s_cmp_ge_u32 s15, s12
148+
; GFX940-NEXT: s_cselect_b32 s15, -1, 0
149+
; GFX940-NEXT: s_cmp_eq_u32 s5, s13
150+
; GFX940-NEXT: s_cselect_b32 s5, s15, s14
151+
; GFX940-NEXT: s_cmp_lg_u32 s5, 0
152+
; GFX940-NEXT: s_cselect_b32 s9, s9, s3
153+
; GFX940-NEXT: s_cselect_b32 s8, s8, s1
154+
; GFX940-NEXT: s_cbranch_execnz .LBB0_3
155+
; GFX940-NEXT: .LBB0_2:
156+
; GFX940-NEXT: v_cvt_f32_u32_e32 v0, s12
157+
; GFX940-NEXT: s_sub_i32 s1, 0, s12
158+
; GFX940-NEXT: s_mov_b32 s9, 0
159+
; GFX940-NEXT: v_rcp_iflag_f32_e32 v0, v0
160+
; GFX940-NEXT: s_nop 0
161+
; GFX940-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
162+
; GFX940-NEXT: v_cvt_u32_f32_e32 v0, v0
163+
; GFX940-NEXT: s_nop 0
164+
; GFX940-NEXT: v_readfirstlane_b32 s3, v0
165+
; GFX940-NEXT: s_mul_i32 s1, s1, s3
166+
; GFX940-NEXT: s_mul_hi_u32 s1, s3, s1
167+
; GFX940-NEXT: s_add_i32 s3, s3, s1
168+
; GFX940-NEXT: s_mul_hi_u32 s1, s6, s3
169+
; GFX940-NEXT: s_mul_i32 s5, s1, s12
170+
; GFX940-NEXT: s_sub_i32 s5, s6, s5
171+
; GFX940-NEXT: s_add_i32 s3, s1, 1
172+
; GFX940-NEXT: s_sub_i32 s8, s5, s12
173+
; GFX940-NEXT: s_cmp_ge_u32 s5, s12
174+
; GFX940-NEXT: s_cselect_b32 s1, s3, s1
175+
; GFX940-NEXT: s_cselect_b32 s5, s8, s5
176+
; GFX940-NEXT: s_add_i32 s3, s1, 1
177+
; GFX940-NEXT: s_cmp_ge_u32 s5, s12
178+
; GFX940-NEXT: s_cselect_b32 s8, s3, s1
179+
; GFX940-NEXT: .LBB0_3:
180+
; GFX940-NEXT: s_ashr_i32 s1, s0, 31
181+
; GFX940-NEXT: s_add_u32 s3, s8, 15
182+
; GFX940-NEXT: s_addc_u32 s5, s9, 0
183+
; GFX940-NEXT: s_and_b32 s3, s3, -16
184+
; GFX940-NEXT: s_mul_i32 s1, s3, s1
185+
; GFX940-NEXT: s_mul_hi_u32 s8, s3, s0
186+
; GFX940-NEXT: s_add_i32 s1, s8, s1
187+
; GFX940-NEXT: s_mul_i32 s5, s5, s0
188+
; GFX940-NEXT: s_add_i32 s1, s1, s5
189+
; GFX940-NEXT: s_mul_i32 s3, s3, s0
190+
; GFX940-NEXT: v_cvt_f64_i32_e32 v[0:1], s1
191+
; GFX940-NEXT: v_ldexp_f64 v[0:1], v[0:1], 32
192+
; GFX940-NEXT: v_cvt_f64_u32_e32 v[2:3], s3
193+
; GFX940-NEXT: v_add_f64 v[0:1], v[0:1], v[2:3]
194+
; GFX940-NEXT: v_cvt_f64_u32_e32 v[2:3], s7
195+
; GFX940-NEXT: v_ldexp_f64 v[2:3], v[2:3], 32
196+
; GFX940-NEXT: v_cvt_f64_u32_e32 v[4:5], s6
197+
; GFX940-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5]
198+
; GFX940-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3]
199+
; GFX940-NEXT: v_trunc_f64_e32 v[0:1], v[0:1]
200+
; GFX940-NEXT: s_movk_i32 s0, 0xffe0
201+
; GFX940-NEXT: v_ldexp_f64 v[2:3], v[0:1], s0
202+
; GFX940-NEXT: v_floor_f64_e32 v[2:3], v[2:3]
203+
; GFX940-NEXT: v_fmac_f64_e32 v[0:1], 0xc1f00000, v[2:3]
204+
; GFX940-NEXT: v_cvt_u32_f64_e32 v0, v[0:1]
205+
; GFX940-NEXT: v_add_u32_e32 v1, s2, v0
206+
; GFX940-NEXT: v_add_u32_e32 v0, s4, v0
207+
; GFX940-NEXT: v_or_b32_e32 v0, v0, v1
208+
; GFX940-NEXT: v_and_b32_e32 v0, 15, v0
209+
; GFX940-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
210+
; GFX940-NEXT: s_nop 1
211+
; GFX940-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
212+
; GFX940-NEXT: ;;#ASMSTART
213+
; GFX940-NEXT: ;;#ASMEND
214+
; GFX940-NEXT: s_endpgm
215+
; GFX940-NEXT: .LBB0_4:
216+
; GFX940-NEXT: .Ltmp4:
217+
; GFX940-NEXT: ;DEBUG_VALUE: test:var <- [DW_OP_LLVM_poisoned] $sgpr2_sgpr3
218+
; GFX940-NEXT: ; implicit-def: $sgpr8_sgpr9
219+
; GFX940-NEXT: s_branch .LBB0_2
220+
; GFX940-NEXT: .Ltmp5:
221+
entry:
222+
%0 = ptrtoint ptr addrspace(1) %dst.coerce to i64
223+
%1 = inttoptr i64 %0 to ptr
224+
%2 = ptrtoint ptr addrspace(1) %src.coerce to i64
225+
#dbg_value(ptr %1, !8, !DIExpression(DIOpArg(0, ptr)), !10)
226+
%3 = tail call noundef range(i32 0, 1024) i32 @llvm.amdgcn.workitem.id.x(), !dbg !10
227+
%4 = tail call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr(), !dbg !10
228+
%5 = tail call i32 @llvm.amdgcn.workgroup.id.x(), !dbg !10
229+
%6 = load i32, ptr addrspace(4) %4, align 4, !dbg !10
230+
%7 = getelementptr inbounds nuw i8, ptr addrspace(4) %4, i64 12, !dbg !10
231+
%8 = load i16, ptr addrspace(4) %7, align 4, !dbg !10
232+
%conv.i.i = zext i16 %8 to i32, !dbg !10
233+
%conv = sext i32 %5 to i64, !dbg !10
234+
%conv6 = sext i32 %6 to i64, !dbg !10
235+
%div = udiv i64 %nElts, %conv6, !dbg !10
236+
%sub.i = add i64 %div, 15, !dbg !10
237+
%and.i = and i64 %sub.i, -16, !dbg !10
238+
%mul = mul i64 %and.i, %conv, !dbg !10
239+
%add8 = add nsw i32 %5, 1, !dbg !10
240+
%conv9 = sext i32 %add8 to i64, !dbg !10
241+
%mul13 = mul i64 %and.i, %conv9, !dbg !10
242+
%conv.i = sitofp i64 %mul to double, !dbg !10
243+
%conv1.i = uitofp i64 %nElts to double, !dbg !10
244+
%9 = tail call contract noundef double @llvm.minnum.f64(double %conv.i, double %conv1.i), !dbg !10
245+
%conv15 = fptosi double %9 to i64, !dbg !10
246+
%conv.i43 = sitofp i64 %mul13 to double, !dbg !10
247+
%10 = tail call contract noundef double @llvm.minnum.f64(double %conv.i43, double %conv1.i), !dbg !10
248+
%add.ptr18 = getelementptr inbounds i8, ptr %1, i64 %conv15, !dbg !10
249+
%rem = and i64 %redOpArg, 1, !dbg !10
250+
%cmp.not = icmp eq i64 %rem, 0, !dbg !10
251+
%rem21 = and i64 %redOpArg, 2, !dbg !10
252+
%cmp22.not = icmp eq i64 %rem21, 0, !dbg !10
253+
%rem26 = and i64 %redOpArg, 4, !dbg !10
254+
%cmp27.not = icmp eq i64 %rem26, 0, !dbg !10
255+
%11 = inttoptr i64 %redOpArg to ptr, !dbg !10
256+
%12 = load i64, ptr %11, align 8, !dbg !10
257+
%conv17 = fptosi double %10 to i64, !dbg !10
258+
%sub = sub nsw i64 %conv17, %conv15, !dbg !10
259+
%rem.i.i5354 = and i32 %3, 63, !dbg !10
260+
%cmp.i.i.not = icmp eq i32 %rem.i.i5354, 0, !dbg !10
261+
%13 = add i64 %2, %conv15, !dbg !10
262+
%14 = ptrtoint ptr %add.ptr18 to i64, !dbg !10
263+
%15 = or i64 %13, %14, !dbg !10
264+
%16 = and i64 %15, 15, !dbg !10
265+
%and1583.i.i = icmp ne i64 %16, 0, !dbg !10
266+
%17 = zext i1 %and1583.i.i to i32, !dbg !10
267+
%18 = tail call i32 asm sideeffect "", "=v,0"(i32 %17) #9, !dbg !10
268+
%19 = icmp ne i32 %18, 0, !dbg !10
269+
%20 = tail call i64 @llvm.amdgcn.ballot.i64(i1 %19), !dbg !10
270+
%.not.i.i = icmp eq i64 %20, 0, !dbg !10
271+
%div1.i.i.i555659 = lshr i32 %3, 6, !dbg !10
272+
%div8.i.i.i = sdiv i64 %sub, 4096, !dbg !10
273+
%mul9.i.i.i = shl nsw i64 %div8.i.i.i, 12, !dbg !10
274+
%sub12.i.i.i = sub nsw i64 %sub, %mul9.i.i.i, !dbg !10
275+
%conv13.i.i.i = zext nneg i32 %div1.i.i.i555659 to i64, !dbg !10
276+
%sub14.i.i.i = sub nsw i64 %div8.i.i.i, %conv13.i.i.i, !dbg !10
277+
%cmp30399.i.i.i = icmp sgt i64 %sub14.i.i.i, 0, !dbg !10
278+
ret void
279+
}
280+
281+
attributes #0 = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
282+
283+
!llvm.dbg.cu = !{!0}
284+
!llvm.module.flags = !{!2, !3}
285+
!0 = distinct !DICompileUnit(language: DW_LANG_OpenCL, file: !1, producer: "clang", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug)
286+
!1 = !DIFile(filename: "<stdin>", directory: "/")
287+
!2 = !{i32 7, !"Dwarf Version", i32 5}
288+
!3 = !{i32 2, !"Debug Info Version", i32 3}
289+
!4 = distinct !DISubprogram(name: "test", scope: !1, file: !1, line: 1, type: !5, scopeLine: 1, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: !0)
290+
!5 = !DISubroutineType(cc: DW_CC_LLVM_OpenCLKernel, types: !6)
291+
!6 = !{null}
292+
!7 = !{i32 1024, i32 1, i32 1}
293+
!8 = !DILocalVariable(name: "var", arg: 1, scope: !4, file: !1, line: 1, type: !9)
294+
!9 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: null, size: 64)
295+
!10 = !DILocation(line: 1, scope: !4)

0 commit comments

Comments
 (0)