Skip to content

Commit 729bf9b

Browse files
committed
AMDGPU: Enable fixed function ABI by default
Code using indirect calls is broken without this, and there isn't really much value in supporting the old approach of varying the argument placement based on uses. That approach resulted in more argument shuffling code anyway. Also make the option stop implying that all inputs need to be passed. This will now rely on the amdgpu-no-* attributes to avoid passing unnecessary values.
1 parent 89f0f27 commit 729bf9b

File tree

61 files changed

+1914
-1489
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

61 files changed

+1914
-1489
lines changed

llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -829,9 +829,12 @@ bool AMDGPUCallLowering::passSpecialInputs(MachineIRBuilder &MIRBuilder,
829829

830830
if (IncomingArg) {
831831
LI->loadInputValue(InputReg, MIRBuilder, IncomingArg, ArgRC, ArgTy);
832-
} else {
833-
assert(InputID == AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
832+
} else if (InputID == AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR) {
834833
LI->getImplicitArgPtr(InputReg, MRI, MIRBuilder);
834+
} else {
835+
// We may have proven the input wasn't needed, although the ABI is
836+
// requiring it. We just need to allocate the register appropriately.
837+
MIRBuilder.buildUndef(InputReg);
835838
}
836839

837840
if (OutgoingArg->isRegister()) {

llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -235,7 +235,7 @@ static cl::opt<bool, true> EnableAMDGPUFixedFunctionABIOpt(
235235
"amdgpu-fixed-function-abi",
236236
cl::desc("Enable all implicit function arguments"),
237237
cl::location(AMDGPUTargetMachine::EnableFixedFunctionABI),
238-
cl::init(false),
238+
cl::init(true),
239239
cl::Hidden);
240240

241241
// Enable lib calls simplifications

llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp

Lines changed: 1 addition & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -110,20 +110,7 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
110110
else if (ST.isMesaGfxShader(F))
111111
ImplicitBufferPtr = true;
112112

113-
if (UseFixedABI) {
114-
DispatchPtr = true;
115-
QueuePtr = true;
116-
ImplicitArgPtr = true;
117-
WorkGroupIDX = true;
118-
WorkGroupIDY = true;
119-
WorkGroupIDZ = true;
120-
WorkItemIDX = true;
121-
WorkItemIDY = true;
122-
WorkItemIDZ = true;
123-
124-
// FIXME: We don't need this?
125-
DispatchID = true;
126-
} else if (!AMDGPU::isGraphics(CC)) {
113+
if (!AMDGPU::isGraphics(CC)) {
127114
if (IsKernel || !F.hasFnAttribute("amdgpu-no-workgroup-id-x"))
128115
WorkGroupIDX = true;
129116

llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -481,6 +481,7 @@ define <16 x i32> @v_ashr_v16i32(<16 x i32> %value, <16 x i32> %amount) {
481481
; GCN: ; %bb.0:
482482
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
483483
; GCN-NEXT: v_ashrrev_i32_e32 v0, v16, v0
484+
; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32
484485
; GCN-NEXT: v_ashrrev_i32_e32 v1, v17, v1
485486
; GCN-NEXT: v_ashrrev_i32_e32 v2, v18, v2
486487
; GCN-NEXT: v_ashrrev_i32_e32 v3, v19, v3
@@ -495,13 +496,15 @@ define <16 x i32> @v_ashr_v16i32(<16 x i32> %value, <16 x i32> %amount) {
495496
; GCN-NEXT: v_ashrrev_i32_e32 v12, v28, v12
496497
; GCN-NEXT: v_ashrrev_i32_e32 v13, v29, v13
497498
; GCN-NEXT: v_ashrrev_i32_e32 v14, v30, v14
498-
; GCN-NEXT: v_ashrrev_i32_e32 v15, v31, v15
499+
; GCN-NEXT: s_waitcnt vmcnt(0)
500+
; GCN-NEXT: v_ashrrev_i32_e32 v15, v16, v15
499501
; GCN-NEXT: s_setpc_b64 s[30:31]
500502
;
501503
; GFX10-LABEL: v_ashr_v16i32:
502504
; GFX10: ; %bb.0:
503505
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
504506
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
507+
; GFX10-NEXT: buffer_load_dword v31, off, s[0:3], s32
505508
; GFX10-NEXT: v_ashrrev_i32_e32 v0, v16, v0
506509
; GFX10-NEXT: v_ashrrev_i32_e32 v1, v17, v1
507510
; GFX10-NEXT: v_ashrrev_i32_e32 v2, v18, v2
@@ -517,6 +520,7 @@ define <16 x i32> @v_ashr_v16i32(<16 x i32> %value, <16 x i32> %amount) {
517520
; GFX10-NEXT: v_ashrrev_i32_e32 v12, v28, v12
518521
; GFX10-NEXT: v_ashrrev_i32_e32 v13, v29, v13
519522
; GFX10-NEXT: v_ashrrev_i32_e32 v14, v30, v14
523+
; GFX10-NEXT: s_waitcnt vmcnt(0)
520524
; GFX10-NEXT: v_ashrrev_i32_e32 v15, v31, v15
521525
; GFX10-NEXT: s_setpc_b64 s[30:31]
522526
%result = ashr <16 x i32> %value, %amount

llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-fma-mul.ll

Lines changed: 108 additions & 100 deletions
Large diffs are not rendered by default.

llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-atomicrmw.ll

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -33,8 +33,8 @@ define float @test_atomicrmw_fsub(float addrspace(3)* %addr) {
3333
; CHECK-NEXT: bb.2.atomicrmw.start:
3434
; CHECK-NEXT: successors: %bb.3(0x40000000), %bb.2(0x40000000)
3535
; CHECK-NEXT: {{ $}}
36-
; CHECK-NEXT: [[PHI:%[0-9]+]]:_(s64) = G_PHI %9(s64), %bb.2, [[C1]](s64), %bb.1
37-
; CHECK-NEXT: [[PHI1:%[0-9]+]]:_(s32) = G_PHI [[LOAD]](s32), %bb.1, %7(s32), %bb.2
36+
; CHECK-NEXT: [[PHI:%[0-9]+]]:_(s64) = G_PHI %16(s64), %bb.2, [[C1]](s64), %bb.1
37+
; CHECK-NEXT: [[PHI1:%[0-9]+]]:_(s32) = G_PHI [[LOAD]](s32), %bb.1, %14(s32), %bb.2
3838
; CHECK-NEXT: [[FSUB:%[0-9]+]]:_(s32) = G_FSUB [[PHI1]], [[C]]
3939
; CHECK-NEXT: [[ATOMIC_CMPXCHG_WITH_SUCCESS:%[0-9]+]]:_(s32), [[ATOMIC_CMPXCHG_WITH_SUCCESS1:%[0-9]+]]:_(s1) = G_ATOMIC_CMPXCHG_WITH_SUCCESS [[COPY]](p3), [[PHI1]], [[FSUB]] :: (load store seq_cst seq_cst (s32) on %ir.2, addrspace 3)
4040
; CHECK-NEXT: [[INT:%[0-9]+]]:_(s64) = G_INTRINSIC intrinsic(@llvm.amdgcn.if.break), [[ATOMIC_CMPXCHG_WITH_SUCCESS1]](s1), [[PHI]](s64)

llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-constrained-fp.ll

Lines changed: 16 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -41,8 +41,8 @@ define float @v_constained_fadd_f32_fpexcept_ignore(float %x, float %y) #0 {
4141
; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
4242
; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
4343
; CHECK-NEXT: [[COPY2:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
44-
; CHECK-NEXT: %3:_(s32) = nofpexcept G_STRICT_FADD [[COPY]], [[COPY1]]
45-
; CHECK-NEXT: $vgpr0 = COPY %3(s32)
44+
; CHECK-NEXT: %10:_(s32) = nofpexcept G_STRICT_FADD [[COPY]], [[COPY1]]
45+
; CHECK-NEXT: $vgpr0 = COPY %10(s32)
4646
; CHECK-NEXT: [[COPY3:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY2]]
4747
; CHECK-NEXT: S_SETPC_B64_return [[COPY3]], implicit $vgpr0
4848
%val = call float @llvm.experimental.constrained.fadd.f32(float %x, float %y, metadata !"round.tonearest", metadata !"fpexcept.ignore")
@@ -57,8 +57,8 @@ define float @v_constained_fadd_f32_fpexcept_ignore_flags(float %x, float %y) #0
5757
; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
5858
; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
5959
; CHECK-NEXT: [[COPY2:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
60-
; CHECK-NEXT: %3:_(s32) = nsz nofpexcept G_STRICT_FADD [[COPY]], [[COPY1]]
61-
; CHECK-NEXT: $vgpr0 = COPY %3(s32)
60+
; CHECK-NEXT: %10:_(s32) = nsz nofpexcept G_STRICT_FADD [[COPY]], [[COPY1]]
61+
; CHECK-NEXT: $vgpr0 = COPY %10(s32)
6262
; CHECK-NEXT: [[COPY3:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY2]]
6363
; CHECK-NEXT: S_SETPC_B64_return [[COPY3]], implicit $vgpr0
6464
%val = call nsz float @llvm.experimental.constrained.fadd.f32(float %x, float %y, metadata !"round.tonearest", metadata !"fpexcept.ignore")
@@ -115,8 +115,8 @@ define <2 x float> @v_constained_fadd_v2f32_fpexcept_ignore(<2 x float> %x, <2 x
115115
; CHECK-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
116116
; CHECK-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY2]](s32), [[COPY3]](s32)
117117
; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
118-
; CHECK-NEXT: %7:_(<2 x s32>) = nofpexcept G_STRICT_FADD [[BUILD_VECTOR]], [[BUILD_VECTOR1]]
119-
; CHECK-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES %7(<2 x s32>)
118+
; CHECK-NEXT: %14:_(<2 x s32>) = nofpexcept G_STRICT_FADD [[BUILD_VECTOR]], [[BUILD_VECTOR1]]
119+
; CHECK-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES %14(<2 x s32>)
120120
; CHECK-NEXT: $vgpr0 = COPY [[UV]](s32)
121121
; CHECK-NEXT: $vgpr1 = COPY [[UV1]](s32)
122122
; CHECK-NEXT: [[COPY5:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY4]]
@@ -155,8 +155,8 @@ define float @v_constained_fsub_f32_fpexcept_ignore_flags(float %x, float %y) #0
155155
; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
156156
; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
157157
; CHECK-NEXT: [[COPY2:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
158-
; CHECK-NEXT: %3:_(s32) = nsz nofpexcept G_STRICT_FSUB [[COPY]], [[COPY1]]
159-
; CHECK-NEXT: $vgpr0 = COPY %3(s32)
158+
; CHECK-NEXT: %10:_(s32) = nsz nofpexcept G_STRICT_FSUB [[COPY]], [[COPY1]]
159+
; CHECK-NEXT: $vgpr0 = COPY %10(s32)
160160
; CHECK-NEXT: [[COPY3:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY2]]
161161
; CHECK-NEXT: S_SETPC_B64_return [[COPY3]], implicit $vgpr0
162162
%val = call nsz float @llvm.experimental.constrained.fsub.f32(float %x, float %y, metadata !"round.tonearest", metadata !"fpexcept.ignore")
@@ -171,8 +171,8 @@ define float @v_constained_fmul_f32_fpexcept_ignore_flags(float %x, float %y) #0
171171
; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
172172
; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
173173
; CHECK-NEXT: [[COPY2:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
174-
; CHECK-NEXT: %3:_(s32) = nsz nofpexcept G_STRICT_FMUL [[COPY]], [[COPY1]]
175-
; CHECK-NEXT: $vgpr0 = COPY %3(s32)
174+
; CHECK-NEXT: %10:_(s32) = nsz nofpexcept G_STRICT_FMUL [[COPY]], [[COPY1]]
175+
; CHECK-NEXT: $vgpr0 = COPY %10(s32)
176176
; CHECK-NEXT: [[COPY3:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY2]]
177177
; CHECK-NEXT: S_SETPC_B64_return [[COPY3]], implicit $vgpr0
178178
%val = call nsz float @llvm.experimental.constrained.fmul.f32(float %x, float %y, metadata !"round.tonearest", metadata !"fpexcept.ignore")
@@ -187,8 +187,8 @@ define float @v_constained_fdiv_f32_fpexcept_ignore_flags(float %x, float %y) #0
187187
; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
188188
; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
189189
; CHECK-NEXT: [[COPY2:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
190-
; CHECK-NEXT: %3:_(s32) = nsz nofpexcept G_STRICT_FDIV [[COPY]], [[COPY1]]
191-
; CHECK-NEXT: $vgpr0 = COPY %3(s32)
190+
; CHECK-NEXT: %10:_(s32) = nsz nofpexcept G_STRICT_FDIV [[COPY]], [[COPY1]]
191+
; CHECK-NEXT: $vgpr0 = COPY %10(s32)
192192
; CHECK-NEXT: [[COPY3:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY2]]
193193
; CHECK-NEXT: S_SETPC_B64_return [[COPY3]], implicit $vgpr0
194194
%val = call nsz float @llvm.experimental.constrained.fdiv.f32(float %x, float %y, metadata !"round.tonearest", metadata !"fpexcept.ignore")
@@ -203,8 +203,8 @@ define float @v_constained_frem_f32_fpexcept_ignore_flags(float %x, float %y) #0
203203
; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
204204
; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
205205
; CHECK-NEXT: [[COPY2:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
206-
; CHECK-NEXT: %3:_(s32) = nsz nofpexcept G_STRICT_FREM [[COPY]], [[COPY1]]
207-
; CHECK-NEXT: $vgpr0 = COPY %3(s32)
206+
; CHECK-NEXT: %10:_(s32) = nsz nofpexcept G_STRICT_FREM [[COPY]], [[COPY1]]
207+
; CHECK-NEXT: $vgpr0 = COPY %10(s32)
208208
; CHECK-NEXT: [[COPY3:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY2]]
209209
; CHECK-NEXT: S_SETPC_B64_return [[COPY3]], implicit $vgpr0
210210
%val = call nsz float @llvm.experimental.constrained.frem.f32(float %x, float %y, metadata !"round.tonearest", metadata !"fpexcept.ignore")
@@ -220,8 +220,8 @@ define float @v_constained_fma_f32_fpexcept_ignore_flags(float %x, float %y, flo
220220
; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
221221
; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
222222
; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
223-
; CHECK-NEXT: %4:_(s32) = nsz nofpexcept G_STRICT_FMA [[COPY]], [[COPY1]], [[COPY2]]
224-
; CHECK-NEXT: $vgpr0 = COPY %4(s32)
223+
; CHECK-NEXT: %11:_(s32) = nsz nofpexcept G_STRICT_FMA [[COPY]], [[COPY1]], [[COPY2]]
224+
; CHECK-NEXT: $vgpr0 = COPY %11(s32)
225225
; CHECK-NEXT: [[COPY4:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY3]]
226226
; CHECK-NEXT: S_SETPC_B64_return [[COPY4]], implicit $vgpr0
227227
%val = call nsz float @llvm.experimental.constrained.fma.f32(float %x, float %y, float %z, metadata !"round.tonearest", metadata !"fpexcept.ignore")

0 commit comments

Comments
 (0)