Skip to content

Commit 85311bd

Browse files
committed
Diagnostic error and move LDSKernelId.
Use diagnostic error. Unify preload kernarg SGPR accounting and move LDSKernelId to UserSGPRInfo so that synthetic SGPRs can be properly tracked when calculating the number of available registers for preloads.
1 parent 69313b6 commit 85311bd

10 files changed

+134
-50
lines changed

llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -520,6 +520,17 @@ bool AMDGPUCallLowering::lowerFormalArgumentsKernel(
520520

521521
// TODO: Align down to dword alignment and extract bits for extending loads.
522522
for (auto &Arg : F.args()) {
523+
// Hidden arguments that are in the kernel signature must be preloded to
524+
// user SGPRs, or loaded via the implicit_arg ptr. Print a diagnostic
525+
// error if a hidden argument is in the argument list and is not
526+
// preloaded.
527+
if (Arg.hasAttribute("amdgpu-hidden-argument")) {
528+
DiagnosticInfoUnsupported NonPreloadHiddenArg(
529+
*Arg.getParent(),
530+
"Hidden argument in kernel signature was not preloaded");
531+
F.getContext().diagnose(NonPreloadHiddenArg);
532+
}
533+
523534
const bool IsByRef = Arg.hasByRefAttr();
524535
Type *ArgTy = IsByRef ? Arg.getParamByRefType() : Arg.getType();
525536
unsigned AllocSize = DL.getTypeAllocSize(ArgTy);

llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -145,19 +145,19 @@ class PreloadKernelArgInfo {
145145
// arguments.
146146
void setInitialFreeUserSGPRsCount() {
147147
GCNUserSGPRUsageInfo UserSGPRInfo(F, ST);
148-
NumFreeUserSGPRs =
149-
UserSGPRInfo.getNumFreeUserSGPRs() - 1 /* Synthetic SGPRs*/;
148+
NumFreeUserSGPRs = UserSGPRInfo.getNumFreeKernargPreloadSGPRs();
150149
}
151150

152151
bool tryAllocPreloadSGPRs(unsigned AllocSize, uint64_t ArgOffset,
153152
uint64_t LastExplicitArgOffset) {
154153
// Check if this argument may be loaded into the same register as the
155154
// previous argument.
156-
if (ArgOffset == LastExplicitArgOffset && !isAligned(Align(4), ArgOffset) &&
157-
AllocSize < 4)
155+
if (ArgOffset - LastExplicitArgOffset < 4 &&
156+
!isAligned(Align(4), ArgOffset))
158157
return true;
159158

160159
// Pad SGPRs for kernarg alignment.
160+
ArgOffset = alignDown(ArgOffset, 4);
161161
unsigned Padding = ArgOffset - LastExplicitArgOffset;
162162
unsigned PaddingSGPRs = alignTo(Padding, 4) / 4;
163163
unsigned NumPreloadSGPRs = alignTo(AllocSize, 4) / 4;
@@ -229,7 +229,8 @@ class PreloadKernelArgInfo {
229229
LastExplicitArgOffset))
230230
return true;
231231

232-
LastExplicitArgOffset = LoadOffset + LoadSize;
232+
LastExplicitArgOffset =
233+
ImplicitArgsBaseOffset + LoadOffset + LoadSize;
233234
return false;
234235
});
235236

llvm/lib/Target/AMDGPU/GCNSubtarget.cpp

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -748,6 +748,10 @@ GCNUserSGPRUsageInfo::GCNUserSGPRUsageInfo(const Function &F,
748748
FlatScratchInit = true;
749749
}
750750

751+
if (!AMDGPU::isGraphics(CC) && !IsKernel &&
752+
!F.hasFnAttribute("amdgpu-no-lds-kernel-id"))
753+
LDSKernelId = true;
754+
751755
if (hasImplicitBufferPtr())
752756
NumUsedUserSGPRs += getNumUserSGPRForField(ImplicitBufferPtrID);
753757

@@ -771,6 +775,9 @@ GCNUserSGPRUsageInfo::GCNUserSGPRUsageInfo(const Function &F,
771775

772776
if (hasPrivateSegmentSize())
773777
NumUsedUserSGPRs += getNumUserSGPRForField(PrivateSegmentSizeID);
778+
779+
if (hasLDSKernelId())
780+
NumSyntheticSGPRs += getNumUserSGPRForField(LDSKernelIdID);
774781
}
775782

776783
void GCNUserSGPRUsageInfo::allocKernargPreloadSGPRs(unsigned NumSGPRs) {
@@ -779,6 +786,6 @@ void GCNUserSGPRUsageInfo::allocKernargPreloadSGPRs(unsigned NumSGPRs) {
779786
NumUsedUserSGPRs += NumSGPRs;
780787
}
781788

782-
unsigned GCNUserSGPRUsageInfo::getNumFreeUserSGPRs() {
783-
return AMDGPU::getMaxNumUserSGPRs(ST) - NumUsedUserSGPRs;
789+
unsigned GCNUserSGPRUsageInfo::getNumFreeKernargPreloadSGPRs() {
790+
return AMDGPU::getMaxNumUserSGPRs(ST) - (NumUsedUserSGPRs + NumSyntheticSGPRs);
784791
}

llvm/lib/Target/AMDGPU/GCNSubtarget.h

Lines changed: 12 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1678,11 +1678,13 @@ class GCNUserSGPRUsageInfo {
16781678

16791679
bool hasPrivateSegmentSize() const { return PrivateSegmentSize; }
16801680

1681+
bool hasLDSKernelId() const { return LDSKernelId; }
1682+
16811683
unsigned getNumKernargPreloadSGPRs() const { return NumKernargPreloadSGPRs; }
16821684

16831685
unsigned getNumUsedUserSGPRs() const { return NumUsedUserSGPRs; }
16841686

1685-
unsigned getNumFreeUserSGPRs();
1687+
unsigned getNumFreeKernargPreloadSGPRs();
16861688

16871689
void allocKernargPreloadSGPRs(unsigned NumSGPRs);
16881690

@@ -1694,11 +1696,12 @@ class GCNUserSGPRUsageInfo {
16941696
KernargSegmentPtrID = 4,
16951697
DispatchIdID = 5,
16961698
FlatScratchInitID = 6,
1697-
PrivateSegmentSizeID = 7
1699+
PrivateSegmentSizeID = 7,
1700+
LDSKernelIdID = 8
16981701
};
16991702

17001703
// Returns the size in number of SGPRs for preload user SGPR field.
1701-
static unsigned getNumUserSGPRForField(UserSGPRID ID) {
1704+
static constexpr unsigned getNumUserSGPRForField(UserSGPRID ID) {
17021705
switch (ID) {
17031706
case ImplicitBufferPtrID:
17041707
return 2;
@@ -1716,6 +1719,8 @@ class GCNUserSGPRUsageInfo {
17161719
return 2;
17171720
case PrivateSegmentSizeID:
17181721
return 1;
1722+
case LDSKernelIdID:
1723+
return 1;
17191724
}
17201725
llvm_unreachable("Unknown UserSGPRID.");
17211726
}
@@ -1744,9 +1749,13 @@ class GCNUserSGPRUsageInfo {
17441749

17451750
bool PrivateSegmentSize = false;
17461751

1752+
bool LDSKernelId = false;
1753+
17471754
unsigned NumKernargPreloadSGPRs = 0;
17481755

17491756
unsigned NumUsedUserSGPRs = 0;
1757+
1758+
unsigned NumSyntheticSGPRs = 0;
17501759
};
17511760

17521761
} // end namespace llvm

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 17 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -2422,7 +2422,7 @@ void SITargetLowering::allocateSpecialInputSGPRs(
24222422
if (Info.hasWorkGroupIDZ())
24232423
allocateSGPR32Input(CCInfo, ArgInfo.WorkGroupIDZ);
24242424

2425-
if (Info.hasLDSKernelId())
2425+
if (UserSGPRInfo.hasLDSKernelId())
24262426
allocateSGPR32Input(CCInfo, ArgInfo.LDSKernelId);
24272427
}
24282428

@@ -2545,8 +2545,8 @@ void SITargetLowering::allocatePreloadKernArgSGPRs(
25452545
unsigned Padding = ArgOffset - LastExplicitArgOffset;
25462546
unsigned PaddingSGPRs = alignTo(Padding, 4) / 4;
25472547
// Check for free user SGPRs for preloading.
2548-
if (PaddingSGPRs + NumAllocSGPRs + 1 /*Synthetic SGPRs*/ >
2549-
SGPRInfo.getNumFreeUserSGPRs()) {
2548+
if (PaddingSGPRs + NumAllocSGPRs >
2549+
SGPRInfo.getNumFreeKernargPreloadSGPRs()) {
25502550
InPreloadSequence = false;
25512551
break;
25522552
}
@@ -2574,7 +2574,8 @@ void SITargetLowering::allocateLDSKernelId(CCState &CCInfo, MachineFunction &MF,
25742574
const SIRegisterInfo &TRI,
25752575
SIMachineFunctionInfo &Info) const {
25762576
// Always allocate this last since it is a synthetic preload.
2577-
if (Info.hasLDSKernelId()) {
2577+
const GCNUserSGPRUsageInfo &UserSGPRInfo = Info.getUserSGPRInfo();
2578+
if (UserSGPRInfo.hasLDSKernelId()) {
25782579
Register Reg = Info.addLDSKernelId();
25792580
MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
25802581
CCInfo.AllocateReg(Reg);
@@ -2824,7 +2825,7 @@ SDValue SITargetLowering::LowerFormalArguments(
28242825
const GCNUserSGPRUsageInfo &UserSGPRInfo = Info->getUserSGPRInfo();
28252826
assert(!UserSGPRInfo.hasDispatchPtr() &&
28262827
!UserSGPRInfo.hasKernargSegmentPtr() && !Info->hasWorkGroupInfo() &&
2827-
!Info->hasLDSKernelId() && !Info->hasWorkItemIDX() &&
2828+
!UserSGPRInfo.hasLDSKernelId() && !Info->hasWorkItemIDX() &&
28282829
!Info->hasWorkItemIDY() && !Info->hasWorkItemIDZ());
28292830
(void)UserSGPRInfo;
28302831
if (!Subtarget->enableFlatScratch())
@@ -3024,13 +3025,20 @@ SDValue SITargetLowering::LowerFormalArguments(
30243025
NewArg = DAG.getMergeValues({NewArg, Chain}, DL);
30253026
}
30263027
} else {
3027-
#ifndef NDEBUG
3028+
// Hidden arguments that are in the kernel signature must be preloded to
3029+
// user SGPRs, or loaded via the implicit_arg ptr. Print a diagnostic
3030+
// error if a hidden argument is in the argument list and is not
3031+
// preloaded.
30283032
if (Arg.isOrigArg()) {
30293033
Argument *OrigArg = Fn.getArg(Arg.getOrigArgIndex());
3030-
assert(!OrigArg->hasAttribute("amdgpu-hidden-argument") &&
3031-
"Hidden arguments should be preloaded");
3034+
if (OrigArg->hasAttribute("amdgpu-hidden-argument")) {
3035+
DiagnosticInfoUnsupported NonPreloadHiddenArg(
3036+
*OrigArg->getParent(),
3037+
"Hidden argument in kernel signature was not preloaded",
3038+
DL.getDebugLoc());
3039+
DAG.getContext()->diagnose(NonPreloadHiddenArg);
3040+
}
30323041
}
3033-
#endif // NDEBUG
30343042

30353043
NewArg =
30363044
lowerKernargMemParameter(DAG, VT, MemVT, DL, Chain, Offset,

llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,7 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const Function &F,
3838
const GCNSubtarget *STI)
3939
: AMDGPUMachineFunction(F, *STI), Mode(F, *STI), GWSResourcePSV(getTM(STI)),
4040
UserSGPRInfo(F, *STI), WorkGroupIDX(false), WorkGroupIDY(false),
41-
WorkGroupIDZ(false), WorkGroupInfo(false), LDSKernelId(false),
41+
WorkGroupIDZ(false), WorkGroupInfo(false),
4242
PrivateSegmentWaveByteOffset(false), WorkItemIDX(false),
4343
WorkItemIDY(false), WorkItemIDZ(false), ImplicitArgPtr(false),
4444
GITPtrHigh(0xffffffff), HighBitsOf32BitAddress(0) {
@@ -131,9 +131,6 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const Function &F,
131131
if (!F.hasFnAttribute("amdgpu-no-workitem-id-z") &&
132132
ST.getMaxWorkitemID(F, 2) != 0)
133133
WorkItemIDZ = true;
134-
135-
if (!IsKernel && !F.hasFnAttribute("amdgpu-no-lds-kernel-id"))
136-
LDSKernelId = true;
137134
}
138135

139136
if (isEntryFunction()) {

llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -461,7 +461,6 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction,
461461
bool WorkGroupIDY : 1;
462462
bool WorkGroupIDZ : 1;
463463
bool WorkGroupInfo : 1;
464-
bool LDSKernelId : 1;
465464
bool PrivateSegmentWaveByteOffset : 1;
466465

467466
bool WorkItemIDX : 1; // Always initialized.
@@ -822,8 +821,6 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction,
822821
return ArgInfo.WorkGroupInfo.getRegister();
823822
}
824823

825-
bool hasLDSKernelId() const { return LDSKernelId; }
826-
827824
// Add special VGPR inputs
828825
void setWorkItemIDX(ArgDescriptor Arg) {
829826
ArgInfo.WorkItemIDX = Arg;
Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
; RUN: not llc -global-isel=1 -mtriple=amdgcn--amdhsa -mcpu=gfx940 -start-after=amdgpu-lower-kernel-arguments < %s 2>&1 | FileCheck -check-prefix=ERROR %s
2+
; RUN: not llc -global-isel=0 -mtriple=amdgcn--amdhsa -mcpu=gfx940 -start-after=amdgpu-lower-kernel-arguments < %s 2>&1 | FileCheck -check-prefix=ERROR %s
3+
4+
define amdgpu_kernel void @no_free_sgprs_block_count_x_no_preload_diag(ptr addrspace(1) inreg %out, i512 inreg, i32 inreg "amdgpu-hidden-argument" %_hidden_block_count_x) #0 {
5+
; ERROR: error: <unknown>:0:0: in function no_free_sgprs_block_count_x_no_preload_diag void (ptr addrspace(1), i512, i32): Hidden argument in kernel signature was not preloaded
6+
store i32 %_hidden_block_count_x, ptr addrspace(1) %out
7+
ret void
8+
}
9+
10+
define amdgpu_kernel void @preloadremainder_z_no_preload_diag(ptr addrspace(1) inreg %out, i256 inreg, i32 inreg "amdgpu-hidden-argument" %_hidden_block_count_x, i32 inreg "amdgpu-hidden-argument" %_hidden_block_count_y, i32 inreg "amdgpu-hidden-argument" %_hidden_block_count_z, i16 inreg "amdgpu-hidden-argument" %_hidden_group_size_x, i16 inreg "amdgpu-hidden-argument" %_hidden_group_size_y, i16 inreg "amdgpu-hidden-argument" %_hidden_group_size_z, i16 inreg "amdgpu-hidden-argument" %_hidden_remainder_x, i16 inreg "amdgpu-hidden-argument" %_hidden_remainder_y, i16 inreg "amdgpu-hidden-argument" %_hidden_remainder_z) #0 {
11+
; ERROR: error: <unknown>:0:0: in function preloadremainder_z_no_preload_diag void (ptr addrspace(1), i256, i32, i32, i32, i16, i16, i16, i16, i16, i16): Hidden argument in kernel signature was not preloaded
12+
%conv = zext i16 %_hidden_remainder_z to i32
13+
store i32 %conv, ptr addrspace(1) %out
14+
ret void
15+
}
16+
17+
attributes #0 = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }

llvm/test/CodeGen/AMDGPU/preload-implicit-kernargs.ll

Lines changed: 59 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -439,13 +439,13 @@ define amdgpu_kernel void @preload_workgroup_size_xyz(ptr addrspace(1) inreg %ou
439439
; GFX90a: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
440440
; GFX90a-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
441441
; GFX90a-NEXT: ; %bb.0:
442-
; GFX90a-NEXT: v_mov_b32_e32 v3, 0
443-
; GFX90a-NEXT: global_load_ushort v2, v3, s[4:5] offset:24
444442
; GFX90a-NEXT: s_lshr_b32 s0, s11, 16
445443
; GFX90a-NEXT: s_and_b32 s1, s11, 0xffff
444+
; GFX90a-NEXT: s_and_b32 s2, s12, 0xffff
445+
; GFX90a-NEXT: v_mov_b32_e32 v3, 0
446446
; GFX90a-NEXT: v_mov_b32_e32 v0, s1
447447
; GFX90a-NEXT: v_mov_b32_e32 v1, s0
448-
; GFX90a-NEXT: s_waitcnt vmcnt(0)
448+
; GFX90a-NEXT: v_mov_b32_e32 v2, s2
449449
; GFX90a-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7]
450450
; GFX90a-NEXT: s_endpgm
451451
%imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
@@ -554,28 +554,27 @@ define amdgpu_kernel void @preloadremainder_xyz(ptr addrspace(1) inreg %out) #0
554554
; GFX940: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
555555
; GFX940-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
556556
; GFX940-NEXT: ; %bb.0:
557+
; GFX940-NEXT: s_lshr_b32 s0, s9, 16
558+
; GFX940-NEXT: s_lshr_b32 s1, s8, 16
559+
; GFX940-NEXT: s_and_b32 s4, s9, 0xffff
557560
; GFX940-NEXT: v_mov_b32_e32 v3, 0
558-
; GFX940-NEXT: global_load_ushort v2, v3, s[0:1] offset:30
559-
; GFX940-NEXT: s_lshr_b32 s0, s8, 16
560-
; GFX940-NEXT: s_and_b32 s1, s9, 0xffff
561-
; GFX940-NEXT: v_mov_b32_e32 v0, s0
562-
; GFX940-NEXT: v_mov_b32_e32 v1, s1
563-
; GFX940-NEXT: s_waitcnt vmcnt(0)
561+
; GFX940-NEXT: v_mov_b32_e32 v0, s1
562+
; GFX940-NEXT: v_mov_b32_e32 v1, s4
563+
; GFX940-NEXT: v_mov_b32_e32 v2, s0
564564
; GFX940-NEXT: global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1
565565
; GFX940-NEXT: s_endpgm
566566
;
567567
; GFX90a-LABEL: preloadremainder_xyz:
568568
; GFX90a: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
569569
; GFX90a-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
570570
; GFX90a-NEXT: ; %bb.0:
571+
; GFX90a-NEXT: s_lshr_b32 s0, s13, 16
572+
; GFX90a-NEXT: s_lshr_b32 s1, s12, 16
573+
; GFX90a-NEXT: s_and_b32 s2, s13, 0xffff
571574
; GFX90a-NEXT: v_mov_b32_e32 v3, 0
572-
; GFX90a-NEXT: global_load_dword v0, v3, s[4:5] offset:26
573-
; GFX90a-NEXT: global_load_ushort v2, v3, s[4:5] offset:30
574-
; GFX90a-NEXT: s_lshr_b32 s0, s12, 16
575-
; GFX90a-NEXT: s_waitcnt vmcnt(1)
576-
; GFX90a-NEXT: v_lshrrev_b32_e32 v1, 16, v0
577-
; GFX90a-NEXT: v_mov_b32_e32 v0, s0
578-
; GFX90a-NEXT: s_waitcnt vmcnt(0)
575+
; GFX90a-NEXT: v_mov_b32_e32 v0, s1
576+
; GFX90a-NEXT: v_mov_b32_e32 v1, s2
577+
; GFX90a-NEXT: v_mov_b32_e32 v2, s0
579578
; GFX90a-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7]
580579
; GFX90a-NEXT: s_endpgm
581580
%imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
@@ -600,10 +599,8 @@ define amdgpu_kernel void @no_free_sgprs_preloadremainder_z(ptr addrspace(1) inr
600599
; GFX940: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
601600
; GFX940-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
602601
; GFX940-NEXT: ; %bb.0:
603-
; GFX940-NEXT: s_load_dword s0, s[4:5], 0x1c
602+
; GFX940-NEXT: s_lshr_b32 s0, s15, 16
604603
; GFX940-NEXT: v_mov_b32_e32 v0, 0
605-
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
606-
; GFX940-NEXT: s_lshr_b32 s0, s0, 16
607604
; GFX940-NEXT: v_mov_b32_e32 v1, s0
608605
; GFX940-NEXT: global_store_dword v0, v1, s[8:9] sc0 sc1
609606
; GFX940-NEXT: s_endpgm
@@ -627,7 +624,7 @@ define amdgpu_kernel void @no_free_sgprs_preloadremainder_z(ptr addrspace(1) inr
627624
ret void
628625
}
629626

630-
; Check for consistency between isel and earlier passes preload SGPR accounting.
627+
; Check for consistency between isel and earlier passes preload SGPR accounting with max preload SGPRs.
631628

632629
define amdgpu_kernel void @preload_block_max_user_sgprs(ptr addrspace(1) inreg %out, i192 inreg %t0, i32 inreg %t1) #0 {
633630
; GFX940-LABEL: preload_block_max_user_sgprs:
@@ -655,4 +652,46 @@ define amdgpu_kernel void @preload_block_max_user_sgprs(ptr addrspace(1) inreg %
655652
ret void
656653
}
657654

655+
define amdgpu_kernel void @preload_block_count_z_workgroup_size_z_remainder_z(ptr addrspace(1) inreg %out) #0 {
656+
; GFX940-LABEL: preload_block_count_z_workgroup_size_z_remainder_z:
657+
; GFX940: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
658+
; GFX940-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
659+
; GFX940-NEXT: ; %bb.0:
660+
; GFX940-NEXT: s_lshr_b32 s0, s9, 16
661+
; GFX940-NEXT: s_and_b32 s1, s8, 0xffff
662+
; GFX940-NEXT: v_mov_b32_e32 v3, 0
663+
; GFX940-NEXT: v_mov_b32_e32 v0, s6
664+
; GFX940-NEXT: v_mov_b32_e32 v1, s1
665+
; GFX940-NEXT: v_mov_b32_e32 v2, s0
666+
; GFX940-NEXT: global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1
667+
; GFX940-NEXT: s_endpgm
668+
;
669+
; GFX90a-LABEL: preload_block_count_z_workgroup_size_z_remainder_z:
670+
; GFX90a: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
671+
; GFX90a-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
672+
; GFX90a-NEXT: ; %bb.0:
673+
; GFX90a-NEXT: s_lshr_b32 s0, s13, 16
674+
; GFX90a-NEXT: s_and_b32 s1, s12, 0xffff
675+
; GFX90a-NEXT: v_mov_b32_e32 v3, 0
676+
; GFX90a-NEXT: v_mov_b32_e32 v0, s10
677+
; GFX90a-NEXT: v_mov_b32_e32 v1, s1
678+
; GFX90a-NEXT: v_mov_b32_e32 v2, s0
679+
; GFX90a-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7]
680+
; GFX90a-NEXT: s_endpgm
681+
%imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
682+
%gep0 = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 8
683+
%gep1 = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 16
684+
%gep2 = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 22
685+
%load0 = load i32, ptr addrspace(4) %gep0
686+
%load1 = load i16, ptr addrspace(4) %gep1
687+
%load2 = load i16, ptr addrspace(4) %gep2
688+
%conv1 = zext i16 %load1 to i32
689+
%conv2 = zext i16 %load2 to i32
690+
%ins.0 = insertelement <3 x i32> poison, i32 %load0, i32 0
691+
%ins.1 = insertelement <3 x i32> %ins.0, i32 %conv1, i32 1
692+
%ins.2 = insertelement <3 x i32> %ins.1, i32 %conv2, i32 2
693+
store <3 x i32> %ins.2, ptr addrspace(1) %out
694+
ret void
695+
}
696+
658697
attributes #0 = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }

llvm/test/CodeGen/AMDGPU/preload-kernargs-IR-lowering.ll

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -187,17 +187,15 @@ define amdgpu_kernel void @test_preload_IR_lowering_kernel_8(ptr addrspace(1) %i
187187
; PRELOAD-8-LABEL: define {{[^@]+}}@test_preload_IR_lowering_kernel_8
188188
; PRELOAD-8-SAME: (ptr addrspace(1) inreg [[IN:%.*]], ptr addrspace(1) inreg [[IN1:%.*]], ptr addrspace(1) inreg [[IN2:%.*]], ptr addrspace(1) inreg [[IN3:%.*]], ptr addrspace(1) inreg [[OUT:%.*]], ptr addrspace(1) inreg [[OUT1:%.*]], ptr addrspace(1) inreg [[OUT2:%.*]], ptr addrspace(1) inreg [[OUT3:%.*]]) #[[ATTR0]] {
189189
; PRELOAD-8-NEXT: [[TEST_PRELOAD_IR_LOWERING_KERNEL_8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
190-
; PRELOAD-8-NEXT: [[OUT2_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_8_KERNARG_SEGMENT]], i64 48
191-
; PRELOAD-8-NEXT: [[OUT2_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT2_KERNARG_OFFSET]], align 16, !invariant.load [[META0:![0-9]+]]
192190
; PRELOAD-8-NEXT: [[OUT3_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_8_KERNARG_SEGMENT]], i64 56
193-
; PRELOAD-8-NEXT: [[OUT3_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT3_KERNARG_OFFSET]], align 8, !invariant.load [[META0]]
191+
; PRELOAD-8-NEXT: [[OUT3_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT3_KERNARG_OFFSET]], align 8, !invariant.load [[META0:![0-9]+]]
194192
; PRELOAD-8-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN]], align 4
195193
; PRELOAD-8-NEXT: [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[IN1]], align 4
196194
; PRELOAD-8-NEXT: [[LOAD2:%.*]] = load i32, ptr addrspace(1) [[IN2]], align 4
197195
; PRELOAD-8-NEXT: [[LOAD3:%.*]] = load i32, ptr addrspace(1) [[IN3]], align 4
198196
; PRELOAD-8-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT]], align 4
199197
; PRELOAD-8-NEXT: store i32 [[LOAD1]], ptr addrspace(1) [[OUT1]], align 4
200-
; PRELOAD-8-NEXT: store i32 [[LOAD2]], ptr addrspace(1) [[OUT2_LOAD]], align 4
198+
; PRELOAD-8-NEXT: store i32 [[LOAD2]], ptr addrspace(1) [[OUT2]], align 4
201199
; PRELOAD-8-NEXT: store i32 [[LOAD3]], ptr addrspace(1) [[OUT3_LOAD]], align 4
202200
; PRELOAD-8-NEXT: ret void
203201
;

0 commit comments

Comments
 (0)