Skip to content

Commit 173c682

Browse files
authored
[AMDGPU] Enable unaligned scratch accesses (#110219)
This allows us to emit wide generic and scratch memory accesses when we do not have alignment information. In cases where accesses happen to be properly aligned or where generic accesses do not go to scratch memory, this improves performance of the generated code by a factor of up to 16x and reduces code size, especially when lowering memcpy and memmove intrinsics. Also: make the use of the FeatureUnalignedScratchAccess feature more consistent: FeatureUnalignedScratchAccess and EnableFlatScratch are now orthogonal, whereas previously the code assumed in some places that the latter implies the former. Part of SWDEV-455845.
1 parent 81bd712 commit 173c682

15 files changed

+6082
-17791
lines changed

llvm/lib/Target/AMDGPU/AMDGPU.td

Lines changed: 12 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1178,9 +1178,9 @@ def FeatureGFX9 : GCNSubtargetFeatureGeneration<"GFX9",
11781178
FeatureAddNoCarryInsts, FeatureGFX8Insts, FeatureGFX7GFX8GFX9Insts,
11791179
FeatureScalarFlatScratchInsts, FeatureScalarAtomics, FeatureR128A16,
11801180
FeatureA16, FeatureSMemTimeInst, FeatureFastDenormalF32, FeatureSupportsXNACK,
1181-
FeatureUnalignedBufferAccess, FeatureUnalignedDSAccess,
1182-
FeatureNegativeScratchOffsetBug, FeatureGWS, FeatureDefaultComponentZero,
1183-
FeatureVmemWriteVgprInOrder
1181+
FeatureUnalignedBufferAccess, FeatureUnalignedScratchAccess,
1182+
FeatureUnalignedDSAccess, FeatureNegativeScratchOffsetBug, FeatureGWS,
1183+
FeatureDefaultComponentZero,FeatureVmemWriteVgprInOrder
11841184
]
11851185
>;
11861186

@@ -1199,9 +1199,9 @@ def FeatureGFX10 : GCNSubtargetFeatureGeneration<"GFX10",
11991199
FeatureVOP3Literal, FeatureDPP8, FeatureExtendedImageInsts,
12001200
FeatureNoDataDepHazard, FeaturePkFmacF16Inst,
12011201
FeatureA16, FeatureSMemTimeInst, FeatureFastDenormalF32, FeatureG16,
1202-
FeatureUnalignedBufferAccess, FeatureUnalignedDSAccess, FeatureImageInsts,
1203-
FeatureGDS, FeatureGWS, FeatureDefaultComponentZero,
1204-
FeatureMaxHardClauseLength63,
1202+
FeatureUnalignedBufferAccess, FeatureUnalignedScratchAccess,
1203+
FeatureUnalignedDSAccess, FeatureImageInsts, FeatureGDS, FeatureGWS,
1204+
FeatureDefaultComponentZero, FeatureMaxHardClauseLength63,
12051205
FeatureAtomicFMinFMaxF32GlobalInsts, FeatureAtomicFMinFMaxF64GlobalInsts,
12061206
FeatureAtomicFMinFMaxF32FlatInsts, FeatureAtomicFMinFMaxF64FlatInsts,
12071207
FeatureVmemWriteVgprInOrder
@@ -1223,9 +1223,9 @@ def FeatureGFX11 : GCNSubtargetFeatureGeneration<"GFX11",
12231223
FeatureVOP3Literal, FeatureDPP8, FeatureExtendedImageInsts,
12241224
FeatureNoDataDepHazard, FeaturePkFmacF16Inst,
12251225
FeatureA16, FeatureFastDenormalF32, FeatureG16,
1226-
FeatureUnalignedBufferAccess, FeatureUnalignedDSAccess, FeatureGDS,
1227-
FeatureGWS, FeatureDefaultComponentZero,
1228-
FeatureMaxHardClauseLength32,
1226+
FeatureUnalignedBufferAccess, FeatureUnalignedScratchAccess,
1227+
FeatureUnalignedDSAccess, FeatureGDS, FeatureGWS,
1228+
FeatureDefaultComponentZero, FeatureMaxHardClauseLength32,
12291229
FeatureAtomicFMinFMaxF32GlobalInsts, FeatureAtomicFMinFMaxF32FlatInsts,
12301230
FeatureVmemWriteVgprInOrder
12311231
]
@@ -1246,9 +1246,9 @@ def FeatureGFX12 : GCNSubtargetFeatureGeneration<"GFX12",
12461246
FeatureVOP3Literal, FeatureDPP8,
12471247
FeatureNoDataDepHazard, FeaturePkFmacF16Inst,
12481248
FeatureA16, FeatureFastDenormalF32, FeatureG16,
1249-
FeatureUnalignedBufferAccess, FeatureUnalignedDSAccess,
1250-
FeatureTrue16BitInsts, FeatureDefaultComponentBroadcast,
1251-
FeatureMaxHardClauseLength32,
1249+
FeatureUnalignedBufferAccess, FeatureUnalignedScratchAccess,
1250+
FeatureUnalignedDSAccess, FeatureTrue16BitInsts,
1251+
FeatureDefaultComponentBroadcast, FeatureMaxHardClauseLength32,
12521252
FeatureAtomicFMinFMaxF32GlobalInsts, FeatureAtomicFMinFMaxF32FlatInsts,
12531253
FeatureAgentScopeFineGrainedRemoteMemoryAtomics
12541254
]

llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -387,8 +387,8 @@ bool GCNTTIImpl::isLegalToVectorizeMemChain(unsigned ChainSizeInBytes,
387387
// them later if they may access private memory. We don't have enough context
388388
// here, and legalization can handle it.
389389
if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS) {
390-
return (Alignment >= 4 || ST->hasUnalignedScratchAccess()) &&
391-
ChainSizeInBytes <= ST->getMaxPrivateElementSize();
390+
return (Alignment >= 4 || ST->hasUnalignedScratchAccessEnabled()) &&
391+
ChainSizeInBytes <= ST->getMaxPrivateElementSize();
392392
}
393393
return true;
394394
}

llvm/lib/Target/AMDGPU/GCNSubtarget.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -591,6 +591,10 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
591591
return UnalignedScratchAccess;
592592
}
593593

594+
bool hasUnalignedScratchAccessEnabled() const {
595+
return UnalignedScratchAccess && UnalignedAccessMode;
596+
}
597+
594598
bool hasUnalignedAccessMode() const {
595599
return UnalignedAccessMode;
596600
}

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 3 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1824,26 +1824,16 @@ bool SITargetLowering::allowsMisalignedMemoryAccessesImpl(
18241824
Subtarget->hasUnalignedDSAccessEnabled();
18251825
}
18261826

1827-
if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS) {
1828-
bool AlignedBy4 = Alignment >= Align(4);
1829-
if (IsFast)
1830-
*IsFast = AlignedBy4;
1831-
1832-
return AlignedBy4 ||
1833-
Subtarget->enableFlatScratch() ||
1834-
Subtarget->hasUnalignedScratchAccess();
1835-
}
1836-
18371827
// FIXME: We have to be conservative here and assume that flat operations
18381828
// will access scratch. If we had access to the IR function, then we
18391829
// could determine if any private memory was used in the function.
1840-
if (AddrSpace == AMDGPUAS::FLAT_ADDRESS &&
1841-
!Subtarget->hasUnalignedScratchAccess()) {
1830+
if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS ||
1831+
AddrSpace == AMDGPUAS::FLAT_ADDRESS) {
18421832
bool AlignedBy4 = Alignment >= Align(4);
18431833
if (IsFast)
18441834
*IsFast = AlignedBy4;
18451835

1846-
return AlignedBy4;
1836+
return AlignedBy4 || Subtarget->hasUnalignedScratchAccessEnabled();
18471837
}
18481838

18491839
// So long as they are correct, wide global memory operations perform better

0 commit comments

Comments
 (0)