
Commit 787ddf9

[AMDGPU] Enable unaligned scratch accesses (llvm#110219)
This allows us to emit wide generic and scratch memory accesses when we do not have alignment information. In cases where accesses happen to be properly aligned or where generic accesses do not go to scratch memory, this improves performance of the generated code by a factor of up to 16x and reduces code size, especially when lowering memcpy and memmove intrinsics.

Also: make the use of the FeatureUnalignedScratchAccess feature more consistent. FeatureUnalignedScratchAccess and EnableFlatScratch are now orthogonal, whereas before, some code assumed that the latter implies the former.

Part of SWDEV-455845.

(cherry picked from commit 173c682, resolved merge conflicts in AMDGPU.td, flat-scratch.ll, memcpy-libcall.ll)

Change-Id: I90383aa32ea4587b9dedd48fc6fa297f4263e8a8
1 parent: a3c72d5, commit: 787ddf9

15 files changed: 6082 additions, 17788 deletions
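
The "up to 16x" figure in the commit message comes from the width of the widest access: when alignment is unknown, a lowered memcpy that previously had to fall back to byte-wise operations can now use 16-byte accesses. The standalone C++ sketch below is not part of the patch; it only illustrates that arithmetic, and the helper name and sizes (accessesFor, a 16-byte maximum width) are hypothetical.

// Standalone illustration (not LLVM code): how allowing wide unaligned
// accesses shrinks the number of memory operations a lowered memcpy needs.
#include <cstdio>

// Hypothetical helper: number of accesses needed to copy `Bytes` bytes when
// each access may be at most `AccessWidth` bytes wide.
static unsigned accessesFor(unsigned Bytes, unsigned AccessWidth) {
  return (Bytes + AccessWidth - 1) / AccessWidth;
}

int main() {
  const unsigned CopyBytes = 1024;
  // Without unaligned scratch access, an unknown-alignment private copy may
  // have to fall back to 1-byte operations.
  unsigned Narrow = accessesFor(CopyBytes, 1);
  // With unaligned scratch access enabled, 16-byte (dwordx4-sized) operations
  // can be used even when alignment is unknown.
  unsigned Wide = accessesFor(CopyBytes, 16);
  std::printf("narrow: %u accesses, wide: %u accesses (%ux fewer)\n",
              Narrow, Wide, Narrow / Wide);
  return 0;
}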

llvm/lib/Target/AMDGPU/AMDGPU.td

Lines changed: 12 additions & 11 deletions
@@ -1170,8 +1170,9 @@ def FeatureGFX9 : GCNSubtargetFeatureGeneration<"GFX9",
    FeatureAddNoCarryInsts, FeatureGFX8Insts, FeatureGFX7GFX8GFX9Insts,
    FeatureScalarFlatScratchInsts, FeatureScalarAtomics, FeatureR128A16,
    FeatureA16, FeatureSMemTimeInst, FeatureFastDenormalF32, FeatureSupportsXNACK,
-   FeatureUnalignedBufferAccess, FeatureUnalignedDSAccess,
-   FeatureNegativeScratchOffsetBug, FeatureGWS, FeatureDefaultComponentZero
+   FeatureUnalignedBufferAccess, FeatureUnalignedScratchAccess,
+   FeatureUnalignedDSAccess, FeatureNegativeScratchOffsetBug, FeatureGWS,
+   FeatureDefaultComponentZero
   ]
 >;
 
@@ -1190,9 +1191,9 @@ def FeatureGFX10 : GCNSubtargetFeatureGeneration<"GFX10",
    FeatureVOP3Literal, FeatureDPP8, FeatureExtendedImageInsts,
    FeatureNoDataDepHazard, FeaturePkFmacF16Inst,
    FeatureA16, FeatureSMemTimeInst, FeatureFastDenormalF32, FeatureG16,
-   FeatureUnalignedBufferAccess, FeatureUnalignedDSAccess, FeatureImageInsts,
-   FeatureGDS, FeatureGWS, FeatureDefaultComponentZero,
-   FeatureMaxHardClauseLength63,
+   FeatureUnalignedBufferAccess, FeatureUnalignedScratchAccess,
+   FeatureUnalignedDSAccess, FeatureImageInsts, FeatureGDS, FeatureGWS,
+   FeatureDefaultComponentZero, FeatureMaxHardClauseLength63,
    FeatureAtomicFMinFMaxF32GlobalInsts, FeatureAtomicFMinFMaxF64GlobalInsts,
    FeatureAtomicFMinFMaxF32FlatInsts, FeatureAtomicFMinFMaxF64FlatInsts
   ]
@@ -1213,9 +1214,9 @@ def FeatureGFX11 : GCNSubtargetFeatureGeneration<"GFX11",
    FeatureVOP3Literal, FeatureDPP8, FeatureExtendedImageInsts,
    FeatureNoDataDepHazard, FeaturePkFmacF16Inst,
    FeatureA16, FeatureFastDenormalF32, FeatureG16,
-   FeatureUnalignedBufferAccess, FeatureUnalignedDSAccess, FeatureGDS,
-   FeatureGWS, FeatureDefaultComponentZero,
-   FeatureMaxHardClauseLength32,
+   FeatureUnalignedBufferAccess, FeatureUnalignedScratchAccess,
+   FeatureUnalignedDSAccess, FeatureGDS, FeatureGWS,
+   FeatureDefaultComponentZero, FeatureMaxHardClauseLength32,
    FeatureAtomicFMinFMaxF32GlobalInsts, FeatureAtomicFMinFMaxF32FlatInsts
   ]
 >;
@@ -1235,9 +1236,9 @@ def FeatureGFX12 : GCNSubtargetFeatureGeneration<"GFX12",
    FeatureVOP3Literal, FeatureDPP8,
    FeatureNoDataDepHazard, FeaturePkFmacF16Inst,
    FeatureA16, FeatureFastDenormalF32, FeatureG16,
-   FeatureUnalignedBufferAccess, FeatureUnalignedDSAccess,
-   FeatureTrue16BitInsts, FeatureDefaultComponentBroadcast,
-   FeatureMaxHardClauseLength32,
+   FeatureUnalignedBufferAccess, FeatureUnalignedScratchAccess,
+   FeatureUnalignedDSAccess, FeatureTrue16BitInsts,
+   FeatureDefaultComponentBroadcast, FeatureMaxHardClauseLength32,
    FeatureAtomicFMinFMaxF32GlobalInsts, FeatureAtomicFMinFMaxF32FlatInsts,
    FeatureAgentScopeFineGrainedRemoteMemoryAtomics
   ]
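
Background for the hunks above, in case the plumbing is unfamiliar: each SubtargetFeature listed in AMDGPU.td becomes a boolean member of GCNSubtarget that TableGen-generated parsing code sets when the corresponding feature string (here "unaligned-scratch-access") is enabled, so adding FeatureUnalignedScratchAccess to the GFX9/10/11/12 generation lists turns the capability bit on by default for those targets. The standalone sketch below mirrors that pattern with a toy class; it is not generated TableGen output, and the names ToySubtarget and parseFeatures are illustrative only.

// Simplified, standalone mirror (not generated TableGen output) of how a
// subtarget feature string toggles a boolean flag on the subtarget object.
#include <sstream>
#include <string>

struct ToySubtarget {
  // Mirrors GCNSubtarget::UnalignedScratchAccess, default off.
  bool UnalignedScratchAccess = false;
  bool UnalignedAccessMode = false;

  // Plays the role of the generated ParseSubtargetFeatures(): scan a
  // comma-separated "+feature" list and set the corresponding flags.
  void parseFeatures(const std::string &FS) {
    std::stringstream SS(FS);
    std::string Tok;
    while (std::getline(SS, Tok, ',')) {
      if (Tok == "+unaligned-scratch-access")
        UnalignedScratchAccess = true;
      else if (Tok == "+unaligned-access-mode")
        UnalignedAccessMode = true;
    }
  }
};

int main() {
  ToySubtarget ST;
  // After this patch, the GFX9+ generation definitions include
  // FeatureUnalignedScratchAccess by default; here the flag is set explicitly
  // for illustration.
  ST.parseFeatures("+unaligned-scratch-access,+unaligned-access-mode");
  return ST.UnalignedScratchAccess && ST.UnalignedAccessMode ? 0 : 1;
}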

llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp

Lines changed: 2 additions & 2 deletions
@@ -387,8 +387,8 @@ bool GCNTTIImpl::isLegalToVectorizeMemChain(unsigned ChainSizeInBytes,
   // them later if they may access private memory. We don't have enough context
   // here, and legalization can handle it.
   if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS) {
-    return (Alignment >= 4 || ST->hasUnalignedScratchAccess()) &&
-           ChainSizeInBytes <= ST->getMaxPrivateElementSize();
+    return (Alignment >= 4 || ST->hasUnalignedScratchAccessEnabled()) &&
+           ChainSizeInBytes <= ST->getMaxPrivateElementSize();
   }
   return true;
 }
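
This hook is consulted when the load/store vectorizer considers merging a chain of private-memory accesses. A standalone mirror of the updated predicate follows, with made-up values; ToySubtarget and the MaxPrivateElementSize of 16 are illustrative assumptions, not the target's actual configuration.

// Standalone mirror (not LLVM code) of the updated private-address check in
// GCNTTIImpl::isLegalToVectorizeMemChain.
#include <cassert>

struct ToySubtarget {
  bool UnalignedScratchAccess = true;  // hardware capability
  bool UnalignedAccessMode = true;     // mode bit that actually enables it
  unsigned MaxPrivateElementSize = 16; // illustrative value only

  bool hasUnalignedScratchAccessEnabled() const {
    return UnalignedScratchAccess && UnalignedAccessMode;
  }
};

// Mirrors the body of the hook for the private address space.
bool isLegalToVectorizePrivateChain(unsigned ChainSizeInBytes,
                                    unsigned Alignment,
                                    const ToySubtarget &ST) {
  return (Alignment >= 4 || ST.hasUnalignedScratchAccessEnabled()) &&
         ChainSizeInBytes <= ST.MaxPrivateElementSize;
}

int main() {
  ToySubtarget ST;
  // A 16-byte chain with 1-byte alignment is now accepted when unaligned
  // scratch access is enabled...
  assert(isLegalToVectorizePrivateChain(16, 1, ST));
  // ...but rejected when the mode bit is off and alignment is insufficient.
  ST.UnalignedAccessMode = false;
  assert(!isLegalToVectorizePrivateChain(16, 1, ST));
  return 0;
}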

llvm/lib/Target/AMDGPU/GCNSubtarget.h

Lines changed: 4 additions & 0 deletions
@@ -590,6 +590,10 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
     return UnalignedScratchAccess;
   }
 
+  bool hasUnalignedScratchAccessEnabled() const {
+    return UnalignedScratchAccess && UnalignedAccessMode;
+  }
+
   bool hasUnalignedAccessMode() const {
     return UnalignedAccessMode;
   }
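
The new helper follows the existing hasUnalignedDSAccessEnabled() pattern seen below in SIISelLowering.cpp: hasUnalignedScratchAccess() reports only the hardware capability that AMDGPU.td now sets by default, while hasUnalignedScratchAccessEnabled() also requires the unaligned-access-mode bit, and it is this combined form that the TTI and lowering changes in this commit consult. The short standalone sketch below just spells out the resulting truth table; the Flags struct is illustrative, not LLVM code.

// Standalone truth-table check (not LLVM code) for the difference between the
// capability query and the new combined query added above.
#include <cstdio>

struct Flags {
  bool UnalignedScratchAccess; // capability, set per generation in AMDGPU.td
  bool UnalignedAccessMode;    // mode bit
};

// Mirror of hasUnalignedScratchAccess().
bool capability(const Flags &F) { return F.UnalignedScratchAccess; }
// Mirror of the new hasUnalignedScratchAccessEnabled().
bool enabled(const Flags &F) {
  return F.UnalignedScratchAccess && F.UnalignedAccessMode;
}

int main() {
  const Flags Cases[] = {{false, false}, {false, true}, {true, false}, {true, true}};
  for (const Flags &F : Cases)
    std::printf("access=%d mode=%d -> capability=%d enabled=%d\n",
                F.UnalignedScratchAccess, F.UnalignedAccessMode,
                capability(F), enabled(F));
  return 0;
}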

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 3 additions & 13 deletions
@@ -1826,26 +1826,16 @@ bool SITargetLowering::allowsMisalignedMemoryAccessesImpl(
            Subtarget->hasUnalignedDSAccessEnabled();
   }
 
-  if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS) {
-    bool AlignedBy4 = Alignment >= Align(4);
-    if (IsFast)
-      *IsFast = AlignedBy4;
-
-    return AlignedBy4 ||
-           Subtarget->enableFlatScratch() ||
-           Subtarget->hasUnalignedScratchAccess();
-  }
-
   // FIXME: We have to be conservative here and assume that flat operations
   // will access scratch. If we had access to the IR function, then we
   // could determine if any private memory was used in the function.
-  if (AddrSpace == AMDGPUAS::FLAT_ADDRESS &&
-      !Subtarget->hasUnalignedScratchAccess()) {
+  if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS ||
+      AddrSpace == AMDGPUAS::FLAT_ADDRESS) {
     bool AlignedBy4 = Alignment >= Align(4);
     if (IsFast)
       *IsFast = AlignedBy4;
 
-    return AlignedBy4;
+    return AlignedBy4 || Subtarget->hasUnalignedScratchAccessEnabled();
   }
 
   // So long as they are correct, wide global memory operations perform better
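
Net effect of this hunk: the private and flat address spaces now share one branch, accesses aligned to 4 bytes are still the only ones reported as fast, and under-aligned accesses are permitted exactly when the combined scratch predicate holds rather than whenever flat scratch happens to be enabled. The standalone sketch below mirrors that logic with a few worked cases; it is a simplification, not the actual allowsMisalignedMemoryAccessesImpl.

// Standalone mirror (not LLVM code) of the merged private/flat branch in
// SITargetLowering::allowsMisalignedMemoryAccessesImpl after this change.
#include <cassert>

struct ToySubtarget {
  bool UnalignedScratchAccess = true;
  bool UnalignedAccessMode = true;
  bool hasUnalignedScratchAccessEnabled() const {
    return UnalignedScratchAccess && UnalignedAccessMode;
  }
};

// Returns whether a misaligned private/flat access is allowed and, through
// IsFast, whether it is expected to be fast (only when 4-byte aligned).
bool allowsMisalignedPrivateOrFlat(unsigned AlignInBytes,
                                   const ToySubtarget &ST, bool *IsFast) {
  bool AlignedBy4 = AlignInBytes >= 4;
  if (IsFast)
    *IsFast = AlignedBy4;
  return AlignedBy4 || ST.hasUnalignedScratchAccessEnabled();
}

int main() {
  ToySubtarget ST;
  bool Fast = false;
  // Unknown/byte alignment: now allowed (so wide accesses can be emitted),
  // but not reported as fast.
  assert(allowsMisalignedPrivateOrFlat(1, ST, &Fast) && !Fast);
  // 4-byte alignment: allowed and fast, as before.
  assert(allowsMisalignedPrivateOrFlat(4, ST, &Fast) && Fast);
  // With the mode bit off, under-aligned accesses are rejected again.
  ST.UnalignedAccessMode = false;
  assert(!allowsMisalignedPrivateOrFlat(1, ST, &Fast));
  return 0;
}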
