Skip to content

Commit 9d0221e

Browse files
erichkeane authored and doru1004 committed
[OpenACC][Docs] Add a release note for Clang 21 (llvm#145938)
This patch adds a release note that explains the current status of OpenACC in Clang. Currently we cannot actually make an executable because the OpenACC dialect of MLIR doesn't support any amount of lowering to LLVM-IR, so the usefulness of OpenACC is entirely for front-end related uses, such as tooling or semantic checking.
1 parent c73e5e3 commit 9d0221e

File tree

6 files changed

+447
-674
lines changed

6 files changed

+447
-674
lines changed

llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp

Lines changed: 43 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -344,9 +344,12 @@ unsigned GCNTTIImpl::getMinVectorRegisterBitWidth() const {
344344
unsigned GCNTTIImpl::getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {
345345
if (Opcode == Instruction::Load || Opcode == Instruction::Store)
346346
return 32 * 4 / ElemWidth;
347-
return (ElemWidth == 16 && ST->has16BitInsts()) ? 2
348-
: (ElemWidth == 32 && ST->hasPackedFP32Ops()) ? 2
349-
: 1;
347+
// For a given width return the max number of elements that can be combined
348+
// into a wider bit value:
349+
return (ElemWidth == 8 && ST->has16BitInsts()) ? 4
350+
: (ElemWidth == 16 && ST->has16BitInsts()) ? 2
351+
: (ElemWidth == 32 && ST->hasPackedFP32Ops()) ? 2
352+
: 1;
350353
}
351354

352355
unsigned GCNTTIImpl::getLoadVectorFactor(unsigned VF, unsigned LoadSize,
@@ -1195,14 +1198,15 @@ InstructionCost GCNTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
11951198

11961199
Kind = improveShuffleKindFromMask(Kind, Mask, SrcTy, Index, SubTp);
11971200

1198-
// Larger vector widths may require additional instructions, but are
1199-
// typically cheaper than scalarized versions.
1200-
unsigned NumVectorElts = cast<FixedVectorType>(SrcTy)->getNumElements();
1201+
unsigned ScalarSize = DL.getTypeSizeInBits(SrcTy->getElementType());
12011202
if (ST->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS &&
1202-
DL.getTypeSizeInBits(SrcTy->getElementType()) == 16) {
1203-
bool HasVOP3P = ST->hasVOP3PInsts();
1203+
(ScalarSize == 16 || ScalarSize == 8)) {
1204+
// Larger vector widths may require additional instructions, but are
1205+
// typically cheaper than scalarized versions.
1206+
unsigned NumVectorElts = cast<FixedVectorType>(SrcTy)->getNumElements();
12041207
unsigned RequestedElts =
12051208
count_if(Mask, [](int MaskElt) { return MaskElt != -1; });
1209+
unsigned EltsPerReg = 32 / ScalarSize;
12061210
if (RequestedElts == 0)
12071211
return 0;
12081212
switch (Kind) {
@@ -1211,9 +1215,9 @@ InstructionCost GCNTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
12111215
case TTI::SK_PermuteSingleSrc: {
12121216
// With op_sel, VOP3P instructions can freely access the low half or high
12131217
// half of a register, so any swizzle of two elements is free.
1214-
if (HasVOP3P && NumVectorElts == 2)
1218+
if (ST->hasVOP3PInsts() && ScalarSize == 16 && NumVectorElts == 2)
12151219
return 0;
1216-
unsigned NumPerms = alignTo(RequestedElts, 2) / 2;
1220+
unsigned NumPerms = alignTo(RequestedElts, EltsPerReg) / EltsPerReg;
12171221
// SK_Broadcast just reuses the same mask
12181222
unsigned NumPermMasks = Kind == TTI::SK_Broadcast ? 1 : NumPerms;
12191223
return NumPerms + NumPermMasks;
@@ -1225,12 +1229,12 @@ InstructionCost GCNTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
12251229
return 0;
12261230
// Insert/extract subvectors only require shifts / extract code to get the
12271231
// relevant bits
1228-
return alignTo(RequestedElts, 2) / 2;
1232+
return alignTo(RequestedElts, EltsPerReg) / EltsPerReg;
12291233
}
12301234
case TTI::SK_PermuteTwoSrc:
12311235
case TTI::SK_Splice:
12321236
case TTI::SK_Select: {
1233-
unsigned NumPerms = alignTo(RequestedElts, 2) / 2;
1237+
unsigned NumPerms = alignTo(RequestedElts, EltsPerReg) / EltsPerReg;
12341238
// SK_Select just reuses the same mask
12351239
unsigned NumPermMasks = Kind == TTI::SK_Select ? 1 : NumPerms;
12361240
return NumPerms + NumPermMasks;
@@ -1505,3 +1509,30 @@ GCNTTIImpl::fpenvIEEEMode(const Instruction &I) const {
15051509
return AMDGPU::isShader(F->getCallingConv()) ? KnownIEEEMode::Off
15061510
: KnownIEEEMode::On;
15071511
}
1512+
1513+
InstructionCost GCNTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
1514+
Align Alignment,
1515+
unsigned AddressSpace,
1516+
TTI::TargetCostKind CostKind,
1517+
TTI::OperandValueInfo OpInfo,
1518+
const Instruction *I) const {
1519+
if (VectorType *VecTy = dyn_cast<VectorType>(Src)) {
1520+
if ((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
1521+
VecTy->getElementType()->isIntegerTy(8)) {
1522+
return divideCeil(DL.getTypeSizeInBits(VecTy) - 1,
1523+
getLoadStoreVecRegBitWidth(AddressSpace));
1524+
}
1525+
}
1526+
return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace, CostKind,
1527+
OpInfo, I);
1528+
}
1529+
1530+
unsigned GCNTTIImpl::getNumberOfParts(Type *Tp) const {
1531+
if (VectorType *VecTy = dyn_cast<VectorType>(Tp)) {
1532+
if (VecTy->getElementType()->isIntegerTy(8)) {
1533+
unsigned ElementCount = VecTy->getElementCount().getFixedValue();
1534+
return divideCeil(ElementCount - 1, 4);
1535+
}
1536+
}
1537+
return BaseT::getNumberOfParts(Tp);
1538+
}

llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -288,6 +288,20 @@ class GCNTTIImpl final : public BasicTTIImplBase<GCNTTIImpl> {
288288
/// "amdgpu-ieee"="true" and KnownIEEEMode::Off if we can assume
289289
/// "amdgpu-ieee"="false".
290290
KnownIEEEMode fpenvIEEEMode(const Instruction &I) const;
291+
292+
/// Account for loads of i8 vector types to have reduced cost. For
293+
/// example, the cost of loading 4 i8 values is the same as the cost of loading
294+
/// a single i32 value.
295+
InstructionCost getMemoryOpCost(
296+
unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace,
297+
TTI::TargetCostKind CostKind,
298+
TTI::OperandValueInfo OpInfo = {TTI::OK_AnyValue, TTI::OP_None},
299+
const Instruction *I = nullptr) const override;
300+
301+
/// When counting parts on AMD GPUs, account for i8s being grouped
302+
/// together under a single i32 value. Otherwise fall back to base
303+
/// implementation.
304+
unsigned getNumberOfParts(Type *Tp) const override;
291305
};
292306

293307
} // end namespace llvm

0 commit comments

Comments
 (0)