Skip to content

Commit 12409a1

Browse files
authored
[DirectX] Implement memcpy in DXIL CBuffer Access pass (#144436)
Fixes #141840 This PR implements support for the `memcpy` intrinsic in the DXIL CBuffer Access pass with the following restrictions: - The CBuffer Access must be the `src` operand of `memcpy` and must be direct (i.e., not a GEP) - The type of the CBuffer Access must be of an Array Type These restrictions greatly simplify the implementation of `memcpy` yet still covers the known uses in DML shaders. Furthermore, to prevent errors like #141840 from occurring silently again, this PR adds error reporting for unsupported users of globals in the DXIL CBuffer Access pass.
1 parent 92b5095 commit 12409a1

File tree

2 files changed

+419
-67
lines changed

2 files changed

+419
-67
lines changed

llvm/lib/Target/DirectX/DXILCBufferAccess.cpp

Lines changed: 203 additions & 67 deletions
Original file line numberDiff line numberDiff line change
@@ -11,9 +11,11 @@
1111
#include "llvm/Frontend/HLSL/CBuffer.h"
1212
#include "llvm/Frontend/HLSL/HLSLResource.h"
1313
#include "llvm/IR/IRBuilder.h"
14+
#include "llvm/IR/IntrinsicInst.h"
1415
#include "llvm/IR/IntrinsicsDirectX.h"
1516
#include "llvm/InitializePasses.h"
1617
#include "llvm/Pass.h"
18+
#include "llvm/Support/FormatVariadic.h"
1719
#include "llvm/Transforms/Utils/Local.h"
1820

1921
#define DEBUG_TYPE "dxil-cbuffer-access"
@@ -54,71 +56,106 @@ struct CBufferRowIntrin {
5456
}
5557
}
5658
};
57-
} // namespace
5859

59-
static size_t getOffsetForCBufferGEP(GEPOperator *GEP, GlobalVariable *Global,
60-
const DataLayout &DL) {
61-
// Since we should always have a constant offset, we should only ever have a
62-
// single GEP of indirection from the Global.
63-
assert(GEP->getPointerOperand() == Global &&
64-
"Indirect access to resource handle");
60+
// Helper for creating CBuffer handles and loading data from them
61+
struct CBufferResource {
62+
GlobalVariable *GVHandle;
63+
GlobalVariable *Member;
64+
size_t MemberOffset;
6565

66-
APInt ConstantOffset(DL.getIndexTypeSizeInBits(GEP->getType()), 0);
67-
bool Success = GEP->accumulateConstantOffset(DL, ConstantOffset);
68-
(void)Success;
69-
assert(Success && "Offsets into cbuffer globals must be constant");
66+
LoadInst *Handle;
7067

71-
if (auto *ATy = dyn_cast<ArrayType>(Global->getValueType()))
72-
ConstantOffset = hlsl::translateCBufArrayOffset(DL, ConstantOffset, ATy);
68+
CBufferResource(GlobalVariable *GVHandle, GlobalVariable *Member,
69+
size_t MemberOffset)
70+
: GVHandle(GVHandle), Member(Member), MemberOffset(MemberOffset) {}
7371

74-
return ConstantOffset.getZExtValue();
75-
}
72+
const DataLayout &getDataLayout() { return GVHandle->getDataLayout(); }
73+
Type *getValueType() { return Member->getValueType(); }
74+
iterator_range<ConstantDataSequential::user_iterator> users() {
75+
return Member->users();
76+
}
77+
78+
/// Get the byte offset of a Pointer-typed Value * `Val` relative to Member.
79+
/// `Val` can either be Member itself, or a GEP of a constant offset from
80+
/// Member
81+
size_t getOffsetForCBufferGEP(Value *Val) {
82+
assert(isa<PointerType>(Val->getType()) &&
83+
"Expected a pointer-typed value");
84+
85+
if (Val == Member)
86+
return 0;
87+
88+
if (auto *GEP = dyn_cast<GEPOperator>(Val)) {
89+
// Since we should always have a constant offset, we should only ever have
90+
// a single GEP of indirection from the Global.
91+
assert(GEP->getPointerOperand() == Member &&
92+
"Indirect access to resource handle");
93+
94+
const DataLayout &DL = getDataLayout();
95+
APInt ConstantOffset(DL.getIndexTypeSizeInBits(GEP->getType()), 0);
96+
bool Success = GEP->accumulateConstantOffset(DL, ConstantOffset);
97+
(void)Success;
98+
assert(Success && "Offsets into cbuffer globals must be constant");
99+
100+
if (auto *ATy = dyn_cast<ArrayType>(Member->getValueType()))
101+
ConstantOffset =
102+
hlsl::translateCBufArrayOffset(DL, ConstantOffset, ATy);
103+
104+
return ConstantOffset.getZExtValue();
105+
}
76106

77-
/// Replace access via cbuffer global with a load from the cbuffer handle
78-
/// itself.
79-
static void replaceAccess(LoadInst *LI, GlobalVariable *Global,
80-
GlobalVariable *HandleGV, size_t BaseOffset,
81-
SmallVectorImpl<WeakTrackingVH> &DeadInsts) {
82-
const DataLayout &DL = HandleGV->getDataLayout();
107+
llvm_unreachable("Expected Val to be a GlobalVariable or GEP");
108+
}
83109

84-
size_t Offset = BaseOffset;
85-
if (auto *GEP = dyn_cast<GEPOperator>(LI->getPointerOperand()))
86-
Offset += getOffsetForCBufferGEP(GEP, Global, DL);
87-
else if (LI->getPointerOperand() != Global)
88-
llvm_unreachable("Load instruction doesn't reference cbuffer global");
110+
/// Create a handle for this cbuffer resource using the IRBuilder `Builder`
111+
/// and sets the handle as the current one to use for subsequent calls to
112+
/// `loadValue`
113+
void createAndSetCurrentHandle(IRBuilder<> &Builder) {
114+
Handle = Builder.CreateLoad(GVHandle->getValueType(), GVHandle,
115+
GVHandle->getName());
116+
}
89117

90-
IRBuilder<> Builder(LI);
91-
auto *Handle = Builder.CreateLoad(HandleGV->getValueType(), HandleGV,
92-
HandleGV->getName());
93-
94-
Type *Ty = LI->getType();
95-
CBufferRowIntrin Intrin(DL, Ty->getScalarType());
96-
// The cbuffer consists of some number of 16-byte rows.
97-
unsigned int CurrentRow = Offset / hlsl::CBufferRowSizeInBytes;
98-
unsigned int CurrentIndex =
99-
(Offset % hlsl::CBufferRowSizeInBytes) / Intrin.EltSize;
100-
101-
auto *CBufLoad = Builder.CreateIntrinsic(
102-
Intrin.RetTy, Intrin.IID,
103-
{Handle, ConstantInt::get(Builder.getInt32Ty(), CurrentRow)}, nullptr,
104-
LI->getName());
105-
auto *Elt =
106-
Builder.CreateExtractValue(CBufLoad, {CurrentIndex++}, LI->getName());
107-
108-
Value *Result = nullptr;
109-
unsigned int Remaining =
110-
((DL.getTypeSizeInBits(Ty) / 8) / Intrin.EltSize) - 1;
111-
if (Remaining == 0) {
112-
// We only have a single element, so we're done.
113-
Result = Elt;
114-
115-
// However, if we loaded a <1 x T>, then we need to adjust the type here.
116-
if (auto *VT = dyn_cast<FixedVectorType>(LI->getType())) {
117-
assert(VT->getNumElements() == 1 && "Can't have multiple elements here");
118-
Result = Builder.CreateInsertElement(PoisonValue::get(VT), Result,
119-
Builder.getInt32(0));
118+
/// Load a value of type `Ty` at offset `Offset` using the handle from the
119+
/// last call to `createAndSetCurrentHandle`
120+
Value *loadValue(IRBuilder<> &Builder, Type *Ty, size_t Offset,
121+
const Twine &Name = "") {
122+
assert(Handle &&
123+
"Expected a handle for this cbuffer global resource to be created "
124+
"before loading a value from it");
125+
const DataLayout &DL = getDataLayout();
126+
127+
size_t TargetOffset = MemberOffset + Offset;
128+
CBufferRowIntrin Intrin(DL, Ty->getScalarType());
129+
// The cbuffer consists of some number of 16-byte rows.
130+
unsigned int CurrentRow = TargetOffset / hlsl::CBufferRowSizeInBytes;
131+
unsigned int CurrentIndex =
132+
(TargetOffset % hlsl::CBufferRowSizeInBytes) / Intrin.EltSize;
133+
134+
auto *CBufLoad = Builder.CreateIntrinsic(
135+
Intrin.RetTy, Intrin.IID,
136+
{Handle, ConstantInt::get(Builder.getInt32Ty(), CurrentRow)}, nullptr,
137+
Name + ".load");
138+
auto *Elt = Builder.CreateExtractValue(CBufLoad, {CurrentIndex++},
139+
Name + ".extract");
140+
141+
Value *Result = nullptr;
142+
unsigned int Remaining =
143+
((DL.getTypeSizeInBits(Ty) / 8) / Intrin.EltSize) - 1;
144+
145+
if (Remaining == 0) {
146+
// We only have a single element, so we're done.
147+
Result = Elt;
148+
149+
// However, if we loaded a <1 x T>, then we need to adjust the type here.
150+
if (auto *VT = dyn_cast<FixedVectorType>(Ty)) {
151+
assert(VT->getNumElements() == 1 &&
152+
"Can't have multiple elements here");
153+
Result = Builder.CreateInsertElement(PoisonValue::get(VT), Result,
154+
Builder.getInt32(0), Name);
155+
}
156+
return Result;
120157
}
121-
} else {
158+
122159
// Walk each element and extract it, wrapping to new rows as needed.
123160
SmallVector<Value *> Extracts{Elt};
124161
while (Remaining--) {
@@ -128,40 +165,138 @@ static void replaceAccess(LoadInst *LI, GlobalVariable *Global,
128165
CBufLoad = Builder.CreateIntrinsic(
129166
Intrin.RetTy, Intrin.IID,
130167
{Handle, ConstantInt::get(Builder.getInt32Ty(), ++CurrentRow)},
131-
nullptr, LI->getName());
168+
nullptr, Name + ".load");
132169

133170
Extracts.push_back(Builder.CreateExtractValue(CBufLoad, {CurrentIndex++},
134-
LI->getName()));
171+
Name + ".extract"));
135172
}
136173

137174
// Finally, we build up the original loaded value.
138175
Result = PoisonValue::get(Ty);
139176
for (int I = 0, E = Extracts.size(); I < E; ++I)
140177
Result =
141-
Builder.CreateInsertElement(Result, Extracts[I], Builder.getInt32(I));
178+
Builder.CreateInsertElement(Result, Extracts[I], Builder.getInt32(I),
179+
Name + formatv(".upto{}", I));
180+
return Result;
142181
}
182+
};
143183

184+
} // namespace
185+
186+
/// Replace load via cbuffer global with a load from the cbuffer handle itself.
187+
static void replaceLoad(LoadInst *LI, CBufferResource &CBR,
188+
SmallVectorImpl<WeakTrackingVH> &DeadInsts) {
189+
size_t Offset = CBR.getOffsetForCBufferGEP(LI->getPointerOperand());
190+
IRBuilder<> Builder(LI);
191+
CBR.createAndSetCurrentHandle(Builder);
192+
Value *Result = CBR.loadValue(Builder, LI->getType(), Offset, LI->getName());
144193
LI->replaceAllUsesWith(Result);
145194
DeadInsts.push_back(LI);
146195
}
147196

148-
static void replaceAccessesWithHandle(GlobalVariable *Global,
149-
GlobalVariable *HandleGV,
150-
size_t BaseOffset) {
197+
/// This function recursively copies N array elements from the cbuffer resource
198+
/// CBR to the MemCpy Destination. Recursion is used to unravel multidimensional
199+
/// arrays into a sequence of scalar/vector extracts and stores.
200+
static void copyArrayElemsForMemCpy(IRBuilder<> &Builder, MemCpyInst *MCI,
201+
CBufferResource &CBR, ArrayType *ArrTy,
202+
size_t ArrOffset, size_t N,
203+
const Twine &Name = "") {
204+
const DataLayout &DL = MCI->getDataLayout();
205+
Type *ElemTy = ArrTy->getElementType();
206+
size_t ElemTySize = DL.getTypeAllocSize(ElemTy);
207+
for (unsigned I = 0; I < N; ++I) {
208+
size_t Offset = ArrOffset + I * ElemTySize;
209+
210+
// Recursively copy nested arrays
211+
if (ArrayType *ElemArrTy = dyn_cast<ArrayType>(ElemTy)) {
212+
copyArrayElemsForMemCpy(Builder, MCI, CBR, ElemArrTy, Offset,
213+
ElemArrTy->getNumElements(), Name);
214+
continue;
215+
}
216+
217+
// Load CBuffer value and store it in Dest
218+
APInt CBufArrayOffset(
219+
DL.getIndexTypeSizeInBits(MCI->getSource()->getType()), Offset);
220+
CBufArrayOffset =
221+
hlsl::translateCBufArrayOffset(DL, CBufArrayOffset, ArrTy);
222+
Value *CBufferVal =
223+
CBR.loadValue(Builder, ElemTy, CBufArrayOffset.getZExtValue(), Name);
224+
Value *GEP =
225+
Builder.CreateInBoundsGEP(Builder.getInt8Ty(), MCI->getDest(),
226+
{Builder.getInt32(Offset)}, Name + ".dest");
227+
Builder.CreateStore(CBufferVal, GEP, MCI->isVolatile());
228+
}
229+
}
230+
231+
/// Replace memcpy from a cbuffer global with a memcpy from the cbuffer handle
232+
/// itself. Assumes the cbuffer global is an array, and the length of bytes to
233+
/// copy is divisible by array element allocation size.
234+
/// The memcpy source must also be a direct cbuffer global reference, not a GEP.
235+
static void replaceMemCpy(MemCpyInst *MCI, CBufferResource &CBR) {
236+
237+
ArrayType *ArrTy = dyn_cast<ArrayType>(CBR.getValueType());
238+
assert(ArrTy && "MemCpy lowering is only supported for array types");
239+
240+
// This assumption vastly simplifies the implementation
241+
if (MCI->getSource() != CBR.Member)
242+
reportFatalUsageError(
243+
"Expected MemCpy source to be a cbuffer global variable");
244+
245+
ConstantInt *Length = dyn_cast<ConstantInt>(MCI->getLength());
246+
uint64_t ByteLength = Length->getZExtValue();
247+
248+
// If length to copy is zero, no memcpy is needed
249+
if (ByteLength == 0) {
250+
MCI->eraseFromParent();
251+
return;
252+
}
253+
254+
const DataLayout &DL = CBR.getDataLayout();
255+
256+
Type *ElemTy = ArrTy->getElementType();
257+
size_t ElemSize = DL.getTypeAllocSize(ElemTy);
258+
assert(ByteLength % ElemSize == 0 &&
259+
"Length of bytes to MemCpy must be divisible by allocation size of "
260+
"source/destination array elements");
261+
size_t ElemsToCpy = ByteLength / ElemSize;
262+
263+
IRBuilder<> Builder(MCI);
264+
CBR.createAndSetCurrentHandle(Builder);
265+
266+
copyArrayElemsForMemCpy(Builder, MCI, CBR, ArrTy, 0, ElemsToCpy,
267+
"memcpy." + MCI->getDest()->getName() + "." +
268+
MCI->getSource()->getName());
269+
270+
MCI->eraseFromParent();
271+
}
272+
273+
static void replaceAccessesWithHandle(CBufferResource &CBR) {
151274
SmallVector<WeakTrackingVH> DeadInsts;
152275

153-
SmallVector<User *> ToProcess{Global->users()};
276+
SmallVector<User *> ToProcess{CBR.users()};
154277
while (!ToProcess.empty()) {
155278
User *Cur = ToProcess.pop_back_val();
156279

157280
// If we have a load instruction, replace the access.
158281
if (auto *LI = dyn_cast<LoadInst>(Cur)) {
159-
replaceAccess(LI, Global, HandleGV, BaseOffset, DeadInsts);
282+
replaceLoad(LI, CBR, DeadInsts);
283+
continue;
284+
}
285+
286+
// If we have a memcpy instruction, replace it with multiple accesses and
287+
// subsequent stores to the destination
288+
if (auto *MCI = dyn_cast<MemCpyInst>(Cur)) {
289+
replaceMemCpy(MCI, CBR);
160290
continue;
161291
}
162292

163293
// Otherwise, walk users looking for a load...
164-
ToProcess.append(Cur->user_begin(), Cur->user_end());
294+
if (isa<GetElementPtrInst>(Cur) || isa<GEPOperator>(Cur)) {
295+
ToProcess.append(Cur->user_begin(), Cur->user_end());
296+
continue;
297+
}
298+
299+
llvm_unreachable("Unexpected user of Global");
165300
}
166301
RecursivelyDeleteTriviallyDeadInstructions(DeadInsts);
167302
}
@@ -173,7 +308,8 @@ static bool replaceCBufferAccesses(Module &M) {
173308

174309
for (const hlsl::CBufferMapping &Mapping : *CBufMD)
175310
for (const hlsl::CBufferMember &Member : Mapping.Members) {
176-
replaceAccessesWithHandle(Member.GV, Mapping.Handle, Member.Offset);
311+
CBufferResource CBR(Mapping.Handle, Member.GV, Member.Offset);
312+
replaceAccessesWithHandle(CBR);
177313
Member.GV->removeFromParent();
178314
}
179315

0 commit comments

Comments
 (0)