Skip to content

[DirectX] Lower @llvm.dx.typedBufferStore to DXIL ops #104253

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
57 changes: 53 additions & 4 deletions llvm/docs/DirectX/DXILResources.rst
Original file line number Diff line number Diff line change
Expand Up @@ -361,11 +361,60 @@ Examples:
- ``i32``
- Index into the buffer

Texture and Typed Buffer Stores
-------------------------------

*relevant types: Textures and TypedBuffer*

The `TextureStore`_ and `BufferStore`_ DXIL operations always write all four
32-bit components to a texture or a typed buffer. While both operations include
a mask parameter, it is specified that the mask must cover all components when
used with these types.

The store operations that we define as intrinsics behave similarly, and will
only accept writes to the whole of the contained type. This differs from the
loads above, but it makes sense from a semantics-preserving point of view.
Thus, texture and buffer stores may only operate on 4-element vectors of types
that are 32 bits or fewer, such as ``<4 x i32>``, ``<4 x float>``, and
``<4 x half>``, and 2-element vectors of 64-bit types like ``<2 x double>`` and
``<2 x i64>``.

.. _BufferStore: https://github.com/microsoft/DirectXShaderCompiler/blob/main/docs/DXIL.rst#bufferstore
.. _TextureStore: https://github.com/microsoft/DirectXShaderCompiler/blob/main/docs/DXIL.rst#texturestore

Examples:

.. code-block:: llvm
.. list-table:: ``@llvm.dx.typedBufferStore``
:header-rows: 1

%ret = call {<4 x float>, i1}
@llvm.dx.typedBufferLoad.checkbit.v4f32.tdx.TypedBuffer_v4f32_0_0_0t(
target("dx.TypedBuffer", <4 x float>, 0, 0, 0) %buffer, i32 %index)
* - Argument
-
- Type
- Description
* - Return value
-
- ``void``
-
* - ``%buffer``
- 0
- ``target(dx.TypedBuffer, ...)``
- The buffer to store into
* - ``%index``
- 1
- ``i32``
- Index into the buffer
* - ``%data``
- 2
- A 4- or 2-element vector of the type of the buffer
- The data to store

Examples:

.. code-block:: llvm

call void @llvm.dx.typedBufferStore.tdx.Buffer_v4f32_1_0_0t(
target("dx.TypedBuffer", <4 x float>, 1, 0, 0) %buf, i32 %index, <4 x float> %data)
call void @llvm.dx.typedBufferStore.tdx.Buffer_v4f16_1_0_0t(
target("dx.TypedBuffer", <4 x half>, 1, 0, 0) %buf, i32 %index, <4 x half> %data)
call void @llvm.dx.typedBufferStore.tdx.Buffer_v2f64_1_0_0t(
target("dx.TypedBuffer", <2 x double>, 1, 0, 0) %buf, i32 %index, <2 x double> %data)
2 changes: 2 additions & 0 deletions llvm/include/llvm/IR/IntrinsicsDirectX.td
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,8 @@ def int_dx_handle_fromBinding

def int_dx_typedBufferLoad
: DefaultAttrsIntrinsic<[llvm_any_ty], [llvm_any_ty, llvm_i32_ty]>;
// Store to a typed buffer. Operands are the buffer handle (overloaded type),
// an i32 index into the buffer, and the data to store (overloaded vector
// type). The intrinsic produces no result.
def int_dx_typedBufferStore
: DefaultAttrsIntrinsic<[], [llvm_any_ty, llvm_i32_ty, llvm_anyvector_ty]>;

// Cast between target extension handle types and dxil-style opaque handles
def int_dx_cast_handle : Intrinsic<[llvm_any_ty], [llvm_any_ty]>;
Expand Down
12 changes: 12 additions & 0 deletions llvm/lib/Target/DirectX/DXIL.td
Original file line number Diff line number Diff line change
Expand Up @@ -707,6 +707,18 @@ def BufferLoad : DXILOp<68, bufferLoad> {
let stages = [Stages<DXIL1_0, [all_stages]>];
}

// DXIL operation 69 (bufferStore). No LLVMIntrinsic is set on this record;
// the call is built explicitly by lowerTypedBufferStore in
// DXILOpLowering.cpp.
def BufferStore : DXILOp<69, bufferStore> {
let Doc = "writes to an RWTypedBuffer";
// Handle, Coord0, Coord1, Val0, Val1, Val2, Val3, Mask
// The four values are passed as separate scalars of the overload type, and
// the mask is an i8.
let arguments = [
HandleTy, Int32Ty, Int32Ty, OverloadTy, OverloadTy, OverloadTy, OverloadTy,
Int8Ty
];
let result = VoidTy;
// Only 16- and 32-bit scalar types are listed as overloads.
let overloads = [Overloads<DXIL1_0, [HalfTy, FloatTy, Int16Ty, Int32Ty]>];
let stages = [Stages<DXIL1_0, [all_stages]>];
}

def ThreadId : DXILOp<93, threadId> {
let Doc = "Reads the thread ID";
let LLVMIntrinsic = int_dx_thread_id;
Expand Down
88 changes: 69 additions & 19 deletions llvm/lib/Target/DirectX/DXILOpLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -82,8 +82,11 @@ class OpLowerer {
public:
OpLowerer(Module &M, DXILResourceMap &DRM) : M(M), OpBuilder(M), DRM(DRM) {}

void replaceFunction(Function &F,
llvm::function_ref<Error(CallInst *CI)> ReplaceCall) {
/// Replace every call to \c F using \c ReplaceCall, and then erase \c F. If
/// there is an error replacing a call, we emit a diagnostic and return true.
[[nodiscard]] bool
replaceFunction(Function &F,
llvm::function_ref<Error(CallInst *CI)> ReplaceCall) {
for (User *U : make_early_inc_range(F.users())) {
CallInst *CI = dyn_cast<CallInst>(U);
if (!CI)
Expand All @@ -94,16 +97,18 @@ class OpLowerer {
DiagnosticInfoUnsupported Diag(*CI->getFunction(), Message,
CI->getDebugLoc());
M.getContext().diagnose(Diag);
continue;
return true;
}
}
if (F.user_empty())
F.eraseFromParent();
return false;
}

void replaceFunctionWithOp(Function &F, dxil::OpCode DXILOp) {
[[nodiscard]]
bool replaceFunctionWithOp(Function &F, dxil::OpCode DXILOp) {
bool IsVectorArgExpansion = isVectorArgExpansion(F);
replaceFunction(F, [&](CallInst *CI) -> Error {
return replaceFunction(F, [&](CallInst *CI) -> Error {
SmallVector<Value *> Args;
OpBuilder.getIRB().SetInsertPoint(CI);
if (IsVectorArgExpansion) {
Expand Down Expand Up @@ -175,12 +180,12 @@ class OpLowerer {
CleanupCasts.clear();
}

void lowerToCreateHandle(Function &F) {
[[nodiscard]] bool lowerToCreateHandle(Function &F) {
IRBuilder<> &IRB = OpBuilder.getIRB();
Type *Int8Ty = IRB.getInt8Ty();
Type *Int32Ty = IRB.getInt32Ty();

replaceFunction(F, [&](CallInst *CI) -> Error {
return replaceFunction(F, [&](CallInst *CI) -> Error {
IRB.SetInsertPoint(CI);

auto *It = DRM.find(CI);
Expand All @@ -205,10 +210,10 @@ class OpLowerer {
});
}

void lowerToBindAndAnnotateHandle(Function &F) {
[[nodiscard]] bool lowerToBindAndAnnotateHandle(Function &F) {
IRBuilder<> &IRB = OpBuilder.getIRB();

replaceFunction(F, [&](CallInst *CI) -> Error {
return replaceFunction(F, [&](CallInst *CI) -> Error {
IRB.SetInsertPoint(CI);

auto *It = DRM.find(CI);
Expand Down Expand Up @@ -251,12 +256,11 @@ class OpLowerer {

/// Lower `dx.handle.fromBinding` intrinsics depending on the shader model and
/// taking into account binding information from DXILResourceAnalysis.
void lowerHandleFromBinding(Function &F) {
bool lowerHandleFromBinding(Function &F) {
Triple TT(Triple(M.getTargetTriple()));
if (TT.getDXILVersion() < VersionTuple(1, 6))
lowerToCreateHandle(F);
else
lowerToBindAndAnnotateHandle(F);
return lowerToCreateHandle(F);
return lowerToBindAndAnnotateHandle(F);
}

/// Replace uses of \c Intrin with the values in the `dx.ResRet` of \c Op.
Expand Down Expand Up @@ -342,11 +346,11 @@ class OpLowerer {
return Error::success();
}

void lowerTypedBufferLoad(Function &F) {
[[nodiscard]] bool lowerTypedBufferLoad(Function &F) {
IRBuilder<> &IRB = OpBuilder.getIRB();
Type *Int32Ty = IRB.getInt32Ty();

replaceFunction(F, [&](CallInst *CI) -> Error {
return replaceFunction(F, [&](CallInst *CI) -> Error {
IRB.SetInsertPoint(CI);

Value *Handle =
Expand All @@ -368,8 +372,51 @@ class OpLowerer {
});
}

/// Lower each call to the `dx.typedBufferStore` intrinsic \c F to a DXIL
/// `BufferStore` op. Returns the result of \c replaceFunction, i.e. true if a
/// call could not be replaced.
///
/// The data operand must be a fixed vector of exactly four elements. Each
/// element is extracted and passed to the op as its own scalar argument,
/// followed by a write mask covering all four components.
[[nodiscard]] bool lowerTypedBufferStore(Function &F) {
  IRBuilder<> &IRB = OpBuilder.getIRB();
  Type *Int8Ty = IRB.getInt8Ty();
  Type *Int32Ty = IRB.getInt32Ty();

  return replaceFunction(F, [&](CallInst *CI) -> Error {
    // Validate the data operand before emitting any IR, so that a rejected
    // call is left untouched rather than gaining a temporary handle cast
    // that nothing will consume.
    Value *Data = CI->getArgOperand(2);
    auto *DataTy = dyn_cast<FixedVectorType>(Data->getType());
    if (!DataTy || DataTy->getNumElements() != 4)
      return make_error<StringError>(
          "typedBufferStore data must be a vector of 4 elements",
          inconvertibleErrorCode());

    IRB.SetInsertPoint(CI);

    Value *Handle =
        createTmpHandleCast(CI->getArgOperand(0), OpBuilder.getHandleType());
    Value *Index0 = CI->getArgOperand(1);
    // Only one index is supplied by the intrinsic; the second coordinate of
    // the DXIL op is passed as undef.
    Value *Index1 = UndefValue::get(Int32Ty);
    // For typed stores, the mask must always cover all four elements.
    Constant *Mask = ConstantInt::get(Int8Ty, 0xF);

    // Extract one lane of the data vector using an i32 lane index.
    auto ExtractLane = [&](unsigned Lane) -> Value * {
      return IRB.CreateExtractElement(Data, ConstantInt::get(Int32Ty, Lane));
    };

    // Handle, Coord0, Coord1, Val0, Val1, Val2, Val3, Mask. Braced
    // initialization evaluates left to right, so the extracts are emitted in
    // lane order.
    std::array<Value *, 8> Args{Handle,         Index0,         Index1,
                                ExtractLane(0), ExtractLane(1), ExtractLane(2),
                                ExtractLane(3), Mask};
    Expected<CallInst *> OpCall =
        OpBuilder.tryCreateOp(OpCode::BufferStore, Args);
    if (Error E = OpCall.takeError())
      return E;

    CI->eraseFromParent();
    return Error::success();
  });
}

bool lowerIntrinsics() {
bool Updated = false;
bool HasErrors = false;

for (Function &F : make_early_inc_range(M.functions())) {
if (!F.isDeclaration())
Expand All @@ -380,19 +427,22 @@ class OpLowerer {
continue;
#define DXIL_OP_INTRINSIC(OpCode, Intrin) \
case Intrin: \
replaceFunctionWithOp(F, OpCode); \
HasErrors |= replaceFunctionWithOp(F, OpCode); \
break;
#include "DXILOperation.inc"
case Intrinsic::dx_handle_fromBinding:
lowerHandleFromBinding(F);
HasErrors |= lowerHandleFromBinding(F);
break;
case Intrinsic::dx_typedBufferLoad:
lowerTypedBufferLoad(F);
HasErrors |= lowerTypedBufferLoad(F);
break;
case Intrinsic::dx_typedBufferStore:
HasErrors |= lowerTypedBufferStore(F);
break;
}
Updated = true;
}
if (Updated)
if (Updated && !HasErrors)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So dx_typedBufferLoad cannot fail?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It can fail in the same ways any of the op lowering can fail (like say if we just give it completely incorrect types) but I think it happens to always fail in such a way that the cleanup wouldn't crash. The HasErrors check is added here so that we can carry on and finish the pass and just let LLVM's error handling propagate the error afterwards, because when lowerTypedBufferStore fails it can leave the temporary casts in a bad state.

All that said, we can and should add some tests for cases where lowering loads fails. I'll do that in a follow up change.

cleanupHandleCasts();

return Updated;
Expand Down
37 changes: 37 additions & 0 deletions llvm/test/CodeGen/DirectX/BufferStore-errors.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
; We use llc for this test so that we don't abort after the first error.
; RUN: not llc %s -o /dev/null 2>&1 | FileCheck %s

target triple = "dxil-pc-shadermodel6.6-compute"

; A 5-element data vector must be rejected with a diagnostic that names the
; offending function.
; CHECK: error:
; CHECK-SAME: in function storetoomany
; CHECK-SAME: typedBufferStore data must be a vector of 4 elements
define void @storetoomany(<5 x float> %data, i32 %index) {
%buffer = call target("dx.TypedBuffer", <4 x float>, 1, 0, 0)
@llvm.dx.handle.fromBinding.tdx.TypedBuffer_v4f32_1_0_0(
i32 0, i32 0, i32 1, i32 0, i1 false)

call void @llvm.dx.typedBufferStore.tdx.TypedBuffer_v4f32_1_0_0t.v5f32(
target("dx.TypedBuffer", <4 x float>, 1, 0, 0) %buffer,
i32 %index, <5 x float> %data)

ret void
}

; A 3-element data vector must be rejected with the same diagnostic.
; CHECK: error:
; CHECK-SAME: in function storetoofew
; CHECK-SAME: typedBufferStore data must be a vector of 4 elements
define void @storetoofew(<3 x i32> %data, i32 %index) {
%buffer = call target("dx.TypedBuffer", <4 x i32>, 1, 0, 0)
@llvm.dx.handle.fromBinding.tdx.TypedBuffer_v4i32_1_0_0(
i32 0, i32 0, i32 1, i32 0, i1 false)

call void @llvm.dx.typedBufferStore.tdx.TypedBuffer_v4i32_1_0_0t.v3i32(
target("dx.TypedBuffer", <4 x i32>, 1, 0, 0) %buffer,
i32 %index, <3 x i32> %data)

ret void
}

declare void @llvm.dx.typedBufferStore.tdx.TypedBuffer_v4f32_1_0_0t.v5f32(target("dx.TypedBuffer", <4 x float>, 1, 0, 0), i32, <5 x float>)
declare void @llvm.dx.typedBufferStore.tdx.TypedBuffer_v4i32_1_0_0t.v3i32(target("dx.TypedBuffer", <4 x i32>, 1, 0, 0), i32, <3 x i32>)
92 changes: 92 additions & 0 deletions llvm/test/CodeGen/DirectX/BufferStore.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
; RUN: opt -S -dxil-op-lower %s | FileCheck %s

target triple = "dxil-pc-shadermodel6.6-compute"

; A <4 x float> store is split into four extractelements that feed
; dx.op.bufferStore.f32 (opcode 69) with an undef second coordinate and a
; write mask of 15.
define void @storefloat(<4 x float> %data, i32 %index) {

; CHECK: [[BIND:%.*]] = call %dx.types.Handle @dx.op.createHandleFromBinding
; CHECK: [[HANDLE:%.*]] = call %dx.types.Handle @dx.op.annotateHandle(i32 217, %dx.types.Handle [[BIND]]
%buffer = call target("dx.TypedBuffer", <4 x float>, 1, 0, 0)
@llvm.dx.handle.fromBinding.tdx.TypedBuffer_v4f32_1_0_0(
i32 0, i32 0, i32 1, i32 0, i1 false)

; The temporary casts should all have been cleaned up
; CHECK-NOT: %dx.cast_handle

; CHECK: [[DATA0_0:%.*]] = extractelement <4 x float> %data, i32 0
; CHECK: [[DATA0_1:%.*]] = extractelement <4 x float> %data, i32 1
; CHECK: [[DATA0_2:%.*]] = extractelement <4 x float> %data, i32 2
; CHECK: [[DATA0_3:%.*]] = extractelement <4 x float> %data, i32 3
; CHECK: call void @dx.op.bufferStore.f32(i32 69, %dx.types.Handle [[HANDLE]], i32 %index, i32 undef, float [[DATA0_0]], float [[DATA0_1]], float [[DATA0_2]], float [[DATA0_3]], i8 15)
call void @llvm.dx.typedBufferStore(
target("dx.TypedBuffer", <4 x float>, 1, 0, 0) %buffer,
i32 %index, <4 x float> %data)

ret void
}

; A <4 x i32> store is split into four extractelements that feed
; dx.op.bufferStore.i32 (opcode 69) with an undef second coordinate and a
; write mask of 15.
define void @storeint(<4 x i32> %data, i32 %index) {

; CHECK: [[BIND:%.*]] = call %dx.types.Handle @dx.op.createHandleFromBinding
; CHECK: [[HANDLE:%.*]] = call %dx.types.Handle @dx.op.annotateHandle(i32 217, %dx.types.Handle [[BIND]]
%buffer = call target("dx.TypedBuffer", <4 x i32>, 1, 0, 0)
@llvm.dx.handle.fromBinding.tdx.TypedBuffer_v4i32_1_0_0(
i32 0, i32 0, i32 1, i32 0, i1 false)

; The temporary casts should all have been cleaned up
; CHECK-NOT: %dx.cast_handle

; CHECK: [[DATA0_0:%.*]] = extractelement <4 x i32> %data, i32 0
; CHECK: [[DATA0_1:%.*]] = extractelement <4 x i32> %data, i32 1
; CHECK: [[DATA0_2:%.*]] = extractelement <4 x i32> %data, i32 2
; CHECK: [[DATA0_3:%.*]] = extractelement <4 x i32> %data, i32 3
; CHECK: call void @dx.op.bufferStore.i32(i32 69, %dx.types.Handle [[HANDLE]], i32 %index, i32 undef, i32 [[DATA0_0]], i32 [[DATA0_1]], i32 [[DATA0_2]], i32 [[DATA0_3]], i8 15)
call void @llvm.dx.typedBufferStore(
target("dx.TypedBuffer", <4 x i32>, 1, 0, 0) %buffer,
i32 %index, <4 x i32> %data)

ret void
}

; A <4 x half> store is split into four extractelements that feed
; dx.op.bufferStore.f16 (opcode 69) with an undef second coordinate and a
; write mask of 15.
define void @storehalf(<4 x half> %data, i32 %index) {

; CHECK: [[BIND:%.*]] = call %dx.types.Handle @dx.op.createHandleFromBinding
; CHECK: [[HANDLE:%.*]] = call %dx.types.Handle @dx.op.annotateHandle(i32 217, %dx.types.Handle [[BIND]]
%buffer = call target("dx.TypedBuffer", <4 x half>, 1, 0, 0)
@llvm.dx.handle.fromBinding.tdx.TypedBuffer_v4f16_1_0_0(
i32 0, i32 0, i32 1, i32 0, i1 false)

; The temporary casts should all have been cleaned up
; CHECK-NOT: %dx.cast_handle

; CHECK: [[DATA0_0:%.*]] = extractelement <4 x half> %data, i32 0
; CHECK: [[DATA0_1:%.*]] = extractelement <4 x half> %data, i32 1
; CHECK: [[DATA0_2:%.*]] = extractelement <4 x half> %data, i32 2
; CHECK: [[DATA0_3:%.*]] = extractelement <4 x half> %data, i32 3
; CHECK: call void @dx.op.bufferStore.f16(i32 69, %dx.types.Handle [[HANDLE]], i32 %index, i32 undef, half [[DATA0_0]], half [[DATA0_1]], half [[DATA0_2]], half [[DATA0_3]], i8 15)
call void @llvm.dx.typedBufferStore(
target("dx.TypedBuffer", <4 x half>, 1, 0, 0) %buffer,
i32 %index, <4 x half> %data)

ret void
}

; A <4 x i16> store is split into four extractelements that feed
; dx.op.bufferStore.i16 (opcode 69) with an undef second coordinate and a
; write mask of 15.
define void @storei16(<4 x i16> %data, i32 %index) {

; CHECK: [[BIND:%.*]] = call %dx.types.Handle @dx.op.createHandleFromBinding
; CHECK: [[HANDLE:%.*]] = call %dx.types.Handle @dx.op.annotateHandle(i32 217, %dx.types.Handle [[BIND]]
%buffer = call target("dx.TypedBuffer", <4 x i16>, 1, 0, 0)
@llvm.dx.handle.fromBinding.tdx.TypedBuffer_v4i16_1_0_0(
i32 0, i32 0, i32 1, i32 0, i1 false)

; The temporary casts should all have been cleaned up
; CHECK-NOT: %dx.cast_handle

; CHECK: [[DATA0_0:%.*]] = extractelement <4 x i16> %data, i32 0
; CHECK: [[DATA0_1:%.*]] = extractelement <4 x i16> %data, i32 1
; CHECK: [[DATA0_2:%.*]] = extractelement <4 x i16> %data, i32 2
; CHECK: [[DATA0_3:%.*]] = extractelement <4 x i16> %data, i32 3
; CHECK: call void @dx.op.bufferStore.i16(i32 69, %dx.types.Handle [[HANDLE]], i32 %index, i32 undef, i16 [[DATA0_0]], i16 [[DATA0_1]], i16 [[DATA0_2]], i16 [[DATA0_3]], i8 15)
call void @llvm.dx.typedBufferStore(
target("dx.TypedBuffer", <4 x i16>, 1, 0, 0) %buffer,
i32 %index, <4 x i16> %data)

ret void
}
Loading