llvm · bogner · Sep 10, 2024 · Aug 14, 2024 · Aug 14, 2024 · Aug 15, 2024
diff --git a/llvm/docs/DirectX/DXILResources.rst b/llvm/docs/DirectX/DXILResources.rst
@@ -361,11 +361,60 @@ Examples:
      - ``i32``
      - Index into the buffer
 
+Texture and Typed Buffer Stores
+-------------------------------
+
+*relevant types: Textures and TypedBuffer*
+
+The `TextureStore`_ and `BufferStore`_ DXIL operations always write all four
+32-bit components to a texture or a typed buffer. While both operations include
+a mask parameter, it is specified that the mask must cover all components when
+used with these types.
+
+The store operations that we define as intrinsics behave similarly, and will
+only accept writes to the whole of the contained type. This differs from the
+loads above, but it makes sense in order to preserve the semantics of the
+underlying DXIL operations. Thus, texture and buffer stores may only operate on
+4-element vectors of types that are 32 bits or fewer, such as ``<4 x i32>``,
+``<4 x float>``, and ``<4 x half>``, and on 2-element vectors of 64-bit types
+like ``<2 x double>`` and ``<2 x i64>``.
+
+.. _BufferStore: https://github.com/microsoft/DirectXShaderCompiler/blob/main/docs/DXIL.rst#bufferstore
+.. _TextureStore: https://github.com/microsoft/DirectXShaderCompiler/blob/main/docs/DXIL.rst#texturestore
+
 Examples:
 
-.. code-block:: llvm
+.. list-table:: ``@llvm.dx.typedBufferStore``
+   :header-rows: 1
 
-   %ret = call {<4 x float>, i1}
-       @llvm.dx.typedBufferLoad.checkbit.v4f32.tdx.TypedBuffer_v4f32_0_0_0t(
-           target("dx.TypedBuffer", <4 x float>, 0, 0, 0) %buffer, i32 %index)
+   * - Argument
+     -
+     - Type
+     - Description
+   * - Return value
+     -
+     - ``void``
+     -
+   * - ``%buffer``
+     - 0
+     - ``target(dx.TypedBuffer, ...)``
+     - The buffer to store into
+   * - ``%index``
+     - 1
+     - ``i32``
+     - Index into the buffer
+   * - ``%data``
+     - 2
+     - A 4- or 2-element vector of the type of the buffer
+     - The data to store
+
+Examples:
+
+.. code-block:: llvm
 
+   call void @llvm.dx.typedBufferStore.tdx.TypedBuffer_v4f32_1_0_0t.v4f32(
+       target("dx.TypedBuffer", <4 x float>, 1, 0, 0) %buf, i32 %index, <4 x float> %data)
+   call void @llvm.dx.typedBufferStore.tdx.TypedBuffer_v4f16_1_0_0t.v4f16(
+       target("dx.TypedBuffer", <4 x half>, 1, 0, 0) %buf, i32 %index, <4 x half> %data)
+   call void @llvm.dx.typedBufferStore.tdx.TypedBuffer_v2f64_1_0_0t.v2f64(
+       target("dx.TypedBuffer", <2 x double>, 1, 0, 0) %buf, i32 %index, <2 x double> %data)
diff --git a/llvm/include/llvm/IR/IntrinsicsDirectX.td b/llvm/include/llvm/IR/IntrinsicsDirectX.td
@@ -32,6 +32,8 @@ def int_dx_handle_fromBinding
 
 def int_dx_typedBufferLoad
     : DefaultAttrsIntrinsic<[llvm_any_ty], [llvm_any_ty, llvm_i32_ty]>;
+def int_dx_typedBufferStore
+    : DefaultAttrsIntrinsic<[], [llvm_any_ty, llvm_i32_ty, llvm_anyvector_ty]>;
 
 // Cast between target extension handle types and dxil-style opaque handles
 def int_dx_cast_handle : Intrinsic<[llvm_any_ty], [llvm_any_ty]>;

diff --git a/llvm/lib/Target/DirectX/DXIL.td b/llvm/lib/Target/DirectX/DXIL.td
@@ -707,6 +707,18 @@ def BufferLoad : DXILOp<68, bufferLoad> {
   let stages = [Stages<DXIL1_0, [all_stages]>];
 }
 
+def BufferStore : DXILOp<69, bufferStore> {
+  let Doc = "writes to an RWTypedBuffer";
+  // Handle, Coord0, Coord1, Val0, Val1, Val2, Val3, Mask
+  let arguments = [
+    HandleTy, Int32Ty, Int32Ty, OverloadTy, OverloadTy, OverloadTy, OverloadTy,
+    Int8Ty
+  ];
+  let result = VoidTy;
+  let overloads = [Overloads<DXIL1_0, [HalfTy, FloatTy, Int16Ty, Int32Ty]>];
+  let stages = [Stages<DXIL1_0, [all_stages]>];
+}
+
 def ThreadId :  DXILOp<93, threadId> {
   let Doc = "Reads the thread ID";
   let LLVMIntrinsic = int_dx_thread_id;

diff --git a/llvm/lib/Target/DirectX/DXILOpLowering.cpp b/llvm/lib/Target/DirectX/DXILOpLowering.cpp
@@ -82,8 +82,11 @@ class OpLowerer {
 public:
   OpLowerer(Module &M, DXILResourceMap &DRM) : M(M), OpBuilder(M), DRM(DRM) {}
 
-  void replaceFunction(Function &F,
-                       llvm::function_ref<Error(CallInst *CI)> ReplaceCall) {
+  /// Replace every call to \c F using \c ReplaceCall, then erase \c F if it has
+  /// no users left. On the first error, emit a diagnostic, stop and return true.
+  [[nodiscard]] bool
+  replaceFunction(Function &F,
+                  llvm::function_ref<Error(CallInst *CI)> ReplaceCall) {
     for (User *U : make_early_inc_range(F.users())) {
       CallInst *CI = dyn_cast<CallInst>(U);
       if (!CI)
@@ -94,16 +97,18 @@ class OpLowerer {
         DiagnosticInfoUnsupported Diag(*CI->getFunction(), Message,
                                        CI->getDebugLoc());
         M.getContext().diagnose(Diag);
-        continue;
+        return true;
       }
     }
     if (F.user_empty())
       F.eraseFromParent();
+    return false;
   }
 
-  void replaceFunctionWithOp(Function &F, dxil::OpCode DXILOp) {
+  [[nodiscard]]
+  bool replaceFunctionWithOp(Function &F, dxil::OpCode DXILOp) {
     bool IsVectorArgExpansion = isVectorArgExpansion(F);
-    replaceFunction(F, [&](CallInst *CI) -> Error {
+    return replaceFunction(F, [&](CallInst *CI) -> Error {
       SmallVector<Value *> Args;
       OpBuilder.getIRB().SetInsertPoint(CI);
       if (IsVectorArgExpansion) {
@@ -175,12 +180,12 @@ class OpLowerer {
     CleanupCasts.clear();
   }
 
-  void lowerToCreateHandle(Function &F) {
+  [[nodiscard]] bool lowerToCreateHandle(Function &F) {
     IRBuilder<> &IRB = OpBuilder.getIRB();
     Type *Int8Ty = IRB.getInt8Ty();
     Type *Int32Ty = IRB.getInt32Ty();
 
-    replaceFunction(F, [&](CallInst *CI) -> Error {
+    return replaceFunction(F, [&](CallInst *CI) -> Error {
       IRB.SetInsertPoint(CI);
 
       auto *It = DRM.find(CI);
@@ -205,10 +210,10 @@ class OpLowerer {
     });
   }
 
-  void lowerToBindAndAnnotateHandle(Function &F) {
+  [[nodiscard]] bool lowerToBindAndAnnotateHandle(Function &F) {
     IRBuilder<> &IRB = OpBuilder.getIRB();
 
-    replaceFunction(F, [&](CallInst *CI) -> Error {
+    return replaceFunction(F, [&](CallInst *CI) -> Error {
       IRB.SetInsertPoint(CI);
 
       auto *It = DRM.find(CI);
@@ -251,12 +256,11 @@ class OpLowerer {
 
   /// Lower `dx.handle.fromBinding` intrinsics depending on the shader model and
   /// taking into account binding information from DXILResourceAnalysis.
-  void lowerHandleFromBinding(Function &F) {
+  bool lowerHandleFromBinding(Function &F) {
     Triple TT(Triple(M.getTargetTriple()));
     if (TT.getDXILVersion() < VersionTuple(1, 6))
-      lowerToCreateHandle(F);
-    else
-      lowerToBindAndAnnotateHandle(F);
+      return lowerToCreateHandle(F);
+    return lowerToBindAndAnnotateHandle(F);
   }
 
   /// Replace uses of \c Intrin with the values in the `dx.ResRet` of \c Op.
@@ -342,11 +346,11 @@ class OpLowerer {
     return Error::success();
   }
 
-  void lowerTypedBufferLoad(Function &F) {
+  [[nodiscard]] bool lowerTypedBufferLoad(Function &F) {
     IRBuilder<> &IRB = OpBuilder.getIRB();
     Type *Int32Ty = IRB.getInt32Ty();
 
-    replaceFunction(F, [&](CallInst *CI) -> Error {
+    return replaceFunction(F, [&](CallInst *CI) -> Error {
       IRB.SetInsertPoint(CI);
 
       Value *Handle =
@@ -368,8 +372,51 @@ class OpLowerer {
     });
   }
 
+  [[nodiscard]] bool lowerTypedBufferStore(Function &F) {
+    IRBuilder<> &IRB = OpBuilder.getIRB();
+    Type *Int8Ty = IRB.getInt8Ty();
+    Type *Int32Ty = IRB.getInt32Ty();
+
+    return replaceFunction(F, [&](CallInst *CI) -> Error {
+      IRB.SetInsertPoint(CI);
+
+      Value *Handle =
+          createTmpHandleCast(CI->getArgOperand(0), OpBuilder.getHandleType());
+      Value *Index0 = CI->getArgOperand(1);
+      Value *Index1 = UndefValue::get(Int32Ty);
+      // For typed stores, the mask must always cover all four elements.
+      Constant *Mask = ConstantInt::get(Int8Ty, 0xF);
+
+      Value *Data = CI->getArgOperand(2);
+      auto *DataTy = dyn_cast<FixedVectorType>(Data->getType());
+      if (!DataTy || DataTy->getNumElements() != 4)
+        return make_error<StringError>(
+            "typedBufferStore data must be a vector of 4 elements",
+            inconvertibleErrorCode());
+      Value *Data0 =
+          IRB.CreateExtractElement(Data, ConstantInt::get(Int32Ty, 0));
+      Value *Data1 =
+          IRB.CreateExtractElement(Data, ConstantInt::get(Int32Ty, 1));
+      Value *Data2 =
+          IRB.CreateExtractElement(Data, ConstantInt::get(Int32Ty, 2));
+      Value *Data3 =
+          IRB.CreateExtractElement(Data, ConstantInt::get(Int32Ty, 3));
+
+      std::array<Value *, 8> Args{Handle, Index0, Index1, Data0,
+                                  Data1,  Data2,  Data3,  Mask};
+      Expected<CallInst *> OpCall =
+          OpBuilder.tryCreateOp(OpCode::BufferStore, Args);
+      if (Error E = OpCall.takeError())
+        return E;
+
+      CI->eraseFromParent();
+      return Error::success();
+    });
+  }
+
   bool lowerIntrinsics() {
     bool Updated = false;
+    bool HasErrors = false;
 
     for (Function &F : make_early_inc_range(M.functions())) {
       if (!F.isDeclaration())
@@ -380,19 +427,22 @@ class OpLowerer {
         continue;
 #define DXIL_OP_INTRINSIC(OpCode, Intrin)                                      \
   case Intrin:                                                                 \
-    replaceFunctionWithOp(F, OpCode);                                          \
+    HasErrors |= replaceFunctionWithOp(F, OpCode);                             \
     break;
 #include "DXILOperation.inc"
       case Intrinsic::dx_handle_fromBinding:
-        lowerHandleFromBinding(F);
+        HasErrors |= lowerHandleFromBinding(F);
         break;
       case Intrinsic::dx_typedBufferLoad:
-        lowerTypedBufferLoad(F);
+        HasErrors |= lowerTypedBufferLoad(F);
+        break;
+      case Intrinsic::dx_typedBufferStore:
+        HasErrors |= lowerTypedBufferStore(F);
         break;
       }
       Updated = true;
     }
-    if (Updated)
+    if (Updated && !HasErrors)
       cleanupHandleCasts();
 
     return Updated;

diff --git a/llvm/test/CodeGen/DirectX/BufferStore-errors.ll b/llvm/test/CodeGen/DirectX/BufferStore-errors.ll
@@ -0,0 +1,37 @@
+; We use llc for this test so that we don't abort after the first error.
+; RUN: not llc %s -o /dev/null 2>&1 | FileCheck %s
+
+target triple = "dxil-pc-shadermodel6.6-compute"
+
+; CHECK: error:
+; CHECK-SAME: in function storetoomany
+; CHECK-SAME: typedBufferStore data must be a vector of 4 elements
+define void @storetoomany(<5 x float> %data, i32 %index) {
+  %buffer = call target("dx.TypedBuffer", <4 x float>, 1, 0, 0)
+      @llvm.dx.handle.fromBinding.tdx.TypedBuffer_v4f32_1_0_0(
+          i32 0, i32 0, i32 1, i32 0, i1 false)
+
+  call void @llvm.dx.typedBufferStore.tdx.TypedBuffer_v4f32_1_0_0t.v5f32(
+      target("dx.TypedBuffer", <4 x float>, 1, 0, 0) %buffer,
+      i32 %index, <5 x float> %data)
+
+  ret void
+}
+
+; CHECK: error:
+; CHECK-SAME: in function storetoofew
+; CHECK-SAME: typedBufferStore data must be a vector of 4 elements
+define void @storetoofew(<3 x i32> %data, i32 %index) {
+  %buffer = call target("dx.TypedBuffer", <4 x i32>, 1, 0, 0)
+      @llvm.dx.handle.fromBinding.tdx.TypedBuffer_v4i32_1_0_0(
+          i32 0, i32 0, i32 1, i32 0, i1 false)
+
+  call void @llvm.dx.typedBufferStore.tdx.TypedBuffer_v4i32_1_0_0t.v3i32(
+      target("dx.TypedBuffer", <4 x i32>, 1, 0, 0) %buffer,
+      i32 %index, <3 x i32> %data)
+
+  ret void
+}
+
+declare void @llvm.dx.typedBufferStore.tdx.TypedBuffer_v4f32_1_0_0t.v5f32(target("dx.TypedBuffer", <4 x float>, 1, 0, 0), i32, <5 x float>)
+declare void @llvm.dx.typedBufferStore.tdx.TypedBuffer_v4i32_1_0_0t.v3i32(target("dx.TypedBuffer", <4 x i32>, 1, 0, 0), i32, <3 x i32>)
diff --git a/llvm/test/CodeGen/DirectX/BufferStore.ll b/llvm/test/CodeGen/DirectX/BufferStore.ll
@@ -0,0 +1,92 @@
+; RUN: opt -S -dxil-op-lower %s | FileCheck %s
+
+target triple = "dxil-pc-shadermodel6.6-compute"
+
+define void @storefloat(<4 x float> %data, i32 %index) {
+
+  ; CHECK: [[BIND:%.*]] = call %dx.types.Handle @dx.op.createHandleFromBinding
+  ; CHECK: [[HANDLE:%.*]] = call %dx.types.Handle @dx.op.annotateHandle(i32 217, %dx.types.Handle [[BIND]]
+  %buffer = call target("dx.TypedBuffer", <4 x float>, 1, 0, 0)
+      @llvm.dx.handle.fromBinding.tdx.TypedBuffer_v4f32_1_0_0(
+          i32 0, i32 0, i32 1, i32 0, i1 false)
+
+  ; The temporary casts should all have been cleaned up
+  ; CHECK-NOT: %dx.cast_handle
+
+  ; CHECK: [[DATA0_0:%.*]] = extractelement <4 x float> %data, i32 0
+  ; CHECK: [[DATA0_1:%.*]] = extractelement <4 x float> %data, i32 1
+  ; CHECK: [[DATA0_2:%.*]] = extractelement <4 x float> %data, i32 2
+  ; CHECK: [[DATA0_3:%.*]] = extractelement <4 x float> %data, i32 3
+  ; CHECK: call void @dx.op.bufferStore.f32(i32 69, %dx.types.Handle [[HANDLE]], i32 %index, i32 undef, float [[DATA0_0]], float [[DATA0_1]], float [[DATA0_2]], float [[DATA0_3]], i8 15)
+  call void @llvm.dx.typedBufferStore(
+      target("dx.TypedBuffer", <4 x float>, 1, 0, 0) %buffer,
+      i32 %index, <4 x float> %data)
+
+  ret void
+}
+
+define void @storeint(<4 x i32> %data, i32 %index) {
+
+  ; CHECK: [[BIND:%.*]] = call %dx.types.Handle @dx.op.createHandleFromBinding
+  ; CHECK: [[HANDLE:%.*]] = call %dx.types.Handle @dx.op.annotateHandle(i32 217, %dx.types.Handle [[BIND]]
+  %buffer = call target("dx.TypedBuffer", <4 x i32>, 1, 0, 0)
+      @llvm.dx.handle.fromBinding.tdx.TypedBuffer_v4i32_1_0_0(
+          i32 0, i32 0, i32 1, i32 0, i1 false)
+
+  ; CHECK: [[DATA0_0:%.*]] = extractelement <4 x i32> %data, i32 0
+  ; CHECK: [[DATA0_1:%.*]] = extractelement <4 x i32> %data, i32 1
+  ; CHECK: [[DATA0_2:%.*]] = extractelement <4 x i32> %data, i32 2
+  ; CHECK: [[DATA0_3:%.*]] = extractelement <4 x i32> %data, i32 3
+  ; CHECK: call void @dx.op.bufferStore.i32(i32 69, %dx.types.Handle [[HANDLE]], i32 %index, i32 undef, i32 [[DATA0_0]], i32 [[DATA0_1]], i32 [[DATA0_2]], i32 [[DATA0_3]], i8 15)
+  call void @llvm.dx.typedBufferStore(
+      target("dx.TypedBuffer", <4 x i32>, 1, 0, 0) %buffer,
+      i32 %index, <4 x i32> %data)
+
+  ret void
+}
+
+define void @storehalf(<4 x half> %data, i32 %index) {
+
+  ; CHECK: [[BIND:%.*]] = call %dx.types.Handle @dx.op.createHandleFromBinding
+  ; CHECK: [[HANDLE:%.*]] = call %dx.types.Handle @dx.op.annotateHandle(i32 217, %dx.types.Handle [[BIND]]
+  %buffer = call target("dx.TypedBuffer", <4 x half>, 1, 0, 0)
+      @llvm.dx.handle.fromBinding.tdx.TypedBuffer_v4f16_1_0_0(
+          i32 0, i32 0, i32 1, i32 0, i1 false)
+
+  ; The temporary casts should all have been cleaned up
+  ; CHECK-NOT: %dx.cast_handle
+
+  ; CHECK: [[DATA0_0:%.*]] = extractelement <4 x half> %data, i32 0
+  ; CHECK: [[DATA0_1:%.*]] = extractelement <4 x half> %data, i32 1
+  ; CHECK: [[DATA0_2:%.*]] = extractelement <4 x half> %data, i32 2
+  ; CHECK: [[DATA0_3:%.*]] = extractelement <4 x half> %data, i32 3
+  ; CHECK: call void @dx.op.bufferStore.f16(i32 69, %dx.types.Handle [[HANDLE]], i32 %index, i32 undef, half [[DATA0_0]], half [[DATA0_1]], half [[DATA0_2]], half [[DATA0_3]], i8 15)
+  call void @llvm.dx.typedBufferStore(
+      target("dx.TypedBuffer", <4 x half>, 1, 0, 0) %buffer,
+      i32 %index, <4 x half> %data)
+
+  ret void
+}
+
+define void @storei16(<4 x i16> %data, i32 %index) {
+
+  ; CHECK: [[BIND:%.*]] = call %dx.types.Handle @dx.op.createHandleFromBinding
+  ; CHECK: [[HANDLE:%.*]] = call %dx.types.Handle @dx.op.annotateHandle(i32 217, %dx.types.Handle [[BIND]]
+  %buffer = call target("dx.TypedBuffer", <4 x i16>, 1, 0, 0)
+      @llvm.dx.handle.fromBinding.tdx.TypedBuffer_v4i16_1_0_0(
+          i32 0, i32 0, i32 1, i32 0, i1 false)
+
+  ; The temporary casts should all have been cleaned up
+  ; CHECK-NOT: %dx.cast_handle
+
+  ; CHECK: [[DATA0_0:%.*]] = extractelement <4 x i16> %data, i32 0
+  ; CHECK: [[DATA0_1:%.*]] = extractelement <4 x i16> %data, i32 1
+  ; CHECK: [[DATA0_2:%.*]] = extractelement <4 x i16> %data, i32 2
+  ; CHECK: [[DATA0_3:%.*]] = extractelement <4 x i16> %data, i32 3
+  ; CHECK: call void @dx.op.bufferStore.i16(i32 69, %dx.types.Handle [[HANDLE]], i32 %index, i32 undef, i16 [[DATA0_0]], i16 [[DATA0_1]], i16 [[DATA0_2]], i16 [[DATA0_3]], i8 15)
+  call void @llvm.dx.typedBufferStore(
+      target("dx.TypedBuffer", <4 x i16>, 1, 0, 0) %buffer,
+      i32 %index, <4 x i16> %data)
+
+  ret void
+}