[DirectX] Implement the resource.load.rawbuffer intrinsic (llvm#121012)

bogner · web-flow · commit cba9bd5cb077 · 2025-01-08T16:56:05.000-08:00
This introduces `@llvm.dx.resource.load.rawbuffer` and generalizes the buffer load docs under DirectX/DXILResources. This resolves the "load" parts of llvm#106188
diff --git a/llvm/docs/DirectX/DXILResources.rst b/llvm/docs/DirectX/DXILResources.rst
@@ -318,39 +318,43 @@ Examples:
    %ptr = call ptr @llvm.dx.resource.getpointer.p0.tdx.TypedBuffer_v4f32_0_0_0t(
        target("dx.TypedBuffer", <4 x float>, 0, 0, 0) %buffer, i32 %index)
 
-16-byte Loads, Samples, and Gathers
------------------------------------
-
-*relevant types: TypedBuffer, CBuffer, and Textures*
-
-TypedBuffer, CBuffer, and Texture loads, as well as samples and gathers, can
-return 1 to 4 elements from the given resource, to a maximum of 16 bytes of
-data. DXIL's modeling of this is influenced by DirectX and DXBC's history and
-it generally treats these operations as returning 4 32-bit values. For 16-bit
-elements the values are 16-bit values, and for 64-bit values the operations
-return 4 32-bit integers and emit further code to construct the double.
-
-In DXIL, these operations return `ResRet`_ and `CBufRet`_ values, are structs
-containing 4 elements of the same type, and in the case of `ResRet` a 5th
-element that is used by the `CheckAccessFullyMapped`_ operation.
-
-In LLVM IR the intrinsics will return the contained type of the resource
-instead. That is, ``llvm.dx.resource.load.typedbuffer`` from a
-``Buffer<float>`` would return a single float, from ``Buffer<float4>`` a vector
-of 4 floats, and from ``Buffer<double2>`` a vector of two doubles, etc. The
-operations are then expanded out to match DXIL's format during lowering.
-
-In order to support ``CheckAccessFullyMapped``, we need these intrinsics to
-return an anonymous struct with element-0 being the contained type, and
-element-1 being the ``i1`` result of a ``CheckAccessFullyMapped`` call. We
-don't have a separate call to ``CheckAccessFullyMapped`` at all, since that's
-the only operation that can possibly be done on this value. In practice this
-may mean we insert a DXIL operation for the check when this was missing in the
-HLSL source, but this actually matches DXC's behaviour in practice.
+Loads, Samples, and Gathers
+---------------------------
+
+*relevant types: Buffers, CBuffers, and Textures*
+
+All load, sample, and gather operations in DXIL return a `ResRet`_ type, and
+CBuffer loads return a similar `CBufRet`_ type. These types are structs
+containing 4 elements of some basic type, and in the case of `ResRet` a 5th
+element that is used by the `CheckAccessFullyMapped`_ operation. Some of these
+operations, like `RawBufferLoad`_ include a mask and/or alignment that tell us
+some information about how to interpret those four values.
+
+In the LLVM IR representations of these operations we instead return scalars or
+vectors, but we keep the requirement that we only return up to 4 elements of a
+basic type. This avoids some unnecessary casting and structure manipulation in
+the intermediate format while also keeping lowering to DXIL straightforward.
+
+LLVM intrinsics that map to operations returning `ResRet` return an anonymous
+struct with element-0 being the scalar or vector type, and element-1 being the
+``i1`` result of a ``CheckAccessFullyMapped`` call. We don't have a separate
+call to ``CheckAccessFullyMapped`` at all, since that's the only operation that
+can possibly be done on this value. In practice this may mean we insert a DXIL
+operation for the check when this was missing in the HLSL source, but this
+actually matches DXC's behaviour in practice.
+
+For TypedBuffer and Texture, we map directly from the contained type of the
+resource to the return value of the intrinsic. Since these resources are
+constrained to contain only scalars and vectors of up to 4 elements, the
+lowering to DXIL ops is generally straightforward. The one exception we have
+here is that `double` types in the elements are special - these are allowed in
+the LLVM intrinsics, but are lowered to pairs of `i32` followed by
+``MakeDouble`` operations for DXIL.
 
 .. _ResRet: https://github.com/microsoft/DirectXShaderCompiler/blob/main/docs/DXIL.rst#resource-operation-return-types
 .. _CBufRet: https://github.com/microsoft/DirectXShaderCompiler/blob/main/docs/DXIL.rst#cbufferloadlegacy
 .. _CheckAccessFullyMapped: https://learn.microsoft.com/en-us/windows/win32/direct3dhlsl/checkaccessfullymapped
+.. _RawBufferLoad: https://github.com/microsoft/DirectXShaderCompiler/blob/main/docs/DXIL.rst#rawbufferload
 
 .. list-table:: ``@llvm.dx.resource.load.typedbuffer``
    :header-rows: 1
@@ -392,6 +396,101 @@ Examples:
        @llvm.dx.resource.load.typedbuffer.v2f64.tdx.TypedBuffer_v2f64_0_0t(
            target("dx.TypedBuffer", <2 x double>, 0, 0, 0) %buffer, i32 %index)
 
+For RawBuffer, an HLSL load operation may return an arbitrarily sized result,
+but we still constrain the LLVM intrinsic to return only up to 4 elements of a
+basic type. This means that larger loads are represented as a series of loads,
+which matches DXIL. Unlike in the `RawBufferLoad`_ operation, we do not need
+arguments for the mask/type size and alignment, since we can calculate these
+from the return type of the load during lowering.
+
+.. _RawBufferLoad: https://github.com/microsoft/DirectXShaderCompiler/blob/main/docs/DXIL.rst#rawbufferload
+
+.. list-table:: ``@llvm.dx.resource.load.rawbuffer``
+   :header-rows: 1
+
+   * - Argument
+     -
+     - Type
+     - Description
+   * - Return value
+     -
+     - A structure of a scalar or vector and the check bit
+     - The data loaded from the buffer and the check bit
+   * - ``%buffer``
+     - 0
+     - ``target(dx.RawBuffer, ...)``
+     - The buffer to load from
+   * - ``%index``
+     - 1
+     - ``i32``
+     - Index into the buffer
+   * - ``%offset``
+     - 2
+     - ``i32``
+     - Offset into the structure at the given index
+
+Examples:
+
+.. code-block:: llvm
+
+   ; float
+   %ret = call {float, i1}
+       @llvm.dx.resource.load.rawbuffer.f32.tdx.RawBuffer_f32_0_0_0t(
+           target("dx.RawBuffer", float, 0, 0, 0) %buffer,
+           i32 %index,
+           i32 0)
+   %ret = call {float, i1}
+       @llvm.dx.resource.load.rawbuffer.f32.tdx.RawBuffer_i8_0_0_0t(
+           target("dx.RawBuffer", i8, 0, 0, 0) %buffer,
+           i32 %byte_offset,
+           i32 0)
+
+   ; float4
+   %ret = call {<4 x float>, i1}
+       @llvm.dx.resource.load.rawbuffer.v4f32.tdx.RawBuffer_v4f32_0_0_0t(
+           target("dx.RawBuffer", float, 0, 0, 0) %buffer,
+           i32 %index,
+           i32 0)
+   %ret = call {float, i1}
+       @llvm.dx.resource.load.rawbuffer.v4f32.tdx.RawBuffer_i8_0_0_0t(
+           target("dx.RawBuffer", i8, 0, 0, 0) %buffer,
+           i32 %byte_offset,
+           i32 0)
+
+   ; struct S0 { float4 f; int4 i; };
+   %ret = call {<4 x float>, i1}
+       @llvm.dx.resource.load.rawbuffer.v4f32.tdx.RawBuffer_sl_v4f32v4i32s_0_0t(
+           target("dx.RawBuffer", {<4 x float>, <4 x i32>}, 0, 0, 0) %buffer,
+           i32 %index,
+           i32 0)
+   %ret = call {<4 x i32>, i1}
+       @llvm.dx.resource.load.rawbuffer.v4i32.tdx.RawBuffer_sl_v4f32v4i32s_0_0t(
+           target("dx.RawBuffer", {<4 x float>, <4 x i32>}, 0, 0, 0) %buffer,
+           i32 %index,
+           i32 1)
+
+   ; struct Q { float4 f; int3 i; }
+   ; struct R { int z; S x; }
+   %ret = call {i32, i1}
+       @llvm.dx.resource.load.rawbuffer.i32(
+           target("dx.RawBuffer", {i32, {<4 x float>, <3 x i32>}}, 0, 0, 0)
+               %buffer, i32 %index, i32 0)
+   %ret = call {<4 x float>, i1}
+       @llvm.dx.resource.load.rawbuffer.i32(
+           target("dx.RawBuffer", {i32, {<4 x float>, <3 x i32>}}, 0, 0, 0)
+               %buffer, i32 %index, i32 4)
+   %ret = call {<3 x i32>, i1}
+       @llvm.dx.resource.load.rawbuffer.i32(
+           target("dx.RawBuffer", {i32, {<4 x float>, <3 x i32>}}, 0, 0, 0)
+               %buffer, i32 %index, i32 20)
+
+   ; byteaddressbuf.Load<int64_t4>
+   %ret = call {<4 x i64>, i1}
+       @llvm.dx.resource.load.rawbuffer.v4i64.tdx.RawBuffer_i8_0_0t(
+           target("dx.RawBuffer", i8, 0, 0, 0) %buffer,
+           i32 %byte_offset,
+           i32 0)
+
 Texture and Typed Buffer Stores
 -------------------------------
 
diff --git a/llvm/include/llvm/IR/IntrinsicsDirectX.td b/llvm/include/llvm/IR/IntrinsicsDirectX.td
@@ -36,6 +36,10 @@ def int_dx_resource_load_typedbuffer
 def int_dx_resource_store_typedbuffer
     : DefaultAttrsIntrinsic<[], [llvm_any_ty, llvm_i32_ty, llvm_anyvector_ty],
                             [IntrWriteMem]>;
+def int_dx_resource_load_rawbuffer
+    : DefaultAttrsIntrinsic<[llvm_any_ty, llvm_i1_ty],
+                            [llvm_any_ty, llvm_i32_ty, llvm_i32_ty],
+                            [IntrReadMem]>;
 
 def int_dx_resource_updatecounter
     : DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_any_ty, llvm_i8_ty],
diff --git a/llvm/lib/Target/DirectX/DXIL.td b/llvm/lib/Target/DirectX/DXIL.td
@@ -42,8 +42,10 @@ def FloatTy : DXILOpParamType;
 def DoubleTy : DXILOpParamType;
 def ResRetHalfTy : DXILOpParamType;
 def ResRetFloatTy : DXILOpParamType;
+def ResRetDoubleTy : DXILOpParamType;
 def ResRetInt16Ty : DXILOpParamType;
 def ResRetInt32Ty : DXILOpParamType;
+def ResRetInt64Ty : DXILOpParamType;
 def HandleTy : DXILOpParamType;
 def ResBindTy : DXILOpParamType;
 def ResPropsTy : DXILOpParamType;
@@ -890,6 +892,23 @@ def SplitDouble :  DXILOp<102, splitDouble> {
   let attributes = [Attributes<DXIL1_0, [ReadNone]>];
 }
 
+def RawBufferLoad : DXILOp<139, rawBufferLoad> {
+  let Doc = "reads from a raw buffer and structured buffer";
+  // Handle, Coord0, Coord1, Mask, Alignment
+  let arguments = [HandleTy, Int32Ty, Int32Ty, Int8Ty, Int32Ty];
+  let result = OverloadTy;
+  let overloads = [
+    Overloads<DXIL1_2,
+              [ResRetHalfTy, ResRetFloatTy, ResRetInt16Ty, ResRetInt32Ty]>,
+    Overloads<DXIL1_3,
+              [
+                ResRetHalfTy, ResRetFloatTy, ResRetDoubleTy, ResRetInt16Ty,
+                ResRetInt32Ty, ResRetInt64Ty
+              ]>
+  ];
+  let stages = [Stages<DXIL1_2, [all_stages]>];
+}
+
 def Dot4AddI8Packed : DXILOp<163, dot4AddPacked> {
   let Doc = "signed dot product of 4 x i8 vectors packed into i32, with "
             "accumulate to i32";
diff --git a/llvm/lib/Target/DirectX/DXILOpBuilder.cpp b/llvm/lib/Target/DirectX/DXILOpBuilder.cpp
@@ -263,10 +263,14 @@ static Type *getTypeFromOpParamType(OpParamType Kind, LLVMContext &Ctx,
     return getResRetType(Type::getHalfTy(Ctx));
   case OpParamType::ResRetFloatTy:
     return getResRetType(Type::getFloatTy(Ctx));
+  case OpParamType::ResRetDoubleTy:
+    return getResRetType(Type::getDoubleTy(Ctx));
   case OpParamType::ResRetInt16Ty:
     return getResRetType(Type::getInt16Ty(Ctx));
   case OpParamType::ResRetInt32Ty:
     return getResRetType(Type::getInt32Ty(Ctx));
+  case OpParamType::ResRetInt64Ty:
+    return getResRetType(Type::getInt64Ty(Ctx));
   case OpParamType::HandleTy:
     return getHandleType(Ctx);
   case OpParamType::ResBindTy:
diff --git a/llvm/lib/Target/DirectX/DXILOpLowering.cpp b/llvm/lib/Target/DirectX/DXILOpLowering.cpp
@@ -542,6 +542,48 @@ class OpLowerer {
     });
   }
 
+  [[nodiscard]] bool lowerRawBufferLoad(Function &F) {
+    Triple TT(Triple(M.getTargetTriple()));
+    VersionTuple DXILVersion = TT.getDXILVersion();
+    const DataLayout &DL = F.getDataLayout();
+    IRBuilder<> &IRB = OpBuilder.getIRB();
+    Type *Int8Ty = IRB.getInt8Ty();
+    Type *Int32Ty = IRB.getInt32Ty();
+
+    return replaceFunction(F, [&](CallInst *CI) -> Error {
+      IRB.SetInsertPoint(CI);
+
+      Type *OldTy = cast<StructType>(CI->getType())->getElementType(0);
+      Type *ScalarTy = OldTy->getScalarType();
+      Type *NewRetTy = OpBuilder.getResRetType(ScalarTy);
+
+      Value *Handle =
+          createTmpHandleCast(CI->getArgOperand(0), OpBuilder.getHandleType());
+      Value *Index0 = CI->getArgOperand(1);
+      Value *Index1 = CI->getArgOperand(2);
+      uint64_t NumElements =
+          DL.getTypeSizeInBits(OldTy) / DL.getTypeSizeInBits(ScalarTy);
+      Value *Mask = ConstantInt::get(Int8Ty, ~(~0U << NumElements));
+      Value *Align =
+          ConstantInt::get(Int32Ty, DL.getPrefTypeAlign(ScalarTy).value());
+
+      Expected<CallInst *> OpCall =
+          DXILVersion >= VersionTuple(1, 2)
+              ? OpBuilder.tryCreateOp(OpCode::RawBufferLoad,
+                                      {Handle, Index0, Index1, Mask, Align},
+                                      CI->getName(), NewRetTy)
+              : OpBuilder.tryCreateOp(OpCode::BufferLoad,
+                                      {Handle, Index0, Index1}, CI->getName(),
+                                      NewRetTy);
+      if (Error E = OpCall.takeError())
+        return E;
+      if (Error E = replaceResRetUses(CI, *OpCall, /*HasCheckBit=*/true))
+        return E;
+
+      return Error::success();
+    });
+  }
+
   [[nodiscard]] bool lowerUpdateCounter(Function &F) {
     IRBuilder<> &IRB = OpBuilder.getIRB();
     Type *Int32Ty = IRB.getInt32Ty();
@@ -736,6 +778,9 @@ class OpLowerer {
       case Intrinsic::dx_resource_store_typedbuffer:
         HasErrors |= lowerTypedBufferStore(F);
         break;
+      case Intrinsic::dx_resource_load_rawbuffer:
+        HasErrors |= lowerRawBufferLoad(F);
+        break;
       case Intrinsic::dx_resource_updatecounter:
         HasErrors |= lowerUpdateCounter(F);
         break;
diff --git a/llvm/test/CodeGen/DirectX/BufferLoad-sm61.ll b/llvm/test/CodeGen/DirectX/BufferLoad-sm61.ll
@@ -0,0 +1,60 @@
+; RUN: opt -S -dxil-op-lower %s | FileCheck %s
+; Before SM6.2 ByteAddressBuffer and StructuredBuffer lower to bufferLoad.
+
+target triple = "dxil-pc-shadermodel6.1-compute"
+
+; CHECK-LABEL: define void @loadf32_struct
+define void @loadf32_struct(i32 %index) {
+  %buffer = call target("dx.RawBuffer", float, 0, 0, 0)
+      @llvm.dx.resource.handlefrombinding.tdx.RawBuffer_f32_0_0_0(
+          i32 0, i32 0, i32 1, i32 0, i1 false)
+
+  ; CHECK: [[DATA:%.*]] = call %dx.types.ResRet.f32 @dx.op.bufferLoad.f32(i32 68, %dx.types.Handle %{{.*}}, i32 %index, i32 0)
+  %load = call {float, i1}
+      @llvm.dx.resource.load.rawbuffer.f32.tdx.RawBuffer_f32_0_0_0t(
+          target("dx.RawBuffer", float, 0, 0, 0) %buffer,
+          i32 %index,
+          i32 0)
+
+  ret void
+}
+
+; CHECK-LABEL: define void @loadv4f32_byte
+define void @loadv4f32_byte(i32 %offset) {
+  %buffer = call target("dx.RawBuffer", i8, 0, 0, 0)
+      @llvm.dx.resource.handlefrombinding.tdx.RawBuffer_i8_0_0_0(
+          i32 0, i32 0, i32 1, i32 0, i1 false)
+
+  ; CHECK: [[DATA:%.*]] = call %dx.types.ResRet.f32 @dx.op.bufferLoad.f32(i32 68, %dx.types.Handle %{{.*}}, i32 %offset, i32 0)
+  %load = call {<4 x float>, i1}
+      @llvm.dx.resource.load.rawbuffer.f32.tdx.RawBuffer_i8_0_0_0t(
+          target("dx.RawBuffer", i8, 0, 0, 0) %buffer,
+          i32 %offset,
+          i32 0)
+
+  ret void
+}
+
+; CHECK-LABEL: define void @loadnested
+define void @loadnested(i32 %index) {
+  %buffer = call
+      target("dx.RawBuffer", {i32, {<4 x float>, <3 x half>}}, 0, 0, 0)
+      @llvm.dx.resource.handlefrombinding(i32 0, i32 0, i32 1, i32 0, i1 false)
+
+  ; CHECK: [[DATAI32:%.*]] = call %dx.types.ResRet.i32 @dx.op.bufferLoad.i32(i32 68, %dx.types.Handle %{{.*}}, i32 %index, i32 0)
+  %loadi32 = call {i32, i1} @llvm.dx.resource.load.rawbuffer.i32(
+      target("dx.RawBuffer", {i32, {<4 x float>, <3 x half>}}, 0, 0, 0) %buffer,
+      i32 %index, i32 0)
+
+  ; CHECK: [[DATAF32:%.*]] = call %dx.types.ResRet.f32 @dx.op.bufferLoad.f32(i32 68, %dx.types.Handle %{{.*}}, i32 %index, i32 4)
+  %loadf32 = call {<4 x float>, i1} @llvm.dx.resource.load.rawbuffer.v4f32(
+      target("dx.RawBuffer", {i32, {<4 x float>, <3 x half>}}, 0, 0, 0) %buffer,
+      i32 %index, i32 4)
+
+  ; CHECK: [[DATAF16:%.*]] = call %dx.types.ResRet.f16 @dx.op.bufferLoad.f16(i32 68, %dx.types.Handle %{{.*}}, i32 %index, i32 20)
+  %loadf16 = call {<3 x half>, i1} @llvm.dx.resource.load.rawbuffer.v3f16(
+      target("dx.RawBuffer", {i32, {<4 x float>, <3 x half>}}, 0, 0, 0) %buffer,
+      i32 %index, i32 20)
+
+  ret void
+}
diff --git a/llvm/test/CodeGen/DirectX/RawBufferLoad-error64.ll b/llvm/test/CodeGen/DirectX/RawBufferLoad-error64.ll
@@ -0,0 +1,24 @@
+; We use llc for this test so that we don't abort after the first error.
+; RUN: not llc %s -o /dev/null 2>&1 | FileCheck %s
+
+target triple = "dxil-pc-shadermodel6.2-compute"
+
+declare void @v4f64_user(<4 x double>)
+
+; Can't load 64 bit types directly until SM6.3 (byteaddressbuf.Load<int64_t4>)
+; CHECK: error:
+; CHECK-SAME: in function loadv4f64_byte
+; CHECK-SAME: Cannot create RawBufferLoad operation: Invalid overload type
+define void @loadv4f64_byte(i32 %offset) "hlsl.export" {
+  %buffer = call target("dx.RawBuffer", i8, 0, 0, 0)
+      @llvm.dx.resource.handlefrombinding.tdx.RawBuffer_i8_0_0_0(
+          i32 0, i32 0, i32 1, i32 0, i1 false)
+
+  %load = call {<4 x double>, i1} @llvm.dx.resource.load.rawbuffer.v4i64(
+      target("dx.RawBuffer", i8, 0, 0, 0) %buffer, i32 %offset, i32 0)
+  %data = extractvalue {<4 x double>, i1} %load, 0
+
+  call void @v4f64_user(<4 x double> %data)
+
+  ret void
+}
diff --git a/llvm/test/CodeGen/DirectX/RawBufferLoad.ll b/llvm/test/CodeGen/DirectX/RawBufferLoad.ll
diff --git a/llvm/utils/TableGen/DXILEmitter.cpp b/llvm/utils/TableGen/DXILEmitter.cpp