llvm
diff --git a/‎mlir/include/mlir/Dialect/GPU/TransformOps/GPUDeviceMappingAttr.td
Lines changed: 169 additions & 63 deletions b/‎mlir/include/mlir/Dialect/GPU/TransformOps/GPUDeviceMappingAttr.td
Lines changed: 169 additions & 63 deletions
diff --git a/‎mlir/include/mlir/Dialect/GPU/TransformOps/GPUTransformOps.h
Lines changed: 19 additions & 19 deletions b/‎mlir/include/mlir/Dialect/GPU/TransformOps/GPUTransformOps.h
Lines changed: 19 additions & 19 deletions
diff --git a/‎mlir/include/mlir/Dialect/GPU/TransformOps/GPUTransformOps.td
Lines changed: 3 additions & 3 deletions b/‎mlir/include/mlir/Dialect/GPU/TransformOps/GPUTransformOps.td
Lines changed: 3 additions & 3 deletions
@@ -20,107 +20,214 @@ include "mlir/Dialect/SCF/IR/DeviceMappingInterface.td"
 def DimX : I64EnumAttrCase<"DimX", 0, "x">;
 def DimY : I64EnumAttrCase<"DimY", 1, "y">;
 def DimZ : I64EnumAttrCase<"DimZ", 2, "z">;
-
-def ThreadsEnum : I64EnumAttr<"Threads", "threads for loop mapping", [
-    DimX, DimY, DimZ]> {
+def LinearDim0 : I64EnumAttrCase<"LinearDim0", 3, "linear_dim_0">;
+def LinearDim1 : I64EnumAttrCase<"LinearDim1", 4, "linear_dim_1">;
+def LinearDim2 : I64EnumAttrCase<"LinearDim2", 5, "linear_dim_2">;
+def LinearDim3 : I64EnumAttrCase<"LinearDim3", 6, "linear_dim_3">;
+def LinearDim4 : I64EnumAttrCase<"LinearDim4", 7, "linear_dim_4">;
+def LinearDim5 : I64EnumAttrCase<"LinearDim5", 8, "linear_dim_5">;
+def LinearDim6 : I64EnumAttrCase<"LinearDim6", 9, "linear_dim_6">;
+def LinearDim7 : I64EnumAttrCase<"LinearDim7", 10, "linear_dim_7">;
+def LinearDim8 : I64EnumAttrCase<"LinearDim8", 11, "linear_dim_8">;
+def LinearDim9 : I64EnumAttrCase<"LinearDim9", 12, "linear_dim_9">;
+
+// TODO: This would be better represented with separate Grid and Linear Mapping
+// ids. Unfortunately it is not yet possible to have an optional EnumParameter
+// so we currently embed the 2 modes in the same enum.
+def MappingIdEnum : I64EnumAttr<"MappingId", "Mapping ids for loop mapping", [
+    DimX, DimY, DimZ,
+    LinearDim0, LinearDim1, LinearDim2, LinearDim3, LinearDim4, 
+    LinearDim5, LinearDim6, LinearDim7, LinearDim8, LinearDim9]> {
   let cppNamespace = "::mlir::gpu";
 }
 
-def GPUThreadMappingAttr
-    : GPU_Attr<"GPUThreadMapping", "thread", [
-      DeclareAttrInterfaceMethods<DeviceMappingAttrInterface> ]> {
+def GPUBlockMappingAttr : GPU_Attr<"GPUBlockMapping", "block", [
+  DeclareAttrInterfaceMethods<DeviceMappingAttrInterface> ] >  {
   let parameters = (ins
-    EnumParameter<ThreadsEnum>:$thread
+    EnumParameter<MappingIdEnum>:$block
   );
   let assemblyFormat = "`<` params `>`";
   let description = [{
-    An attribute that allows defining thread parallelism for GPU devices.
+    An attribute that allows defining thread block parallelism for GPU devices.
 
-    Thread (aka work item) are grouped into a thread blocks where block may be
-    described by a 1-, 2-, or 3-dimensional rectangle. This attribute indicates
-    that thread parallelism is desired. It can be consumed by lowering to
-    generate GPU.
-  }];
-}
+    Thread blocks (aka workgroup) are grouped into a grid described by a 
+    3-dimensional rectangle.
+    This attribute indicates that thread block parallelism is desired.
+    It can be consumed by lowering to generate GPU code.
+    2 modes are supported: (1) 3D mapping mode and (2) linear mapping mode.
 
-def WarpsEnum : I64EnumAttr<"Warps", "threads for loop mapping", [
-    DimX, DimY, DimZ]> {
-  let cppNamespace = "::mlir::gpu";
+    #### 3D mapping mode
+
+    The 3D block id is simply the 3D index of the block `(bidx, bidy, bidz)`. 
+    If required, predication occurs on a per-dimension basis. This allows 
+    specifying predication on a 3D sub-rectangle of the grid.
+
+    #### Linear mapping mode
+
+    The linear block id is obtained by linearizing the index of the block. 
+    If required, predication occurs on the linear id. This allows specifying
+    predication on a 1D subset of the (linearized) grid.
+
+    For instance, if the basis is denoted as (GX, GY, GZ) and the block id is
+    denoted by (bx, by, bz), the block id is:
+      `linear_id = bx + by * GX + bz * GX * GY`.
+    The linear block id is fixed for the duration of a GPU kernel.
+    
+    This linear id mapping attribute indicates a different linearization relation
+    is applied locally to a loop nest. 
+    
+    For instance, if the new basis is denoted as (LBD0, LBD1, LBD2, LBD3) the 
+    block id in the new basis is:
+      ```(linear_id mod LBD0 , 
+          (linear_id / LBD0) mod LBD1, 
+          (linear_id / (LBD0 * LBD1)) mod LBD2, 
+          (linear_id / (LBD0 * LBD1 * LBD2)) mod LBD3)```.
+    This reinterpretation is only fixed for the duration of a loop nest.
+  }];
 }
 
-def GPUWarpMappingAttr : GPU_Attr<"GPUWarpMapping", "warp", [
-  DeclareAttrInterfaceMethods<DeviceMappingAttrInterface> ] >  {
+def GPUWarpgroupMappingAttr
+    : GPU_Attr<"GPUWarpgroupMapping", "warpgroup", [
+      DeclareAttrInterfaceMethods<DeviceMappingAttrInterface> ]> {
   let parameters = (ins
-    EnumParameter<WarpsEnum>:$warp
+    EnumParameter<MappingIdEnum>:$warpgroup
   );
   let assemblyFormat = "`<` params `>`";
   let description = [{
-    An attribute that allows defining thread block parallelism for GPU devices.
+    An attribute that allows defining warpgroup parallelism for GPU devices.
 
-    Warp (aka subgroup) are grouped into a grid where grid may be
-    described by a 1-, 2-, or 3-dimensional rectangle. This attribute indicates
-    that thread block parallelism is desired. It can be consumed by lowering to
-    generate GPU code.
-  }];
-}
+    Threads of proper granularity (e.g. multiple of 
+    "kNumWarpsPerGroup * kWarpSize" on CUDA devices) can be grouped into
+    warpgroups described by a 3-dimensional rectangle. 
+    This attribute indicates that warpgroup parallelism is desired. 
+    It can be consumed by lowering to generate GPU code.
+    2 modes are supported: (1) 3D mapping mode and (2) linear mapping mode.
 
-def LinearIdEnum : I64EnumAttr<"LinearId", "linear ids for loop mapping", [
-    DimX, DimY, DimZ]> {
-  let cppNamespace = "::mlir::gpu";
+    #### 3D mapping mode
+
+    The 3D warpgroup id is simply the adjusted 3D index of the thread 
+    `(tidx / (kNumWarpsPerGroup * kWarpSize), tidy, tidz)`.
+    If required, predication occurs on a per-dimension basis. This allows 
+    specifying predication on a 3D sub-rectangle of the warpgroups.
+
+    #### Linear mapping mode
+
+    The linear warpgroup id is obtained by linearizing the index of the warpgroup.
+    If required, predication occurs on the linear id. This allows specifying
+    predication on a 1D "kNumWarpsPerGroup * kWarpSize"-aligned subset of the 
+    (linearized) block.
+
+    For instance, if the basis is denoted as (BX, BY, BZ) and the thread id is
+    denoted by (tx, ty, tz), the linear warpgroup id is:
+      ```linear_id = (tx + ty * BX + tz * BX * BY) 
+                 / (kNumWarpsPerGroup * kWarpSize)```.
+    The linear warpgroup id is fixed for the duration of a GPU kernel.
+    
+    This linear id mapping attribute indicates a different linearization relation
+    is applied locally to a loop nest. 
+    
+    For instance, if the new basis is denoted as (LWGD0, LWGD1, LWGD2, LWGD3) the 
+    warpgroup id in the new basis is:
+      ```(linear_id mod LWGD0 , 
+          (linear_id / LWGD0) mod LWGD1, 
+          (linear_id / (LWGD0 * LWGD1)) mod LWGD2, 
+          (linear_id / (LWGD0 * LWGD1 * LWGD2)) mod LWGD3)```.
+    This reinterpretation is only fixed for the duration of a loop nest.
+  }];
 }
 
-def GPULinearIdMapping : GPU_Attr<"GPULinearIdMapping", "linear", [
-  DeclareAttrInterfaceMethods<DeviceMappingAttrInterface> ] >  {
+def GPUWarpMappingAttr
+    : GPU_Attr<"GPUWarpMapping", "warp", [
+      DeclareAttrInterfaceMethods<DeviceMappingAttrInterface> ]> {
   let parameters = (ins
-    EnumParameter<LinearIdEnum>:$linear_id
+    EnumParameter<MappingIdEnum>:$warp
   );
   let assemblyFormat = "`<` params `>`";
   let description = [{
-    An attribute to allow re-interpreting the linear mapping for threads in GPU
-    devices.
+    An attribute that allows defining warp parallelism for GPU devices.
 
-    Threads (aka work item) are grouped into a thread block where block may be
-    described by a 1-, 2-, or 3-dimensional rectangular basis.
-    The linear thread id is obtained by linearizing the 1-, 2- or 3-dimensional
-    index. For instance, if the basis is denoted as (BX, BY, BZ) and the thread
-    id is denoted by (tx, ty, tz), the linear thread id is:
-      `linear_id = tx + ty * BX + tz * BX * BY)`.
-    The linear thread id is fixed for the duration of a GPU kernel.
+    Threads of proper granularity (e.g. multiple of "warp size" on CUDA devices) 
+    can be grouped into warps described by a 3-dimensional rectangle. 
+    This attribute indicates that warp parallelism is desired.
+    It can be consumed by lowering to generate GPU code.
+    2 modes are supported: (1) 3D mapping mode and (2) linear mapping mode.
+
+    #### 3D mapping mode
+
+    The 3D warp id is simply the adjusted 3D index of the thread 
+    `(tidx / kWarpSize, tidy, tidz)`.
+    If required, predication occurs on a per-dimension basis. This allows 
+    specifying predication on a 3D sub-rectangle of the warps.
+
+    #### Linear mapping mode
+
+    The linear warp id is obtained by linearizing the index of the warp.
+    If required, predication occurs on the linear id. This allows specifying
+    predication on a 1D "kWarpSize"-aligned subset of the (linearized) block.
+
+    For instance, if the basis is denoted as (BX, BY, BZ) and the thread id is
+    denoted by (tx, ty, tz), the linear warp id is:
+      `linear_id = (tx + ty * BX + tz * BX * BY) / kWarpSize`.
+    The linear warp id is fixed for the duration of a GPU kernel.
 
     This linear id mapping attribute indicates a different linearization relation
     is applied locally to a loop nest. 
 
-    For instance, if the new basis is denoted as (LBX, LBY, LBZ) the thread id
-    in the new basis is:
-      `(linear_id mod LBX , (linear_id / LBX) mod * LBY, linear_id / (LBX * LBY))`.
-    This reinterpretation is only fixe for the duration of a loop nest.
-    
-    It can be consumed by lowering to generate GPU code.
+    For instance, if the new basis is denoted as (LWD0, LWD1, LWD2, LWD3) the 
+    warp id in the new basis is:
+      ```(linear_id mod LWD0 , 
+          (linear_id / LWD0) mod LWD1, 
+          (linear_id / (LWD0 * LWD1)) mod LWD2, 
+          (linear_id / (LWD0 * LWD1 * LWD2)) mod LWD3)```.
+    This reinterpretation is only fixed for the duration of a loop nest.
   }];
 }
 
-def BlocksEnum : I64EnumAttr<"Blocks", "threads for loop mapping", [
-    DimX, DimY, DimZ]> {
-  let cppNamespace = "::mlir::gpu";
-}
-
-def GPUBlockMappingAttr : GPU_Attr<"GPUBlockMapping", "block", [
-  DeclareAttrInterfaceMethods<DeviceMappingAttrInterface> ] >  {
+def GPUThreadMappingAttr
+    : GPU_Attr<"GPUThreadMapping", "thread", [
+      DeclareAttrInterfaceMethods<DeviceMappingAttrInterface> ]> {
   let parameters = (ins
-    EnumParameter<BlocksEnum>:$block
+    EnumParameter<MappingIdEnum>:$thread
   );
   let assemblyFormat = "`<` params `>`";
   let description = [{
-    An attribute that allows defining thread block parallelism for GPU devices.
+    An attribute that allows defining thread parallelism for GPU devices.
+
+    Threads (aka work items) are grouped into thread blocks described by a 
+    3-dimensional rectangle.
+    This attribute indicates that thread parallelism is desired.
+    It can be consumed by lowering to generate GPU code.
+
+    #### 3D mapping mode
+
+    The 3D thread id is simply the 3D index of the thread `(tidx, tidy, tidz)`. 
+    If required, predication occurs on a per-dimension basis. This allows 
+    specifying predication on a 3D sub-rectangle of the block.
+
+    #### Linear mapping mode
 
-    Thread blocks (aka work-group) are grouped into a grid where grid may be
-    described by a 1-, 2-, or 3-dimensional rectangle. This attribute indicates
-    that thread block parallelism is desired. It can be consumed by lowering to
-    generate GPU code.
+    The linear thread id is obtained by linearizing the index of the thread. 
+    If required, predication occurs on the linear id. This allows specifying
+    predication on a 1D subset of the (linearized) block.
+
+    For instance, if the basis is denoted as (BX, BY, BZ) and the thread id is
+    denoted by (tx, ty, tz), the linear thread id is:
+      ```linear_id = (tx + ty * BX + tz * BX * BY)```.
+    The linear thread id is fixed for the duration of a GPU kernel.
+    
+    This linear id mapping attribute indicates a different linearization relation
+    is applied locally to a loop nest. 
+    
+    For instance, if the new basis is denoted as (LTD0, LTD1, LTD2, LTD3) the 
+    thread id in the new basis is:
+      ```(linear_id mod LTD0 , 
+          (linear_id / LTD0) mod LTD1, 
+          (linear_id / (LTD0 * LTD1)) mod LTD2, 
+          (linear_id / (LTD0 * LTD1 * LTD2)) mod LTD3)```.
+    This reinterpretation is only fixed for the duration of a loop nest.
   }];
 }
 
-
 def GPUMemorySpaceMappingAttr : GPU_Attr<"GPUMemorySpaceMapping", "memory_space", [
   DeclareAttrInterfaceMethods<DeviceMappingAttrInterface> ] >  {
   let parameters = (ins
@@ -138,5 +245,4 @@ def GPUMemorySpaceMappingAttr : GPU_Attr<"GPUMemorySpaceMapping", "memory_space"
   }];
 }
 
-
 #endif // GPU_DEVICE_MAPPING_ATTR
@@ -33,49 +33,49 @@ namespace transform {
 namespace gpu {
 struct GpuIdBuilder;
 
-/// Map the top level `scf.forall` op to GPU Thread Blocks.
+/// Map the top level `scf.forall` op to GPU blocks.
 /// Mapping is one-to-one and the induction variables of `scf.forall` are
 /// rewritten to gpu.block_id according to the thread_dim_mapping attribute.
 ///
 /// Dynamic, `scf.forall` trip counts are currently not supported.
-/// Dynamic block dim sizes are currently not supported.
+/// Dynamic `gridDims` are currently not supported.
 DiagnosedSilenceableFailure
 mapForallToBlocksImpl(RewriterBase &rewriter, TransformOpInterface transformOp,
                       scf::ForallOp forallOp,
                       SmallVectorImpl<int64_t> &gridDims,
                       const GpuIdBuilder &gpuIdBuilder);
 
 /// Search `scf.forall` ops nested under `target` and map each such op to an
-/// explicit GPU implementation along `availableMappingSizes`.
+/// explicit GPU implementation along `blockDims`.
 /// The mapping is one-to-one and the induction variables of `scf.forall` are
 /// rewritten to gpuIdBuilder.idBuilder according to the
 /// gpuIdBuilder.mappingAttributes attribute.
 ///
 /// Dynamic, `scf.forall` trip counts are currently not supported.
-/// Dynamic `availableMappingSizes` sizes are currently not supported.
-/// `availableMappingSizes` is expected to be of size 3.
-DiagnosedSilenceableFailure mapOneForallToThreadsImpl(
-    RewriterBase &rewriter, std::optional<TransformOpInterface> transformOp,
-    scf::ForallOp forallOp, ArrayRef<int64_t> availableMappingSizes,
-    bool syncAfterDistribute, const GpuIdBuilder &gpuIdBuilder);
+/// Dynamic `blockDims` sizes are currently not supported.
+/// `blockDims` is expected to be of size 3.
+DiagnosedSilenceableFailure
+mapOneForallToThreadsImpl(RewriterBase &rewriter,
+                          std::optional<TransformOpInterface> transformOp,
+                          scf::ForallOp forallOp, ArrayRef<int64_t> blockDims,
+                          int64_t warpSize, bool syncAfterDistribute);
 
 /// Search `scf.forall` ops nested under `target` and map each such op to an
-/// explicit GPU implementation along blockDims and warpDims.
+/// explicit GPU implementation along `blockDims`.
 /// The mapping is one-to-one and the induction variables of `scf.forall` are
-/// rewritten to threads and warps ids according to the mapping attribute.
+/// rewritten to appropriate ids according to the mapping attribute.
 ///
 /// Dynamic, `scf.forall` trip counts are currently not supported.
-/// Dynamic `blockDims` or `warpDims` or `linearDims` sizes are currently not
-/// supported.
-/// `blockDims` is expected to be of size 3.
-/// `warpDims` is expected to be empty or of size 3.
+/// Dynamic `blockDims` or `newBasis` entries are currently not
+/// supported. `blockDims` is expected to be of size 3.
 ///
 /// The insertion point of the `rewriter` is expected to be set at the
 /// beginning of the `target` body block and dominate all other blocks.
-DiagnosedSilenceableFailure mapNestedForallToThreadsImpl(
-    RewriterBase &rewriter, std::optional<TransformOpInterface> transformOp,
-    Operation *target, ArrayRef<int64_t> blockDimsOfr,
-    ArrayRef<int64_t> warpDims, bool syncAfterDistribute);
+DiagnosedSilenceableFailure
+mapNestedForallToThreadsImpl(RewriterBase &rewriter,
+                             std::optional<TransformOpInterface> transformOp,
+                             Operation *target, ArrayRef<int64_t> blockDims,
+                             int64_t warpSize, bool syncAfterDistribute);
 
 } // namespace gpu
 } // namespace transform
 
@@ -167,15 +167,15 @@ def MapNestedForallToThreads :
 
   let arguments = (ins TransformHandleTypeInterface:$target,
                    DefaultValuedAttr<DenseI64ArrayAttr, "{}">:$block_dims,
-                   DefaultValuedOptionalAttr<DenseI64ArrayAttr, "{}">:$warp_dims,
-                   DefaultValuedAttr<BoolAttr, "true">:$sync_after_distribute);
+                   DefaultValuedAttr<BoolAttr, "true">:$sync_after_distribute,
+                   DefaultValuedAttr<I64Attr, "32">:$warp_size);
   let results = (outs TransformHandleTypeInterface:$result);
 
   let assemblyFormat = [{
     $target
     `block_dims` `=` $block_dims
-    (`warp_dims` `=` $warp_dims^)?
     (`sync_after_distribute` `=` $sync_after_distribute^)?
+    (`warp_size` `=` $warp_size^)?
     attr-dict
     `:` functional-type($target, $result)
   }];