
Commit 2a1d373

Switch to 1D representation for SIMT
1 parent 56b7923 commit 2a1d373

6 files changed: +250 -285 lines


mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td

Lines changed: 6 additions & 11 deletions
@@ -833,30 +833,25 @@ def XeGPU_DpasOp : XeGPU_Op<"dpas", [Pure, AllElementTypesMatch<["lhs", "rhs"]>]
     data type, the matrices are `A: vector<8x16xf16>`, `B: vector<16x16xf16>`,
     and `C/D: vector<8x16xf32>`. Besides the matrix size requirements, DPAS
     also requires A and B to be loaded with the required data layout. Specially,
-
     VNNI layout is required for B operand. It is achieved via adding `packed`
     attribute to the `load_nd` operator. Due to the VNNI transformation, B operands
     can be represented as a 3D vector, with the last dimension representing the VNNI
     factor, which is computed as `32/bit_width_of_elem_type`. Thus, `B: vector<16x16xf16>`
     can be represented as `B: vector<8x16x2xf16>`.
 
-    In SIMT mode, DpasOp expects layout attributes `a`, `b`, and `c` (only if acc is used)
-    which describe the data fragment owned by each work-item w.r.t. the tensor descriptor
-    these data are loaded from.
+    In SIMT code, each work-item from a subgroup holds a data fragment for A, B, C and the result,
+    which are represented as 1D vectors.
 
     Note: on PVC, the hardware can perform load with VNNI transformation when data
     element type is 16-bit or lower precision, taking 2 or 4 elements from
     the first dimension and inserted into the newly added innermost dimension.
   }];
 
   let arguments = (ins
-    XeGPU_DpasOpType : $lhs,
-    XeGPU_DpasOpType : $rhs,
-    Optional<XeGPU_Vector2DType>: $acc,
-    OptionalAttr<XeGPU_LayoutAttr>:$a_layout,
-    OptionalAttr<XeGPU_LayoutAttr>:$b_layout,
-    OptionalAttr<XeGPU_LayoutAttr>:$c_layout);
-  let results = (outs XeGPU_Vector2DType: $result);
+    XeGPU_DpasOprType : $lhs,
+    XeGPU_DpasOprType : $rhs,
+    Optional<XeGPU_DpasResType>: $acc);
+  let results = (outs XeGPU_DpasResType: $result);
 
   let extraClassDeclaration = [{
     VectorType getLhsType() {
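
For illustration only (this example is not part of the patch), here is how a dpas might look in each mode under the new scheme, assuming a subgroup size of 16 and the op's usual assembly syntax. In SIMT code each work-item carries a 1D fragment: 8 elements of A (8x16 / 16 lanes), 16 elements of B (16x16 / 16 lanes), and 8 elements of the f32 accumulator and result (8x16 / 16 lanes).

    // SIMD (subgroup) form: full 2D matrices, with B VNNI-packed into 3D.
    %d0 = xegpu.dpas %a0, %b0, %c0
          : vector<8x16xf16>, vector<8x16x2xf16>, vector<8x16xf32> -> vector<8x16xf32>

    // SIMT (work-item) form after this change: 1D fragments per lane.
    %d1 = xegpu.dpas %a1, %b1, %c1
          : vector<8xf16>, vector<16xf16>, vector<8xf32> -> vector<8xf32>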

mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td

Lines changed: 2 additions & 1 deletion
@@ -17,7 +17,8 @@ def XeGPU_IntType: AnyTypeOf<[I1, I8, I16, I32, I64, SI1, SI8, SI16, SI32, SI64,
 def XeGPU_FloatType: AnyTypeOf<[F16, F32, F64, BF16, TF32]>;
 def XeGPU_ScalarType: AnyTypeOf<[XeGPU_IntType, XeGPU_FloatType]>;
 def XeGPU_BaseAddrType: AnyTypeOf<[Non0RankedMemRefOf<[XeGPU_ScalarType]>, UI64, UI32, I64, I32]>;
-def XeGPU_DpasOpType: VectorOfRankAndType<[2, 3], [XeGPU_ScalarType]>;
+def XeGPU_DpasOprType: VectorOfRankAndType<[1, 2, 3], [XeGPU_ScalarType]>;
+def XeGPU_DpasResType: VectorOfRankAndType<[1, 2], [XeGPU_ScalarType]>;
 def XeGPU_OffsetType: VectorOfRankAndType<[1], [Index]>;
 def XeGPU_MaskType: AnyTypeOf<[VectorOfRankAndType<[1], [I1]>, I1]>;
 def XeGPU_ValueType: AnyTypeOf<[VectorOfRankAndType<[1,2,3,4], [XeGPU_ScalarType]>, XeGPU_ScalarType]>;
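
Reading the new constraints together (a gloss, not part of the patch): operands may now be rank-1 per-lane SIMT fragments in addition to the rank-2 matrices and rank-3 VNNI-packed form, while the accumulator and result may be rank 1 or rank 2. For example:

    // XeGPU_DpasOprType (rank 1, 2, or 3) admits:
    //   vector<8xf16>, vector<8x16xf16>, vector<8x16x2xf16>
    // XeGPU_DpasResType (rank 1 or 2) admits:
    //   vector<8xf32>, vector<8x16xf32>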

mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp

Lines changed: 12 additions & 14 deletions
@@ -10,6 +10,7 @@
 #include "mlir/IR/Builders.h"
 #include "mlir/IR/DialectImplementation.h"
 #include "llvm/ADT/TypeSwitch.h"
+#include <numeric>
 
 namespace mlir {
 namespace xegpu {
@@ -336,32 +337,30 @@ LogicalResult TensorDescType::verify(
 // [n_distribution_units, lane_data_size]
 FailureOr<VectorType> TensorDescType::getDistributedVectorType() {
   auto layout = llvm::dyn_cast_if_present<LayoutAttr>(getLayout());
-  // If no layout is provided, tensor desc is not used in SIMT mode.
-  if (!layout)
+  // It only works for subgroup level layout, which only has lane_layout
+  // and lane_data, and is to distribute a SIMD code into SIMT code.
+  if (!layout || !layout.isSgLayout())
     return failure();
 
   SmallVector<int64_t> laneData(layout.getLaneData().asArrayRef());
   SmallVector<int64_t> laneLayout(layout.getLaneLayout().asArrayRef());
   auto tdescShape = getShape();
 
-  auto laneDataSize = 1, sgSize = 1;
-  for (auto [laneDim, laneDataDim] : llvm::zip_equal(laneLayout, laneData)) {
-    laneDataSize *= laneDataDim;
-    sgSize *= laneDim;
-  }
+  // compute sgSize by multiply elements of laneLayout
+  // e.g. for 2D layout, sgSize = laneLayout[0] * laneLayout[1]
+  // e.g. for 1D layout, sgSize = laneLayout[0]
+  auto sgSize = std::accumulate(laneLayout.begin(), laneLayout.end(), 1,
+                                std::multiplies<int64_t>());
 
   // Case 1: regular loads/stores
   auto scatterAttr = getEncodingAsScatterTensorDescAttr();
   if (scatterAttr) {
     auto chunkSize = scatterAttr.getChunkSize().getInt();
     // Verify if the first dimension of the tensor descriptor shape is
     // distributable.
-    assert(tdescShape[0] % (laneLayout[0]) == 0 &&
+    assert(tdescShape[0] == laneLayout[0] &&
            "tensor descriptor shape is not distributable");
-    if (chunkSize > 1)
-      return VectorType::get({chunkSize / laneDataSize, laneDataSize},
-                             getElementType());
-    return VectorType::get({laneDataSize}, getElementType());
+    return VectorType::get({chunkSize}, getElementType());
   }
 
   // Case 2: block loads/stores
@@ -376,8 +375,7 @@ FailureOr<VectorType> TensorDescType::getDistributedVectorType() {
   // tensorSize must be adjusted for array_length.
   tensorSize *= getArrayLength();
 
-  return VectorType::get({tensorSize / (sgSize * laneDataSize), laneDataSize},
-                         getElementType());
+  return VectorType::get({tensorSize / sgSize}, getElementType());
 }
 
 } // namespace xegpu
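
A worked example of the new distribution arithmetic (the numbers are mine, not from the patch): for a block tensor_desc of shape 8x16xf16 with lane_layout = [1, 16] and array_length = 1, tensorSize = 8 * 16 = 128 and sgSize = 1 * 16 = 16, so each work-item receives vector<8xf16> where the old code produced the 2D vector<8x1xf16>; in the scattered case with chunk_size = 8, each lane now simply receives vector<8xf16>. In MLIR, assuming this spelling of the layout attribute:

    // Block load: 128 elements / 16 lanes -> vector<8xf16> per work-item.
    %td = xegpu.create_nd_tdesc %src[0, 0] : memref<8x16xf16>
          -> !xegpu.tensor_desc<8x16xf16,
               #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
    %v = xegpu.load_nd %td
         : !xegpu.tensor_desc<8x16xf16,
             #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
         -> vector<8xf16>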
