Skip to content

Commit d8f3703

Browse files
Adding permlanex16 and other DPP-related ops to the MLIR dialects
1 parent b43f593 commit d8f3703

File tree

6 files changed

+59
-10
lines changed

6 files changed

+59
-10
lines changed

mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -524,7 +524,8 @@ def AMDGPU_DPPPerm : I32EnumAttr<"DPPPerm",
524524
I32EnumAttrCase<"row_mirror", 8>,
525525
I32EnumAttrCase<"row_half_mirror", 9>,
526526
I32EnumAttrCase<"row_bcast_15", 10>,
527-
I32EnumAttrCase<"row_bcast_31", 11>
527+
I32EnumAttrCase<"row_bcast_31", 11>,
528+
I32EnumAttrCase<"row_share", 12>
528529
]> {
529530
let genSpecializedAttr = 0;
530531
let cppNamespace = "::mlir::amdgpu";
@@ -557,6 +558,7 @@ def AMDGPU_DPPOp : AMDGPU_Op<"dpp", [SameTypeOperands, AllTypesMatch<["result",
557558
- Reverse within a half-row (`row_half_mirror`)
558559
- Broadcast the 15th lane of each row to the next row (`row_bcast_15`)
559560
- Broadcast lane 31 to rows 2 and 3 (`row_bcast_31`)
561+
- Broadcast the value of a selected lane (0-15) within each row to all lanes of that row (`row_share`)
560562
}];
561563
let results = (outs AnyType:$result);
562564
let assemblyFormat = [{

mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -668,6 +668,22 @@ def ROCDL_DPPUpdateOp : ROCDL_IntrOp<"update.dpp", [], [0],
668668
}];
669669
}
670670

671+
// PermLaneX16 intrinsic operation
672+
def ROCDL_PermlaneX16Op : ROCDL_IntrOp<"permlanex16", [], [0],
673+
[AllTypesMatch<["res", "old", "src0", "src1", "src2"]>], 1, 0, 0,
674+
[4, 5], ["fi", "boundControl"]>,
675+
Arguments<(ins LLVM_Type:$old, LLVM_Type:$src0, LLVM_Type:$src1, LLVM_Type:$src2,
676+
I1Attr:$fi, I1Attr:$boundControl)> {
677+
let results = (outs LLVM_Type:$res);
678+
let assemblyFormat = [{
679+
attr-dict $old `,` $src0 `,` $src1 `,` $src2 `,` $fi `,` $boundControl `:` type($src0)
680+
}];
681+
let description = [{
682+
Performs a `permlanex16` operation with the given operands, applying the
683+
lane selection given by $src1 and $src2; $fi and $boundControl are boolean modifier flags.
684+
}];
685+
}
686+
671687
def ROCDL_V2I16Type : FixedVectorOfLengthAndType<[2], [I16]>,
672688
BuildableType<"::mlir::VectorType::get("
673689
"{2},$_builder.getI16Type())">;

mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1244,6 +1244,7 @@ struct AMDGPUDPPLowering : public ConvertOpToLLVMPattern<DPPOp> {
12441244
ROW_HALF_MIRROR = 0x141,
12451245
BCAST15 = 0x142,
12461246
BCAST31 = 0x143,
1247+
ROW_SHARE0 = 0x150
12471248
};
12481249

12491250
auto kind = DppOp.getKind();
@@ -1301,6 +1302,11 @@ struct AMDGPUDPPLowering : public ConvertOpToLLVMPattern<DPPOp> {
13011302
case DPPPerm::row_bcast_31:
13021303
DppCtrl = DppCtrl::BCAST31;
13031304
break;
1305+
case DPPPerm::row_share:
1306+
if (auto intAttr = cast<IntegerAttr>(*permArgument)) {
1307+
DppCtrl = intAttr.getInt() + DppCtrl::ROW_SHARE0;
1308+
}
1309+
break;
13041310
}
13051311

13061312
// Check for row_mask, bank_mask, bound_ctrl if they exist and create

mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -457,6 +457,20 @@ LogicalResult DPPOp::verify() {
457457
}
458458
break;
459459
}
460+
461+
case DPPPerm::row_share: {
462+
if (!permArgument) {
463+
return emitOpError("Attribute '" + Twine(stringifyDPPPerm(kind)) +
464+
"' value not specified");
465+
}
466+
if (auto intAttr = dyn_cast<IntegerAttr>(permArgument)) {
467+
uint32_t attrValue = intAttr.getInt();
468+
if (attrValue < 0 || attrValue > 15) {
469+
return emitOpError(
470+
"Attribute value for 'row_share' must be between 0 and 15");
471+
}
472+
}
473+
} break;
460474
}
461475
return success();
462476
}

mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp

Lines changed: 12 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -370,7 +370,7 @@ Value createSubgroupDPPReduction(OpBuilder &b, Location loc, Value input,
370370
const ClusterInfo &ci) {
371371
Value result = input;
372372
if (ci.clusterSize >= 2) {
373-
auto permArg = b.getIntegerAttr(b.getIntegerType(32), 1);
373+
auto permArg = b.getI32IntegerAttr(1);
374374
Value dppResult =
375375
b.create<amdgpu::DPPOp>(loc, result.getType(), result, result,
376376
amdgpu::DPPPerm::row_shl, permArg);
@@ -379,7 +379,7 @@ Value createSubgroupDPPReduction(OpBuilder &b, Location loc, Value input,
379379
}
380380

381381
if (ci.clusterSize >= 4) {
382-
auto permArg = b.getIntegerAttr(b.getIntegerType(32), 2);
382+
auto permArg = b.getI32IntegerAttr(2);
383383
Value dppResult =
384384
b.create<amdgpu::DPPOp>(loc, result.getType(), result, result,
385385
amdgpu::DPPPerm::row_shl, permArg);
@@ -405,16 +405,19 @@ Value createSubgroupDPPReduction(OpBuilder &b, Location loc, Value input,
405405

406406
const int allRows = 0xf;
407407
const int allBanks = 0xf;
408-
auto int32Type = IntegerType::get(b.getContext(), 32);
408+
auto uint32Type = b.getIntegerType(32, false);
409409
if (ci.clusterSize >= 32) {
410-
auto permArg = b.getIntegerAttr(b.getIntegerType(32), 15);
411-
Value dppResult = b.create<amdgpu::DPPOp>(
412-
loc, result.getType(), result, result, amdgpu::DPPPerm::row_bcast_15,
413-
b.getUnitAttr(), 0xa, allBanks, false);
410+
// auto permArg = b.getI32IntegerAttr(15);
411+
// Value dppResult = b.create<amdgpu::DPPOp>(
412+
// loc, result.getType(), result, result, amdgpu::DPPPerm::row_bcast_15,
413+
// b.getUnitAttr(), 0xa, allBanks, false);
414+
auto uIntMax = llvm::APInt::getMaxValue(32u);
415+
Value uIntMaxConst = b.create<LLVM::ConstantOp>(loc, uint32Type, uIntMax);
416+
Value dppResult = b.create<ROCDL::PermlaneX16Op>(loc, input.getType(), result, result, uIntMaxConst, uIntMaxConst, true, false);
414417
result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode),
415418
result, dppResult);
416419
if (ci.subgroupSize == 32) {
417-
Value lane01 = b.create<LLVM::ConstantOp>(loc, int32Type, 1);
420+
Value lane01 = b.create<LLVM::ConstantOp>(loc, b.getI32Type(), 1);
418421
result =
419422
b.create<ROCDL::ReadlaneOp>(loc, input.getType(), result, lane01);
420423
}
@@ -427,7 +430,7 @@ Value createSubgroupDPPReduction(OpBuilder &b, Location loc, Value input,
427430
b.getUnitAttr(), allRows, allBanks, false);
428431
result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode),
429432
result, dppResult);
430-
Value lane63 = b.create<LLVM::ConstantOp>(loc, int32Type, 63);
433+
Value lane63 = b.create<LLVM::ConstantOp>(loc, b.getI32Type(), 63);
431434
result = b.create<ROCDL::ReadlaneOp>(loc, input.getType(), result, lane63);
432435
}
433436

mlir/test/Conversion/AMDGPUToROCDL/dpp.mlir

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -137,3 +137,11 @@ func.func @row_bcast_update_dpp_f16(%arg0: f16, %arg1: f16) -> f16 {
137137
%0 = amdgpu.dpp %arg0 %arg1 row_bcast_15 { bound_ctrl = true } : f16
138138
return %0 : f16
139139
}
140+
141+
func.func @dpp_row_share(%arg0: i32, %arg1: i32) -> i32 {
142+
// CHECK-LABEL: func @dpp_row_share
143+
// CHECK: rocdl.update.dpp %arg0, %arg1 with 351, 15, 15, false : i32
144+
// CHECK: return %0 : i32
145+
%0 = amdgpu.dpp %arg0 %arg1 row_share ( 0xf : i32 ) : i32
146+
return %0 : i32
147+
}

0 commit comments

Comments
 (0)