Adding permlanex16 and other dpp related ops to mlir dialect

Muzammiluddin-Syed-ECE · Muzammiluddin-Syed-ECE · commit d8f370366752 · 2025-04-10T14:06:51.000-05:00
diff --git a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
@@ -524,7 +524,8 @@ def AMDGPU_DPPPerm : I32EnumAttr<"DPPPerm",
       I32EnumAttrCase<"row_mirror", 8>,
       I32EnumAttrCase<"row_half_mirror", 9>,
       I32EnumAttrCase<"row_bcast_15", 10>,
-      I32EnumAttrCase<"row_bcast_31", 11>
+      I32EnumAttrCase<"row_bcast_31", 11>,
+      I32EnumAttrCase<"row_share", 12>
     ]> {
   let genSpecializedAttr = 0;
   let cppNamespace = "::mlir::amdgpu";
@@ -557,6 +558,7 @@ def AMDGPU_DPPOp : AMDGPU_Op<"dpp", [SameTypeOperands, AllTypesMatch<["result",
     - Reverse within a half-row (`row_half_mirror`)
     - Broadcast the 15th lane of each row to the next row (`row_bcast`)
     - Broadcast lane 31 to rows 2 and 3 (`row_bcast`)
+    - Broadcast a chosen lane [0-15] of each row to all lanes of that row (`row_share`)
   }];
   let results = (outs AnyType:$result);
   let assemblyFormat = [{
diff --git a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td
@@ -668,6 +668,22 @@ def ROCDL_DPPUpdateOp : ROCDL_IntrOp<"update.dpp", [], [0],
   }];
 }
 
+// PermLaneX16 intrinsic; `src1`/`src2` are i32 lane selects, not data.
+def ROCDL_PermlaneX16Op : ROCDL_IntrOp<"permlanex16", [], [0],
+    [AllTypesMatch<["res", "old", "src0"]>], 1, 0, 0,
+    [4, 5], ["fi", "boundControl"]>,
+  Arguments<(ins LLVM_Type:$old, LLVM_Type:$src0, I32:$src1, I32:$src2,
+             I1Attr:$fi, I1Attr:$boundControl)> {
+  let results = (outs LLVM_Type:$res);
+  let assemblyFormat = [{
+    attr-dict $old `,` $src0 `,` $src1 `,` $src2 `,` $fi `,` $boundControl `:` type($src0)
+  }];
+  let description = [{
+    Performs a `permlanex16` operation: `src0` is permuted across rows using the
+    i32 lane selects `src1`/`src2`; `fi` and `boundControl` are i1 modifiers.
+  }];
+}
+
 def ROCDL_V2I16Type : FixedVectorOfLengthAndType<[2], [I16]>,
                         BuildableType<"::mlir::VectorType::get("
                           "{2},$_builder.getI16Type())">;
diff --git a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
@@ -1244,6 +1244,7 @@ struct AMDGPUDPPLowering : public ConvertOpToLLVMPattern<DPPOp> {
       ROW_HALF_MIRROR = 0x141,
       BCAST15 = 0x142,
       BCAST31 = 0x143,
+      ROW_SHARE0 = 0x150
     };
 
     auto kind = DppOp.getKind();
@@ -1301,6 +1302,11 @@ struct AMDGPUDPPLowering : public ConvertOpToLLVMPattern<DPPOp> {
     case DPPPerm::row_bcast_31:
       DppCtrl = DppCtrl::BCAST31;
       break;
+    case DPPPerm::row_share: // dpp_ctrl = ROW_SHARE0 (0x150) + perm argument
+      if (auto intAttr = cast<IntegerAttr>(*permArgument)) { // cast<> asserts on a non-integer attr
+        DppCtrl = intAttr.getInt() + DppCtrl::ROW_SHARE0;
+      }
+      break;
     }
 
     // Check for row_mask, bank_mask, bound_ctrl if they exist and create
diff --git a/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp b/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp
@@ -457,6 +457,20 @@ LogicalResult DPPOp::verify() {
     }
     break;
   }
+
+  case DPPPerm::row_share: {
+    if (!permArgument) {
+      return emitOpError("Attribute '" + Twine(stringifyDPPPerm(kind)) +
+                         "' value not specified");
+    }
+    // Reject non-integer arguments too: the lowering casts to IntegerAttr.
+    auto intAttr = dyn_cast<IntegerAttr>(permArgument);
+    // Compare the signed value so the lower bound is a real check.
+    if (!intAttr || intAttr.getInt() < 0 || intAttr.getInt() > 15) {
+      return emitOpError(
+          "Attribute value for 'row_share' must be between 0 and 15");
+    }
+  } break;
   }
   return success();
 }
diff --git a/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp b/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp
@@ -370,7 +370,7 @@ Value createSubgroupDPPReduction(OpBuilder &b, Location loc, Value input,
                                  const ClusterInfo &ci) {
   Value result = input;
   if (ci.clusterSize >= 2) {
-    auto permArg = b.getIntegerAttr(b.getIntegerType(32), 1);
+    auto permArg = b.getI32IntegerAttr(1);
     Value dppResult =
         b.create<amdgpu::DPPOp>(loc, result.getType(), result, result,
                                 amdgpu::DPPPerm::row_shl, permArg);
@@ -379,7 +379,7 @@ Value createSubgroupDPPReduction(OpBuilder &b, Location loc, Value input,
   }
 
   if (ci.clusterSize >= 4) {
-    auto permArg = b.getIntegerAttr(b.getIntegerType(32), 2);
+    auto permArg = b.getI32IntegerAttr(2);
     Value dppResult =
         b.create<amdgpu::DPPOp>(loc, result.getType(), result, result,
                                 amdgpu::DPPPerm::row_shl, permArg);
@@ -405,16 +405,19 @@ Value createSubgroupDPPReduction(OpBuilder &b, Location loc, Value input,
 
   const int allRows = 0xf;
   const int allBanks = 0xf;
-  auto int32Type = IntegerType::get(b.getContext(), 32);
   if (ci.clusterSize >= 32) {
-    auto permArg = b.getIntegerAttr(b.getIntegerType(32), 15);
-    Value dppResult = b.create<amdgpu::DPPOp>(
-        loc, result.getType(), result, result, amdgpu::DPPPerm::row_bcast_15,
-        b.getUnitAttr(), 0xa, allBanks, false);
+    // Use permlanex16 for this step (previously a row_bcast_15 DPP op).
+    // The lane selects are all-ones i32 constants; they must be signless
+    // because the LLVM dialect does not accept ui32.
+    auto allOnes = llvm::APInt::getMaxValue(32u);
+    Value laneSel = b.create<LLVM::ConstantOp>(loc, b.getI32Type(), allOnes);
+    Value dppResult = b.create<ROCDL::PermlaneX16Op>(
+        loc, input.getType(), result, result, laneSel, laneSel,
+        /*fi=*/true, /*boundControl=*/false);
     result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode),
                                         result, dppResult);
     if (ci.subgroupSize == 32) {
-      Value lane01 = b.create<LLVM::ConstantOp>(loc, int32Type, 1);
+      Value lane01 = b.create<LLVM::ConstantOp>(loc, b.getI32Type(), 1);
       result =
           b.create<ROCDL::ReadlaneOp>(loc, input.getType(), result, lane01);
     }
@@ -427,7 +430,7 @@ Value createSubgroupDPPReduction(OpBuilder &b, Location loc, Value input,
         b.getUnitAttr(), allRows, allBanks, false);
     result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode),
                                         result, dppResult);
-    Value lane63 = b.create<LLVM::ConstantOp>(loc, int32Type, 63);
+    Value lane63 = b.create<LLVM::ConstantOp>(loc, b.getI32Type(), 63);
     result = b.create<ROCDL::ReadlaneOp>(loc, input.getType(), result, lane63);
   }
 
diff --git a/mlir/test/Conversion/AMDGPUToROCDL/dpp.mlir b/mlir/test/Conversion/AMDGPUToROCDL/dpp.mlir
@@ -137,3 +137,12 @@ func.func @row_bcast_update_dpp_f16(%arg0: f16, %arg1: f16) -> f16 {
   %0 = amdgpu.dpp %arg0 %arg1 row_bcast_15 { bound_ctrl = true } : f16
     return %0 : f16
 }
+
+// row_share with argument 0xf lowers to dpp_ctrl 0x150 + 0xf = 0x15f = 351.
+func.func @dpp_row_share(%arg0: i32, %arg1: i32) -> i32 {
+  // CHECK-LABEL: func @dpp_row_share
+  // CHECK: rocdl.update.dpp %arg0, %arg1 with 351, 15, 15, false : i32
+  // CHECK: return %0 : i32
+  %0 = amdgpu.dpp %arg0 %arg1 row_share ( 0xf : i32 ) : i32
+  return %0 : i32
+}

Original file line number	Diff line number	Diff line change
`@@ -457,6 +457,20 @@ LogicalResult DPPOp::verify() {`
`457`	`457`	`}`
`458`	`458`	`break;`
`459`	`459`	`}`
	`460`	`+`
	`461`	`+ case DPPPerm::row_share: {`
	`462`	`+ if (!permArgument) {`
	`463`	`+ return emitOpError("Attribute '" + Twine(stringifyDPPPerm(kind)) +`
	`464`	`+ "' value not specified");`
	`465`	`+ }`
	`466`	`+ if (auto intAttr = dyn_cast<IntegerAttr>(permArgument)) {`
	`467`	`+ uint32_t attrValue = intAttr.getInt();`
	`468`	`+ if (attrValue < 0 \|\| attrValue > 15) {`
	`469`	`+ return emitOpError(`
	`470`	`+ "Attribute value for 'row_share' must be between 0 and 15");`
	`471`	`+ }`
	`472`	`+ }`
	`473`	`+ } break;`
`460`	`474`	`}`
`461`	`475`	`return success();`
`462`	`476`	`}`