@@ -98,10 +98,6 @@ Value createSubgroupDPPReduction(OpBuilder &b, Location loc, Value input,
   }
 
   if (ci.clusterSize >= 32) {
-    // auto permArg = builder.getInt32(15);
-    // auto rowMask = builder.getInt32("0xa");
-    // auto bankMask = builder.getInt32("0xf");
-    // auto boundCtrl = builder.getBoolAttr(false);
     auto permArg = b.getIntegerAttr(b.getIntegerType(32), 15);
     Value dppResult = b.create<amdgpu::DPPOp>(
         loc, result.getType(), result, result, amdgpu::DPPPerm::row_bcast_15,
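Note: row_bcast_15 broadcasts the partial sum held in lane 15 of each 16-lane half-row into the opposite half-row, which is the step that widens the reduction from 16 lanes to 32. The deleted comments above record the intended DPP operands (row mask 0xa, bank mask 0xf, bound_ctrl false); a hedged sketch of how the create<amdgpu::DPPOp> call plausibly continues past the end of this hunk, where the operand order is an assumption and not something visible in the diff:

    Value dppResult = b.create<amdgpu::DPPOp>(
        loc, result.getType(), result, result, amdgpu::DPPPerm::row_bcast_15,
        permArg, /*row_mask=*/0xa, /*bank_mask=*/0xf, /*bound_ctrl=*/false);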
@@ -111,10 +107,6 @@ Value createSubgroupDPPReduction(OpBuilder &b, Location loc, Value input,
   }
 
   if (ci.clusterSize == 64) {
-    // auto permArg = builder.getInt32(31);
-    // auto rowMask = builder.getInt32("0xc");
-    // auto bankMask = builder.getInt32("0xf");
-    // auto boundCtrl = builder.getBoolAttr(false);
     auto permArg = b.getIntegerAttr(b.getIntegerType(32), 31);
     Value dppResult = b.create<amdgpu::DPPOp>(
         loc, result.getType(), result, result, amdgpu::DPPPerm::row_bcast_31,
@@ -123,9 +115,9 @@ Value createSubgroupDPPReduction(OpBuilder &b, Location loc, Value input,
         result, dppResult);
   }
 
-  // // read lane 63 with the final result.
-  // auto lane = b.getIntegerAttr(b.getIntegerType(32), 63);
-  // result = b.create<ROCDL::ReadLaneOp>(loc, input.getType(), result, lane);
+  auto int32Type = IntegerType::get(b.getContext(), 32);
+  Value lane63 = b.create<LLVM::ConstantOp>(loc, int32Type, 63);
+  result = b.create<ROCDL::ReadlaneOp>(loc, input.getType(), result, lane63);
   assert(result.getType() == input.getType());
   return result;
 }
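Note: after the final row_bcast_31 combine, only lane 63 is guaranteed to hold the fully reduced value, so the function finishes by reading it back from that lane. ROCDL::ReadlaneOp takes its lane index as an SSA operand rather than an attribute, which is why the commented-out getIntegerAttr attempt is replaced by materializing the constant with LLVM::ConstantOp. A minimal sketch of the contrast, assuming an OpBuilder b and Location loc in scope:

    // Dropped approach: ReadlaneOp expects a Value operand, so an
    // IntegerAttr cannot be passed directly as the lane index.
    auto laneAttr = b.getIntegerAttr(b.getIntegerType(32), 63);
    // Working approach: materialize the lane index as an SSA constant first,
    // then read the reduced value back from lane 63.
    Value lane63 = b.create<LLVM::ConstantOp>(loc, b.getIntegerType(32), 63);
    Value out = b.create<ROCDL::ReadlaneOp>(loc, result.getType(), result, lane63);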
@@ -170,16 +162,16 @@ struct ConvertGPUToAMDGPUPass
   void runOnOperation() override {
     RewritePatternSet patterns(&getContext());
     int subgroupSizeInt = static_cast<int>(subgroupSize);
-    populateSubgroupReduceLoweringPatterns(patterns, subgroupSizeInt,
+    populateAMDGPUOptimizedSubgroupReducePatterns(patterns, subgroupSizeInt,
                                            PatternBenefit(1));
     walkAndApplyPatterns(getOperation(), std::move(patterns));
   }
 };
 } // namespace
 
-void mlir::populateSubgroupReduceLoweringPatterns(RewritePatternSet &patterns,
+void mlir::populateAMDGPUOptimizedSubgroupReducePatterns(RewritePatternSet &patterns,
                                                   unsigned subgroupSize,
                                                   PatternBenefit benefit) {
   patterns.add<ScalarSubgroupReduceToShuffles>(
       patterns.getContext(), subgroupSize, /*matchClustered=*/true, benefit);
-}
+}
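Note: since this hunk renames the public entry point, out-of-tree callers need to update their call sites. A minimal usage sketch mirroring the runOnOperation above; the subgroup size of 64 is an illustrative value, not something this diff fixes:

    RewritePatternSet patterns(&getContext());
    mlir::populateAMDGPUOptimizedSubgroupReducePatterns(
        patterns, /*subgroupSize=*/64, PatternBenefit(1));
    walkAndApplyPatterns(getOperation(), std::move(patterns));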