@@ -35,31 +35,26 @@ namespace amdgpu {
 using namespace mlir;
 using namespace mlir::amdgpu;
 
-/// The size of a shared memory line according to AMD documentation.
-/// https://www.amd.com/content/dam/amd/en/documents/instinct-tech-docs/instruction-set-architectures/instinct-mi200-cdna2-instruction-set-architecture.pdf
-constexpr int64_t kSharedMemoryLineSizeBytes = 64;
-/// We optimize for 64bit accesses, but this can be made an argument in the
-/// future.
-constexpr int64_t kDefaultVectorSizeBits = 64;
-
 /// Uses `srcIndexValue` to permute `tgtIndexValue` via
 ///   `result = xor(floordiv(srcIdxVal,permuteEveryN),
 ///                 floordiv(tgtIdxVal,vectorSize)))
 ///            + tgtIdxVal % vectorSize`
 /// This is done using an optimized sequence of `arith` operations.
 static Value permuteVectorOffset(OpBuilder &b, Location loc,
                                  ArrayRef<Value> indices, MemRefType memrefTy,
-                                 int64_t srcDim, int64_t tgtDim) {
+                                 int64_t srcDim, int64_t tgtDim,
+                                 int64_t sharedMemoryLineSizeBytes,
+                                 int64_t defaultVectorSizeBits) {
   // Adjust the src index to change how often the permutation changes
   // if necessary.
   Value src = indices[srcDim];
 
   // We only want to permute every N iterations of the target dim where N is
   // ceil(sharedMemoryLineSizeBytes / dimSizeBytes(tgtDim)).
   const int64_t permuteEveryN = std::max<int64_t>(
-      1, kSharedMemoryLineSizeBytes / ((memrefTy.getDimSize(tgtDim) *
-                                        memrefTy.getElementTypeBitWidth()) /
-                                       8));
+      1, sharedMemoryLineSizeBytes / ((memrefTy.getDimSize(tgtDim) *
+                                       memrefTy.getElementTypeBitWidth()) /
+                                      8));
 
   // clang-format off
   // Index bit representation (b0 = least significant bit) for dim(1)
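
The doc comment above compresses the whole swizzle into one formula. As a reading aid, here is a minimal scalar sketch of the same computation on plain integers; `vectorSizeElems` and `vectorsPerRow` are names introduced only for this illustration, and the pass itself materializes the equivalent logic as the sequence of `arith` ops built below rather than calling a helper like this.

#include <cstdint>

// Scalar model of the swizzle (illustrative only). vectorSizeElems is
// defaultVectorSizeBits / elementBitWidth; vectorsPerRow is
// dimSize(tgtDim) / vectorSizeElems.
int64_t permutedOffset(int64_t srcIdx, int64_t tgtIdx, int64_t permuteEveryN,
                       int64_t vectorSizeElems, int64_t vectorsPerRow) {
  int64_t group = (srcIdx / permuteEveryN) % vectorsPerRow; // floordiv(srcIdxVal, permuteEveryN)
  int64_t vecIdx = tgtIdx / vectorSizeElems;                // floordiv(tgtIdxVal, vectorSize)
  int64_t lane = tgtIdx % vectorSizeElems;                  // tgtIdxVal % vectorSize
  return (group ^ vecIdx) * vectorSizeElems + lane;         // XOR the vector index, keep the lane
}
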
@@ -71,7 +66,7 @@ static Value permuteVectorOffset(OpBuilder &b, Location loc,
   //   bits[N:M] = vector index
   // clang-format on
   int64_t n =
-      llvm::Log2_64(kDefaultVectorSizeBits / memrefTy.getElementTypeBitWidth());
+      llvm::Log2_64(defaultVectorSizeBits / memrefTy.getElementTypeBitWidth());
   int64_t m = llvm::Log2_64(memrefTy.getDimSize(tgtDim));
 
   // Capture bits[0:(M-N)] of src by first creating a (M-N) mask.
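
As a concrete, made-up data point for the two shift amounts: for a 64x64 f16 workgroup-memory buffer with defaultVectorSizeBits = 64, n = log2(64 / 16) = 2 and m = log2(64) = 6, so the permutation operates on bits [2:6) of the fastest-varying index (the 4-element vector index) and leaves the sub-vector element bits [0:2) untouched.
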
@@ -105,9 +100,11 @@ static Value permuteVectorOffset(OpBuilder &b, Location loc,
 static void transformIndices(OpBuilder &builder, Location loc,
                              SmallVector<Value, 4> &indices,
                              MemRefType memrefTy, int64_t srcDim,
-                             int64_t tgtDim) {
+                             int64_t tgtDim, int64_t sharedMemoryLineSizeBytes,
+                             int64_t defaultVectorSizeBits) {
   indices[tgtDim] =
-      permuteVectorOffset(builder, loc, indices, memrefTy, srcDim, tgtDim);
+      permuteVectorOffset(builder, loc, indices, memrefTy, srcDim, tgtDim,
+                          sharedMemoryLineSizeBytes, defaultVectorSizeBits);
 }
 
 // Return all operations within `parentOp` that read from or write to
@@ -149,8 +146,9 @@ getShmReadAndWriteOps(Operation *parentOp, Value shmMemRef,
   return success();
 }
 
-LogicalResult amdgpu::optimizeSharedMemoryReadsAndWrites(Operation *parentOp,
-                                                         Value memrefValue) {
+LogicalResult amdgpu::optimizeSharedMemoryReadsAndWrites(
+    Operation *parentOp, Value memrefValue, int64_t sharedMemoryLineSizeBytes,
+    int64_t defaultVectorSizeBits) {
   auto memRefType = dyn_cast<MemRefType>(memrefValue.getType());
   if (!memRefType ||
       !amdgpu::AMDGPUDialect::hasSharedMemoryAddressSpace(memRefType))
@@ -167,10 +165,10 @@ LogicalResult amdgpu::optimizeSharedMemoryReadsAndWrites(Operation *parentOp,
   // If dim[rank-1] is small enough to fit 8 rows in a 128B line.
   const int64_t rowSize = memRefType.getDimSize(memRefType.getRank() - 1);
   const int64_t rowsPerLine =
-      (8 * kSharedMemoryLineSizeBytes / memRefType.getElementTypeBitWidth()) /
+      (8 * sharedMemoryLineSizeBytes / memRefType.getElementTypeBitWidth()) /
       rowSize;
   const int64_t threadGroupSize =
-      1LL << (7 - llvm::Log2_64(kDefaultVectorSizeBits / 8));
+      1LL << (7 - llvm::Log2_64(defaultVectorSizeBits / 8));
   if (rowsPerLine >= threadGroupSize)
     return failure();
 
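
For intuition on this bail-out, plug in the previously hard-coded defaults (64-byte lines, 64-bit vectors) and an f16 element type: threadGroupSize = 1 << (7 - log2(64 / 8)) = 16 and rowsPerLine = (8 * 64 / 16) / rowSize = 32 / rowSize, so the rewrite is skipped whenever rowSize <= 2, i.e. when the innermost dimension is so narrow that many rows already share a 128B line and the swizzle has nothing to gain.
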
@@ -198,7 +196,8 @@ LogicalResult amdgpu::optimizeSharedMemoryReadsAndWrites(Operation *parentOp,
     auto indices = amdgpu::getIndices(shmWriteOp);
     SmallVector<Value, 4> transformedIndices(indices->begin(), indices->end());
     transformIndices(builder, shmWriteOp->getLoc(), transformedIndices,
-                     memRefType, srcDim, tgtDim);
+                     memRefType, srcDim, tgtDim, sharedMemoryLineSizeBytes,
+                     defaultVectorSizeBits);
     amdgpu::setIndices(shmWriteOp, transformedIndices);
   }
 
@@ -210,24 +209,28 @@ LogicalResult amdgpu::optimizeSharedMemoryReadsAndWrites(Operation *parentOp,
     auto indices = amdgpu::getIndices(shmReadOp);
     SmallVector<Value, 4> transformedIndices(indices->begin(), indices->end());
     transformIndices(builder, shmReadOp->getLoc(), transformedIndices,
-                     memRefType, srcDim, tgtDim);
+                     memRefType, srcDim, tgtDim, sharedMemoryLineSizeBytes,
+                     defaultVectorSizeBits);
     amdgpu::setIndices(shmReadOp, transformedIndices);
   }
 
   return success();
 }
 
 std::optional<LogicalResult>
-amdgpu::optimizeSharedMemoryReadsAndWritesOp(func::FuncOp funcOp) {
+amdgpu::optimizeSharedMemoryReadsAndWritesOp(func::FuncOp funcOp,
+                                             int64_t sharedMemoryLineSizeBytes,
+                                             int64_t defaultVectorSizeBits) {
   SmallVector<memref::AllocOp> shmAllocOps;
   funcOp.walk([&](memref::AllocOp allocOp) {
     if (!amdgpu::AMDGPUDialect::hasSharedMemoryAddressSpace(allocOp.getType()))
       return;
     shmAllocOps.push_back(allocOp);
   });
   for (auto allocOp : shmAllocOps) {
-    if (failed(amdgpu::optimizeSharedMemoryReadsAndWrites(funcOp,
-                                                          allocOp.getMemref())))
+    if (failed(amdgpu::optimizeSharedMemoryReadsAndWrites(
+            funcOp, allocOp.getMemref(), sharedMemoryLineSizeBytes,
+            defaultVectorSizeBits)))
       return failure();
   }
   return success();
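
For callers outside the pass, the reshaped entry point now takes the two sizes explicitly. A hypothetical call site might look like the following sketch; the include paths follow the existing AMDGPU transforms headers, while the helper name, the remark text, and the 128/128 values are made up for illustration.

#include "mlir/Dialect/AMDGPU/Transforms/Transforms.h"
#include "mlir/Dialect/Func/IR/FuncOps.h"
#include <optional>

// Run the shared-memory swizzle on one function with explicit sizes.
static void swizzleSharedMemory(mlir::func::FuncOp funcOp) {
  std::optional<mlir::LogicalResult> result =
      mlir::amdgpu::optimizeSharedMemoryReadsAndWritesOp(
          funcOp, /*sharedMemoryLineSizeBytes=*/128,
          /*defaultVectorSizeBits=*/128);
  if (result && mlir::failed(*result))
    funcOp.emitRemark("shared-memory swizzling was not applied");
}
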
@@ -237,7 +240,8 @@ struct OptimizeSharedMemoryPass
     : public amdgpu::impl::OptimizeSharedMemoryBase<OptimizeSharedMemoryPass> {
 public:
   OptimizeSharedMemoryPass() = default;
-
+  OptimizeSharedMemoryPass(const OptimizeSharedMemoryOptions &options)
+      : OptimizeSharedMemoryBase(options) {}
   void runOnOperation() override {
     Operation *op = getOperation();
     SmallVector<memref::AllocOp> shmAllocOps;
@@ -248,8 +252,9 @@ struct OptimizeSharedMemoryPass
       shmAllocOps.push_back(allocOp);
     });
     for (auto allocOp : shmAllocOps) {
-      if (failed(optimizeSharedMemoryReadsAndWrites(getOperation(),
-                                                    allocOp.getMemref())))
+      if (failed(optimizeSharedMemoryReadsAndWrites(op, allocOp.getMemref(),
+                                                    sharedMemoryLineSizeBytes,
+                                                    defaultVectorSizeBits)))
         return;
     }
   }
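
On the pass side, the new constructor lets the generated options struct be threaded into a pipeline. A sketch under the usual MLIR tablegen conventions — the factory name `createOptimizeSharedMemoryPass` and the option field spellings are assumptions, not taken from this diff, and the 128/128 values are purely illustrative:

#include "mlir/Dialect/AMDGPU/Transforms/Passes.h"
#include "mlir/Pass/PassManager.h"

// Add the pass with non-default line/vector sizes.
static void addAmdgpuSwizzlePass(mlir::PassManager &pm) {
  mlir::amdgpu::OptimizeSharedMemoryOptions options;
  options.sharedMemoryLineSizeBytes = 128;
  options.defaultVectorSizeBits = 128;
  pm.addPass(mlir::amdgpu::createOptimizeSharedMemoryPass(options));
}
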