Skip to content

Commit 5b33cff

Browse files
authored
[mlir][gpu] Add Support for Cluster of Thread Blocks in gpu.launch (#76924)
1 parent ab073cb commit 5b33cff

File tree

6 files changed

+219
-22
lines changed

6 files changed

+219
-22
lines changed

mlir/include/mlir/Dialect/GPU/IR/GPUOps.td

Lines changed: 48 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -678,6 +678,9 @@ def GPU_LaunchOp : GPU_Op<"launch", [
678678
Arguments<(ins Variadic<GPU_AsyncToken>:$asyncDependencies,
679679
Index:$gridSizeX, Index:$gridSizeY, Index:$gridSizeZ,
680680
Index:$blockSizeX, Index:$blockSizeY, Index:$blockSizeZ,
681+
Optional<Index>:$clusterSizeX,
682+
Optional<Index>:$clusterSizeY,
683+
Optional<Index>:$clusterSizeZ,
681684
Optional<I32>:$dynamicSharedMemorySize)>,
682685
Results<(outs Optional<GPU_AsyncToken>:$asyncToken)> {
683686
let summary = "GPU kernel launch operation";
@@ -700,8 +703,11 @@ def GPU_LaunchOp : GPU_Op<"launch", [
700703
to the amount of dynamic shared memory a kernel's workgroup should be
701704
allocated; when this operand is not present, a zero size is assumed.
702705

703-
The body region has at least _twelve_ arguments, grouped as follows:
706+
The body region has at least _twelve_ arguments, or _eighteen_ if cluster
707+
dimensions are present, grouped as follows:
704708

709+
- three optional arguments that contain cluster identifiers along x,y,z
710+
dimensions;
705711
- three arguments that contain block identifiers along x,y,z dimensions;
706712
- three arguments that contain thread identifiers along x,y,z dimensions;
707713
- operands of the `gpu.launch` operation as is (i.e. the operands for
@@ -713,6 +719,7 @@ def GPU_LaunchOp : GPU_Op<"launch", [
713719

714720
```
715721
operation ::= `gpu.launch` (`async` (`[` ssa-id-list `]`)? )?
722+
( `clusters` `(` ssa-id-list `)` `in` ssa-reassignment )?
716723
`blocks` `(` ssa-id-list `)` `in` ssa-reassignment
717724
`threads` `(` ssa-id-list `)` `in` ssa-reassignment
718725
(dynamic_shared_memory_size ssa-use)?
@@ -763,6 +770,16 @@ def GPU_LaunchOp : GPU_Op<"launch", [
763770
// Assuming %val1 is defined outside the gpu.launch region.
764771
%42 = load %workgroup[%bx] : memref<32xf32, 3>
765772
}
773+
774+
// Launch with clusters.
775+
gpu.launch clusters(%cx, %cy, %cz) in (%sz_cx = %0, %sz_cy = %1, %sz_cz = %2)
776+
blocks(%bx, %by, %bz) in (%sz_bx = %3, %sz_by = %4, %sz_bz = %5)
777+
threads(%tx, %ty, %tz) in (%sz_tx = %6, %sz_ty = %7, %sz_tz = %8)
778+
{
779+
// Cluster, block and thread identifiers, as well as cluster/block/grid
780+
// sizes are immediately usable inside body region.
781+
"some_op"(%cx, %bx, %tx) : (index, index, index) -> ()
782+
}
766783
```
767784

768785
Rationale: using operation/block arguments gives analyses a clear way of
@@ -784,25 +801,35 @@ def GPU_LaunchOp : GPU_Op<"launch", [
784801
CArg<"Type", "nullptr">:$asyncTokenType,
785802
CArg<"ValueRange", "{}">:$asyncDependencies,
786803
CArg<"TypeRange", "{}">:$workgroupAttributions,
787-
CArg<"TypeRange", "{}">:$privateAttributions)>
804+
CArg<"TypeRange", "{}">:$privateAttributions,
805+
CArg<"Value", "nullptr">:$clusterSizeX,
806+
CArg<"Value", "nullptr">:$clusterSizeY,
807+
CArg<"Value", "nullptr">:$clusterSizeZ)>
788808
];
789809

790810
let extraClassDeclaration = [{
791811
/// Get the SSA values corresponding to kernel block identifiers.
792812
KernelDim3 getBlockIds();
793813
/// Get the SSA values corresponding to kernel thread identifiers.
794814
KernelDim3 getThreadIds();
815+
/// Get the SSA values corresponding to kernel cluster identifiers.
816+
std::optional<KernelDim3> getClusterIds();
795817
/// Get the SSA values corresponding to kernel grid size.
796818
KernelDim3 getGridSize();
797819
/// Get the SSA values corresponding to kernel block size.
798820
KernelDim3 getBlockSize();
821+
/// Get the SSA values corresponding to kernel cluster size.
822+
std::optional<KernelDim3> getClusterSize();
799823

800824
/// Get the SSA values passed as operands to specify the grid size.
801825
KernelDim3 getGridSizeOperandValues();
802826
/// Get the SSA values passed as operands to specify the block size.
803827
KernelDim3 getBlockSizeOperandValues();
828+
/// Get the SSA values passed as operands to specify the cluster size.
829+
std::optional<KernelDim3> getClusterSizeOperandValues();
804830

805831
static StringRef getBlocksKeyword() { return "blocks"; }
832+
static StringRef getClustersKeyword() { return "clusters"; }
806833
static StringRef getThreadsKeyword() { return "threads"; }
807834
static StringRef getDynamicSharedMemorySizeKeyword() {
808835
return "dynamic_shared_memory_size";
@@ -816,6 +843,21 @@ def GPU_LaunchOp : GPU_Op<"launch", [
816843
/// placed in the leading positions of the argument list.
817844
static constexpr unsigned kNumConfigRegionAttributes = 12;
818845

846+
/// Returns true if cluster size is specified.
847+
bool hasClusterSize() {
848+
if (getClusterSizeX() && getClusterSizeY() && getClusterSizeZ())
849+
return true;
850+
return false;
851+
}
852+
/// Returns the number of operands including cluster size
853+
unsigned getNumConfigOperands() {
854+
return kNumConfigOperands + (hasClusterSize() ? 3 : 0);
855+
}
856+
/// Returns the number of region attributes including cluster size
857+
unsigned getNumConfigRegionAttributes() {
858+
return kNumConfigRegionAttributes + (hasClusterSize() ? 6 : 0);
859+
}
860+
819861
/// Returns the keywords used in the custom syntax for this Op.
820862
static StringRef getWorkgroupKeyword() { return "workgroup"; }
821863
static StringRef getPrivateKeyword() { return "private"; }
@@ -831,7 +873,7 @@ def GPU_LaunchOp : GPU_Op<"launch", [
831873
/// the workgroup memory
832874
ArrayRef<BlockArgument> getWorkgroupAttributions() {
833875
auto begin =
834-
std::next(getBody().args_begin(), kNumConfigRegionAttributes);
876+
std::next(getBody().args_begin(), getNumConfigRegionAttributes());
835877
auto end = std::next(begin, getNumWorkgroupAttributions());
836878
return {begin, end};
837879
}
@@ -842,7 +884,7 @@ def GPU_LaunchOp : GPU_Op<"launch", [
842884

843885
/// Returns the number of buffers located in the private memory.
844886
unsigned getNumPrivateAttributions() {
845-
return getBody().getNumArguments() - kNumConfigRegionAttributes -
887+
return getBody().getNumArguments() - getNumConfigRegionAttributes() -
846888
getNumWorkgroupAttributions();
847889
}
848890

@@ -853,7 +895,7 @@ def GPU_LaunchOp : GPU_Op<"launch", [
853895
// memory.
854896
auto begin =
855897
std::next(getBody().args_begin(),
856-
kNumConfigRegionAttributes + getNumWorkgroupAttributions());
898+
getNumConfigRegionAttributes() + getNumWorkgroupAttributions());
857899
return {begin, getBody().args_end()};
858900
}
859901

@@ -871,6 +913,7 @@ def GPU_LaunchOp : GPU_Op<"launch", [
871913
let hasCanonicalizer = 1;
872914
let hasCustomAssemblyFormat = 1;
873915
let hasRegionVerifier = 1;
916+
let hasVerifier = 1;
874917
}
875918

876919
def GPU_PrintfOp : GPU_Op<"printf", [MemoryEffects<[MemWrite]>]>,

mlir/lib/Dialect/GPU/IR/GPUDialect.cpp

Lines changed: 79 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -646,7 +646,8 @@ void LaunchOp::build(OpBuilder &builder, OperationState &result,
646646
Value getBlockSizeZ, Value dynamicSharedMemorySize,
647647
Type asyncTokenType, ValueRange asyncDependencies,
648648
TypeRange workgroupAttributions,
649-
TypeRange privateAttributions) {
649+
TypeRange privateAttributions, Value clusterSizeX,
650+
Value clusterSizeY, Value clusterSizeZ) {
650651
// Add a WorkGroup attribution attribute. This attribute is required to
651652
// identify private attributions in the list of block arguments.
652653
result.addAttribute(getNumWorkgroupAttributionsAttrName(),
@@ -660,6 +661,12 @@ void LaunchOp::build(OpBuilder &builder, OperationState &result,
660661
// Add grid and block sizes as op operands, followed by the data operands.
661662
result.addOperands({gridSizeX, gridSizeY, gridSizeZ, getBlockSizeX,
662663
getBlockSizeY, getBlockSizeZ});
664+
if (clusterSizeX)
665+
result.addOperands(clusterSizeX);
666+
if (clusterSizeY)
667+
result.addOperands(clusterSizeY);
668+
if (clusterSizeZ)
669+
result.addOperands(clusterSizeZ);
663670
if (dynamicSharedMemorySize)
664671
result.addOperands(dynamicSharedMemorySize);
665672

@@ -678,9 +685,12 @@ void LaunchOp::build(OpBuilder &builder, OperationState &result,
678685
body->addArgument(argTy, result.location);
679686
kernelRegion->push_back(body);
680687
// Fill OperandSegmentSize Attribute.
681-
SmallVector<int32_t, 8> segmentSizes(8, 1);
688+
SmallVector<int32_t, 11> segmentSizes(11, 1);
682689
segmentSizes.front() = asyncDependencies.size();
683690
segmentSizes.back() = dynamicSharedMemorySize ? 1 : 0;
691+
segmentSizes[7] = clusterSizeX ? 1 : 0;
692+
segmentSizes[8] = clusterSizeY ? 1 : 0;
693+
segmentSizes[9] = clusterSizeZ ? 1 : 0;
684694
result.addAttribute(getOperandSegmentSizeAttr(),
685695
builder.getDenseI32ArrayAttr(segmentSizes));
686696
}
@@ -709,6 +719,22 @@ KernelDim3 LaunchOp::getBlockSize() {
709719
return KernelDim3{args[9], args[10], args[11]};
710720
}
711721

722+
std::optional<KernelDim3> LaunchOp::getClusterIds() {
723+
assert(!getBody().empty() && "LaunchOp body must not be empty.");
724+
if (!hasClusterSize())
725+
return std::nullopt;
726+
auto args = getBody().getArguments();
727+
return KernelDim3{args[12], args[13], args[14]};
728+
}
729+
730+
std::optional<KernelDim3> LaunchOp::getClusterSize() {
731+
assert(!getBody().empty() && "LaunchOp body must not be empty.");
732+
if (!hasClusterSize())
733+
return std::nullopt;
734+
auto args = getBody().getArguments();
735+
return KernelDim3{args[15], args[16], args[17]};
736+
}
737+
712738
KernelDim3 LaunchOp::getGridSizeOperandValues() {
713739
auto operands = getOperands().drop_front(getAsyncDependencies().size());
714740
return KernelDim3{operands[0], operands[1], operands[2]};
@@ -719,6 +745,20 @@ KernelDim3 LaunchOp::getBlockSizeOperandValues() {
719745
return KernelDim3{operands[3], operands[4], operands[5]};
720746
}
721747

748+
std::optional<KernelDim3> LaunchOp::getClusterSizeOperandValues() {
749+
auto operands = getOperands().drop_front(getAsyncDependencies().size());
750+
if (!hasClusterSize())
751+
return std::nullopt;
752+
return KernelDim3{operands[6], operands[7], operands[8]};
753+
}
754+
755+
LogicalResult LaunchOp::verify() {
756+
if (!(hasClusterSize()) &&
757+
(getClusterSizeX() || getClusterSizeY() || getClusterSizeZ()))
758+
return emitOpError() << "cluster size must be all present";
759+
return success();
760+
}
761+
722762
LogicalResult LaunchOp::verifyRegions() {
723763
// Kernel launch takes kNumConfigOperands leading operands for grid/block
724764
// sizes and transforms them into kNumConfigRegionAttributes region arguments
@@ -778,6 +818,12 @@ void LaunchOp::print(OpAsmPrinter &p) {
778818
p << " [" << getAsyncDependencies() << ']';
779819
}
780820
// Print the launch configuration.
821+
if (hasClusterSize()) {
822+
p << ' ' << getClustersKeyword();
823+
printSizeAssignment(p, getClusterSize().value(),
824+
getClusterSizeOperandValues().value(),
825+
getClusterIds().value());
826+
}
781827
p << ' ' << getBlocksKeyword();
782828
printSizeAssignment(p, getGridSize(), getGridSizeOperandValues(),
783829
getBlockIds());
@@ -831,6 +877,7 @@ parseSizeAssignment(OpAsmParser &parser,
831877

832878
/// Parses a Launch operation.
833879
/// operation ::= `gpu.launch` (`async` `[` ssa-id-list `]`)?
880+
/// `clusters` `(` ssa-id-list `)` `in` ssa-reassignment (Optional)
834881
/// `blocks` `(` ssa-id-list `)` `in` ssa-reassignment
835882
/// `threads` `(` ssa-id-list `)` `in` ssa-reassignment
836883
/// memory-attribution
@@ -840,15 +887,13 @@ ParseResult LaunchOp::parse(OpAsmParser &parser, OperationState &result) {
840887
// Sizes of the grid and block.
841888
SmallVector<OpAsmParser::UnresolvedOperand, LaunchOp::kNumConfigOperands>
842889
sizes(LaunchOp::kNumConfigOperands);
843-
MutableArrayRef<OpAsmParser::UnresolvedOperand> sizesRef(sizes);
844890

845891
// Actual (data) operands passed to the kernel.
846892
SmallVector<OpAsmParser::UnresolvedOperand, 4> dataOperands;
847893

848894
// Region arguments to be created.
849895
SmallVector<OpAsmParser::UnresolvedOperand, 16> regionArgs(
850896
LaunchOp::kNumConfigRegionAttributes);
851-
MutableArrayRef<OpAsmParser::UnresolvedOperand> regionArgsRef(regionArgs);
852897

853898
// Parse optional async dependencies.
854899
SmallVector<OpAsmParser::UnresolvedOperand, 4> asyncDependencies;
@@ -861,6 +906,24 @@ ParseResult LaunchOp::parse(OpAsmParser &parser, OperationState &result) {
861906
if (parser.getNumResults() > 0)
862907
result.types.push_back(asyncTokenType);
863908

909+
bool hasCluster = false;
910+
if (succeeded(
911+
parser.parseOptionalKeyword(LaunchOp::getClustersKeyword().data()))) {
912+
hasCluster = true;
913+
sizes.resize(9);
914+
regionArgs.resize(18);
915+
}
916+
MutableArrayRef<OpAsmParser::UnresolvedOperand> sizesRef(sizes);
917+
MutableArrayRef<OpAsmParser::UnresolvedOperand> regionArgsRef(regionArgs);
918+
919+
// The last three segments assign the cluster size. In the region argument
920+
// list, these are the last 6 arguments.
921+
if (hasCluster) {
922+
if (parseSizeAssignment(parser, sizesRef.drop_front(6),
923+
regionArgsRef.slice(15, 3),
924+
regionArgsRef.slice(12, 3)))
925+
return failure();
926+
}
864927
// Parse the size assignment segments: the first segment assigns grid sizes
865928
// and defines values for block identifiers; the second segment assigns block
866929
// sizes and defines values for thread identifiers. In the region argument
@@ -898,7 +961,7 @@ ParseResult LaunchOp::parse(OpAsmParser &parser, OperationState &result) {
898961
// LaunchOp::getNumWorkgroupAttributionsAttrName().
899962
Type index = parser.getBuilder().getIndexType();
900963
SmallVector<Type, LaunchOp::kNumConfigRegionAttributes> dataTypes(
901-
LaunchOp::kNumConfigRegionAttributes, index);
964+
LaunchOp::kNumConfigRegionAttributes + 6, index);
902965

903966
SmallVector<OpAsmParser::Argument> regionArguments;
904967
for (auto ssaValueAndType : llvm::zip(regionArgs, dataTypes)) {
@@ -916,8 +979,9 @@ ParseResult LaunchOp::parse(OpAsmParser &parser, OperationState &result) {
916979

917980
// Store the number of operands we just parsed as the number of workgroup
918981
// memory attributions.
919-
unsigned numWorkgroupAttrs =
920-
regionArguments.size() - LaunchOp::kNumConfigRegionAttributes;
982+
unsigned numWorkgroupAttrs = regionArguments.size() -
983+
LaunchOp::kNumConfigRegionAttributes -
984+
(hasCluster ? 6 : 0);
921985
result.addAttribute(LaunchOp::getNumWorkgroupAttributionsAttrName(),
922986
builder.getI64IntegerAttr(numWorkgroupAttrs));
923987

@@ -934,8 +998,14 @@ ParseResult LaunchOp::parse(OpAsmParser &parser, OperationState &result) {
934998
parser.parseOptionalAttrDict(result.attributes))
935999
return failure();
9361000

937-
SmallVector<int32_t, 8> segmentSizes(8, 1);
1001+
SmallVector<int32_t, 11> segmentSizes(11, 1);
9381002
segmentSizes.front() = asyncDependencies.size();
1003+
1004+
if (!hasCluster) {
1005+
segmentSizes[7] = 0;
1006+
segmentSizes[8] = 0;
1007+
segmentSizes[9] = 0;
1008+
}
9391009
segmentSizes.back() = hasDynamicSharedMemorySize ? 1 : 0;
9401010
result.addAttribute(LaunchOp::getOperandSegmentSizeAttr(),
9411011
parser.getBuilder().getDenseI32ArrayAttr(segmentSizes));
@@ -992,7 +1062,7 @@ BlockArgument LaunchOp::addWorkgroupAttribution(Type type, Location loc) {
9921062
(*this)->setAttr(attrName,
9931063
IntegerAttr::get(attr.getType(), attr.getValue() + 1));
9941064
return getBody().insertArgument(
995-
LaunchOp::kNumConfigRegionAttributes + attr.getInt(), type, loc);
1065+
LaunchOp::getNumConfigRegionAttributes() + attr.getInt(), type, loc);
9961066
}
9971067

9981068
/// Adds a new block argument that corresponds to buffers located in

mlir/lib/Dialect/GPU/Transforms/KernelOutlining.cpp

Lines changed: 15 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -49,15 +49,21 @@ static void createForAllDimensions(OpBuilder &builder, Location loc,
4949
/// entry block of `launchOpBody`, to the corresponding result value of the
5050
/// added operations.
5151
static void injectGpuIndexOperations(Location loc, Region &launchFuncOpBody,
52-
Region &launchOpBody, IRMapping &map) {
52+
Region &launchOpBody, IRMapping &map,
53+
bool hasCluster = false) {
5354
OpBuilder builder(loc->getContext());
5455
Block &firstBlock = launchOpBody.front();
5556
builder.setInsertionPointToStart(&launchFuncOpBody.front());
56-
SmallVector<Value, 12> indexOps;
57+
SmallVector<Value> indexOps;
58+
// The order is important here, as it must match the order of the arguments
5759
createForAllDimensions<gpu::BlockIdOp>(builder, loc, indexOps);
5860
createForAllDimensions<gpu::ThreadIdOp>(builder, loc, indexOps);
5961
createForAllDimensions<gpu::GridDimOp>(builder, loc, indexOps);
6062
createForAllDimensions<gpu::BlockDimOp>(builder, loc, indexOps);
63+
if (hasCluster) {
64+
createForAllDimensions<gpu::ClusterIdOp>(builder, loc, indexOps);
65+
createForAllDimensions<gpu::ClusterDimOp>(builder, loc, indexOps);
66+
}
6167
// Replace the leading function args (12, or 18 with clusters) with the respective thread/block index
6268
// operations. Iterate backwards since args are erased and indices change.
6369
for (const auto &indexOp : enumerate(indexOps))
@@ -212,9 +218,11 @@ static gpu::GPUFuncOp outlineKernelFuncImpl(gpu::LaunchOp launchOp,
212218
IRMapping map;
213219

214220
// Map the arguments corresponding to the launch parameters like blockIdx,
215-
// threadIdx, etc.
221+
// threadIdx, etc. If cluster is present, then we also generate clusterIdx and
222+
// clusterDim.
216223
Region &outlinedFuncBody = outlinedFunc.getBody();
217-
injectGpuIndexOperations(loc, outlinedFuncBody, launchOpBody, map);
224+
injectGpuIndexOperations(loc, outlinedFuncBody, launchOpBody, map,
225+
launchOp.hasClusterSize());
218226

219227
// Map memory attributions from the LaunchOp op to the GPUFuncOp attributions.
220228
for (const auto &[launchArg, funcArg] :
@@ -278,12 +286,14 @@ static void convertToLaunchFuncOp(gpu::LaunchOp launchOp,
278286
// The launch op has an optional dynamic shared memory size. If it doesn't
279287
// exist, we use zero.
280288
Value asyncToken = launchOp.getAsyncToken();
289+
std::optional<gpu::KernelDim3> clusterSize =
290+
launchOp.getClusterSizeOperandValues();
281291
auto launchFunc = builder.create<gpu::LaunchFuncOp>(
282292
launchOp.getLoc(), kernelFunc, launchOp.getGridSizeOperandValues(),
283293
launchOp.getBlockSizeOperandValues(),
284294
launchOp.getDynamicSharedMemorySize(), operands,
285295
asyncToken ? asyncToken.getType() : nullptr,
286-
launchOp.getAsyncDependencies());
296+
launchOp.getAsyncDependencies(), clusterSize);
287297
launchOp.replaceAllUsesWith(launchFunc);
288298
launchOp.erase();
289299
}

mlir/test/Conversion/SCFToGPU/no_blocks_no_threads.mlir

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,8 +17,8 @@ func.func @one_d_loop(%A : memref<?xf32>, %B : memref<?xf32>) {
1717
// CHECK-BLOCKS-NEXT: %{{.*}} = arith.constant 1 : index
1818
// CHECK-BLOCKS-NEXT: %[[ONE:.*]] = arith.constant 1 : index
1919

20-
// CHECK-THREADS-NEXT: gpu.launch blocks(%[[B0:.*]], %[[B1:.*]], %[[B2:.*]]) in (%{{.*}} = %[[ONE]], %{{.*}} = %[[ONE]], %{{.*}}0 = %[[ONE]]) threads(%[[T0:.*]], %[[T1:.*]], %[[T2:.*]]) in (%{{.*}} = %[[BOUND]], %{{.*}} = %[[ONE]], %{{.*}} = %[[ONE]])
21-
// CHECK-BLOCKS-NEXT: gpu.launch blocks(%[[B0:.*]], %[[B1:.*]], %[[B2:.*]]) in (%{{.*}} = %[[BOUND]], %{{.*}} = %[[ONE]], %{{.*}}0 = %[[ONE]]) threads(%[[T0:.*]], %[[T1:.*]], %[[T2:.*]]) in (%{{.*}} = %[[ONE]], %{{.*}} = %[[ONE]], %{{.*}} = %[[ONE]])
20+
// CHECK-THREADS-NEXT: gpu.launch blocks(%[[B0:.*]], %[[B1:.*]], %[[B2:.*]]) in (%{{.*}} = %[[ONE]], %{{.*}} = %[[ONE]], %{{.*}} = %[[ONE]]) threads(%[[T0:.*]], %[[T1:.*]], %[[T2:.*]]) in (%{{.*}} = %[[BOUND]], %{{.*}} = %[[ONE]], %{{.*}} = %[[ONE]])
21+
// CHECK-BLOCKS-NEXT: gpu.launch blocks(%[[B0:.*]], %[[B1:.*]], %[[B2:.*]]) in (%{{.*}} = %[[BOUND]], %{{.*}} = %[[ONE]], %{{.*}} = %[[ONE]]) threads(%[[T0:.*]], %[[T1:.*]], %[[T2:.*]]) in (%{{.*}} = %[[ONE]], %{{.*}} = %[[ONE]], %{{.*}} = %[[ONE]])
2222
affine.for %i = 0 to 42 {
2323
// CHECK-THREADS-NEXT: %[[INDEX:.*]] = arith.addi %{{.*}}, %[[T0]]
2424
// CHECK-THREADS-NEXT: memref.load %{{.*}}[%[[INDEX]]]

0 commit comments

Comments
 (0)