
Commit 227bfa1

[mlir] Fix a crash when lowering parallel loops to GPU (#75811) (#75946)
1 parent f1156fb

2 files changed: 46 additions & 1 deletion


mlir/lib/Conversion/SCFToGPU/SCFToGPU.cpp

Lines changed: 2 additions & 1 deletion
@@ -456,7 +456,8 @@ static LogicalResult processParallelLoop(
                   rewriter.getAffineSymbolExpr(1));
           newIndex = rewriter.create<AffineApplyOp>(
               loc, annotation.getMap().compose(lowerAndStep),
-              ValueRange{operand, step, lowerBound});
+              ValueRange{operand, ensureLaunchIndependent(step),
+                         ensureLaunchIndependent(lowerBound)});
           // If there was also a bound, insert that, too.
           // TODO: Check that we do not assign bounds twice.
           if (annotation.getBound()) {
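
Before this change, `step` and `lowerBound` were passed to the new `AffineApplyOp` directly, so the rewrite could reference a value that is not visible at the new op's insertion point when the step or lower bound is defined inside the loop nest being mapped, which led to the crash reported in #75811. Routing both operands through `ensureLaunchIndependent` mirrors how the bound operand is already handled in the `annotation.getBound()` branch below.

For context, `ensureLaunchIndependent` is a helper defined earlier in `processParallelLoop`. A minimal sketch of its shape, paraphrased from the pass rather than quoted verbatim (exact names and details may differ):

    // A value is launch-independent if it is defined outside the
    // gpu.launch region, i.e. in a region that is an ancestor of the
    // launch op's parent region.
    auto launchIndependent = [&launchOp](Value val) {
      return val.getParentRegion()->isAncestor(launchOp->getParentRegion());
    };
    // Return the value unchanged if it is already launch-independent,
    // rematerialize constants at the current insertion point, and
    // return a null Value otherwise (callers treat that as a match
    // failure).
    auto ensureLaunchIndependent = [&rewriter,
                                    launchIndependent](Value val) -> Value {
      if (launchIndependent(val))
        return val;
      if (auto constOp = val.getDefiningOp<arith::ConstantOp>())
        return rewriter.create<arith::ConstantOp>(constOp.getLoc(),
                                                  constOp.getValue());
      return {};
    };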

mlir/test/Conversion/SCFToGPU/parallel_loop.mlir

Lines changed: 44 additions & 0 deletions
@@ -384,3 +384,47 @@ func.func @parallel_no_annotations(%arg0 : index, %arg1 : index, %arg2 : index,
 
 // CHECK-LABEL: @parallel_no_annotations
 // CHECK: scf.parallel
+
+// -----
+
+// CHECK-LABEL: @step_invariant
+func.func @step_invariant() {
+  %alloc = memref.alloc() : memref<1x1xf64>
+  %alloc_0 = memref.alloc() : memref<1x1xf64>
+  %alloc_1 = memref.alloc() : memref<1x1xf64>
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+  %c1_2 = arith.constant 1 : index
+  scf.parallel (%arg0) = (%c0) to (%c1) step (%c1_2) {
+    %c0_3 = arith.constant 0 : index
+    %c1_4 = arith.constant 1 : index
+    %c1_5 = arith.constant 1 : index
+    scf.parallel (%arg1) = (%c0_3) to (%c1_4) step (%c1_5) {
+      %0 = memref.load %alloc_1[%arg0, %arg1] : memref<1x1xf64>
+      %1 = memref.load %alloc_0[%arg0, %arg1] : memref<1x1xf64>
+      %2 = arith.addf %0, %1 : f64
+      memref.store %2, %alloc[%arg0, %arg1] : memref<1x1xf64>
+      scf.yield
+    } {mapping = [#gpu.loop_dim_map<processor = thread_x, map = (d0) -> (d0), bound = (d0) -> (d0)>]}
+    scf.yield
+  } {mapping = [#gpu.loop_dim_map<processor = block_x, map = (d0) -> (d0), bound = (d0) -> (d0)>]}
+  memref.dealloc %alloc_1 : memref<1x1xf64>
+  memref.dealloc %alloc_0 : memref<1x1xf64>
+  memref.dealloc %alloc : memref<1x1xf64>
+  return
+}
+
+// CHECK: %[[alloc_0:.*]] = memref.alloc() : memref<1x1xf64>
+// CHECK: %[[alloc_1:.*]] = memref.alloc() : memref<1x1xf64>
+// CHECK: %[[alloc_2:.*]] = memref.alloc() : memref<1x1xf64>
+// CHECK: %[[map_0:.*]] = affine.apply #map({{.*}})[{{.*}}, {{.*}}]
+// CHECK: %[[map_1:.*]] = affine.apply #map({{.*}})[{{.*}}, {{.*}}]
+// CHECK: gpu.launch
+// CHECK-SAME: blocks(%[[arg_0:.*]], %{{[^)]*}}, %{{[^)]*}}) in (%{{[^)]*}} = %[[map_0]], %{{[^)]*}} = %{{[^)]*}}, %{{[^)]*}} = %{{[^)]*}})
+// CHECK-SAME: threads(%[[arg_3:.*]], %{{[^)]*}}, %{{[^)]*}}) in (%{{[^)]*}} = %[[map_1]], %{{[^)]*}} = %{{[^)]*}}, %{{[^)]*}} = %{{[^)]*}})
+// CHECK: %[[dim0:.*]] = affine.apply #map1(%[[arg_0]])[{{.*}}, {{.*}}]
+// CHECK: %[[dim1:.*]] = affine.apply #map1(%[[arg_3]])[{{.*}}, {{.*}}]
+// CHECK: %[[lhs:.*]] = memref.load %[[alloc_2]][%[[dim0]], %[[dim1]]] : memref<1x1xf64>
+// CHECK: %[[rhs:.*]] = memref.load %[[alloc_1]][%[[dim0]], %[[dim1]]] : memref<1x1xf64>
+// CHECK: %[[sum:.*]] = arith.addf %[[lhs]], %[[rhs]] : f64
+// CHECK: memref.store %[[sum]], %[[alloc_0]][%[[dim0]], %[[dim1]]] : memref<1x1xf64>
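
The `// -----` marker separates split-input cases, so the new test relies on `mlir-opt`'s split-input-file mode. Modulo the exact flags of the file's existing RUN line, it is exercised along the lines of:

    // RUN: mlir-opt %s --convert-parallel-loops-to-gpu --split-input-file | FileCheck %s

The point of `@step_invariant` is that the inner loop's lower bound and step (`%c0_3`, `%c1_5`) are constants defined inside the outer loop being mapped, exactly the situation `ensureLaunchIndependent` now handles; the CHECK lines then pin down the expected `gpu.launch` structure and the rewritten indices.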
