[flang][cuda] Propagate the data attribute on the converted calls #124877
Conversation
@llvm/pr-subscribers-flang-fir-hlfir

Author: Valentin Clement (バレンタイン クレメン) (clementval)

Changes

The CUDA data attribute carries information about the kind of memory used by an allocation operation. Some of the cuf allocation operations are converted to runtime calls, and this information was lost in the conversion. Carry the attribute over onto the calls so that the information can be queried by future analyses (a hedged sketch of such a query follows the diff below).

Full diff: https://github.com/llvm/llvm-project/pull/124877.diff

4 Files Affected:
diff --git a/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp b/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp
index cc525d703ae57f..710aed5031f5bc 100644
--- a/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp
+++ b/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp
@@ -294,19 +294,22 @@ struct CUFAllocOpConversion : public mlir::OpRewritePattern<cuf::AllocOp> {
matchAndRewrite(cuf::AllocOp op,
mlir::PatternRewriter &rewriter) const override {
+ mlir::Location loc = op.getLoc();
+
if (inDeviceContext(op.getOperation())) {
// In device context, just replace the cuf.alloc operation with a
// fir.alloca; the cuf.free will be removed.
- rewriter.replaceOpWithNewOp<fir::AllocaOp>(
- op, op.getInType(), op.getUniqName() ? *op.getUniqName() : "",
+ auto allocaOp = rewriter.create<fir::AllocaOp>(
+ loc, op.getInType(), op.getUniqName() ? *op.getUniqName() : "",
op.getBindcName() ? *op.getBindcName() : "", op.getTypeparams(),
op.getShape());
+ allocaOp->setAttr(cuf::getDataAttrName(), op.getDataAttrAttr());
+ rewriter.replaceOp(op, allocaOp);
return mlir::success();
}
auto mod = op->getParentOfType<mlir::ModuleOp>();
fir::FirOpBuilder builder(rewriter, mod);
- mlir::Location loc = op.getLoc();
mlir::Value sourceFile = fir::factory::locationToFilename(builder, loc);
if (!mlir::dyn_cast_or_null<fir::BaseBoxType>(op.getInType())) {
@@ -359,6 +362,7 @@ struct CUFAllocOpConversion : public mlir::OpRewritePattern<cuf::AllocOp> {
llvm::SmallVector<mlir::Value> args{fir::runtime::createArguments(
builder, loc, fTy, bytes, memTy, sourceFile, sourceLine)};
auto callOp = builder.create<fir::CallOp>(loc, func, args);
+ callOp->setAttr(cuf::getDataAttrName(), op.getDataAttrAttr());
auto convOp = builder.createConvert(loc, op.getResult().getType(),
callOp.getResult(0));
rewriter.replaceOp(op, convOp);
@@ -381,6 +385,7 @@ struct CUFAllocOpConversion : public mlir::OpRewritePattern<cuf::AllocOp> {
llvm::SmallVector<mlir::Value> args{fir::runtime::createArguments(
builder, loc, fTy, sizeInBytes, sourceFile, sourceLine)};
auto callOp = builder.create<fir::CallOp>(loc, func, args);
+ callOp->setAttr(cuf::getDataAttrName(), op.getDataAttrAttr());
auto convOp = builder.createConvert(loc, op.getResult().getType(),
callOp.getResult(0));
rewriter.replaceOp(op, convOp);
@@ -508,7 +513,8 @@ struct CUFFreeOpConversion : public mlir::OpRewritePattern<cuf::FreeOp> {
fir::factory::locationToLineNo(builder, loc, fTy.getInput(2));
llvm::SmallVector<mlir::Value> args{fir::runtime::createArguments(
builder, loc, fTy, op.getDevptr(), sourceFile, sourceLine)};
- builder.create<fir::CallOp>(loc, func, args);
+ auto callOp = builder.create<fir::CallOp>(loc, func, args);
+ callOp->setAttr(cuf::getDataAttrName(), op.getDataAttrAttr());
rewriter.eraseOp(op);
return mlir::success();
}
diff --git a/flang/test/Fir/CUDA/cuda-alloc-free.fir b/flang/test/Fir/CUDA/cuda-alloc-free.fir
index 6194f0071cd796..31f2ed022b6c43 100644
--- a/flang/test/Fir/CUDA/cuda-alloc-free.fir
+++ b/flang/test/Fir/CUDA/cuda-alloc-free.fir
@@ -11,7 +11,7 @@ func.func @_QPsub1() {
// CHECK-LABEL: func.func @_QPsub1()
// CHECK: %[[BYTES:.*]] = fir.convert %c4{{.*}} : (index) -> i64
-// CHECK: %[[ALLOC:.*]] = fir.call @_FortranACUFMemAlloc(%[[BYTES]], %c0{{.*}}, %{{.*}}, %{{.*}}) : (i64, i32, !fir.ref<i8>, i32) -> !fir.llvm_ptr<i8>
+// CHECK: %[[ALLOC:.*]] = fir.call @_FortranACUFMemAlloc(%[[BYTES]], %c0{{.*}}, %{{.*}}, %{{.*}}) {cuf.data_attr = #cuf.cuda<device>} : (i64, i32, !fir.ref<i8>, i32) -> !fir.llvm_ptr<i8>
// CHECK: %[[CONV:.*]] = fir.convert %3 : (!fir.llvm_ptr<i8>) -> !fir.ref<i32>
// CHECK: %[[DECL:.*]]:2 = hlfir.declare %[[CONV]] {data_attr = #cuf.cuda<device>, uniq_name = "_QFsub1Eidev"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
// CHECK: %[[DEVPTR:.*]] = fir.convert %[[DECL]]#1 : (!fir.ref<i32>) -> !fir.llvm_ptr<i8>
@@ -26,7 +26,7 @@ func.func @_QPsub2() {
// CHECK-LABEL: func.func @_QPsub2()
// CHECK: %[[BYTES:.*]] = arith.muli %c10{{.*}}, %c4{{.*}} : index
// CHECK: %[[CONV_BYTES:.*]] = fir.convert %[[BYTES]] : (index) -> i64
-// CHECK: %{{.*}} = fir.call @_FortranACUFMemAlloc(%[[CONV_BYTES]], %c0{{.*}}, %{{.*}}, %{{.*}}) : (i64, i32, !fir.ref<i8>, i32) -> !fir.llvm_ptr<i8>
+// CHECK: %{{.*}} = fir.call @_FortranACUFMemAlloc(%[[CONV_BYTES]], %c0{{.*}}, %{{.*}}, %{{.*}}) {cuf.data_attr = #cuf.cuda<device>} : (i64, i32, !fir.ref<i8>, i32) -> !fir.llvm_ptr<i8>
// CHECK: fir.call @_FortranACUFMemFree
func.func @_QPsub3(%arg0: !fir.ref<i32> {fir.bindc_name = "n"}, %arg1: !fir.ref<i32> {fir.bindc_name = "m"}) {
@@ -58,7 +58,7 @@ func.func @_QPsub3(%arg0: !fir.ref<i32> {fir.bindc_name = "n"}, %arg1: !fir.ref<
// CHECK: %[[NBELEM:.*]] = arith.muli %[[N]], %[[M]] : index
// CHECK: %[[BYTES:.*]] = arith.muli %[[NBELEM]], %c4{{.*}} : index
// CHECK: %[[CONV_BYTES:.*]] = fir.convert %[[BYTES]] : (index) -> i64
-// CHECK: %{{.*}} = fir.call @_FortranACUFMemAlloc(%[[CONV_BYTES]], %c0{{.*}}, %{{.*}}, %{{.*}}) : (i64, i32, !fir.ref<i8>, i32) -> !fir.llvm_ptr<i8>
+// CHECK: %{{.*}} = fir.call @_FortranACUFMemAlloc(%[[CONV_BYTES]], %c0{{.*}}, %{{.*}}, %{{.*}}) {cuf.data_attr = #cuf.cuda<device>} : (i64, i32, !fir.ref<i8>, i32) -> !fir.llvm_ptr<i8>
// CHECK: fir.call @_FortranACUFMemFree
func.func @_QPtest_type() {
@@ -71,7 +71,7 @@ func.func @_QPtest_type() {
// CHECK-LABEL: func.func @_QPtest_type()
// CHECK: %[[BYTES:.*]] = arith.constant 12 : index
// CHECK: %[[CONV_BYTES:.*]] = fir.convert %[[BYTES]] : (index) -> i64
-// CHECK: fir.call @_FortranACUFMemAlloc(%[[CONV_BYTES]], %c0{{.*}}, %{{.*}}, %{{.*}}) : (i64, i32, !fir.ref<i8>, i32) -> !fir.llvm_ptr<i8>
+// CHECK: fir.call @_FortranACUFMemAlloc(%[[CONV_BYTES]], %c0{{.*}}, %{{.*}}, %{{.*}}) {cuf.data_attr = #cuf.cuda<device>} : (i64, i32, !fir.ref<i8>, i32) -> !fir.llvm_ptr<i8>
gpu.module @cuda_device_mod {
gpu.func @_QMalloc() kernel {
@@ -81,7 +81,7 @@ gpu.module @cuda_device_mod {
}
// CHECK-LABEL: gpu.func @_QMalloc() kernel
-// CHECK: fir.alloca !fir.box<!fir.heap<!fir.array<?xf32>>> {bindc_name = "a", uniq_name = "_QMallocEa"}
+// CHECK: fir.alloca !fir.box<!fir.heap<!fir.array<?xf32>>> {bindc_name = "a", cuf.data_attr = #cuf.cuda<device>, uniq_name = "_QMallocEa"}
func.func @_QQalloc_char() attributes {fir.bindc_name = "alloc_char"} {
%c1 = arith.constant 1 : index
@@ -92,6 +92,6 @@ func.func @_QQalloc_char() attributes {fir.bindc_name = "alloc_char"} {
// CHECK-LABEL: func.func @_QQalloc_char()
// CHECK: %[[BYTES:.*]] = arith.muli %c10{{.*}}, %c1{{.*}} : index
// CHECK: %[[BYTES_CONV:.*]] = fir.convert %[[BYTES]] : (index) -> i64
-// CHECK: fir.call @_FortranACUFMemAlloc(%[[BYTES_CONV]], %c0{{.*}}, %{{.*}}, %{{.*}}) : (i64, i32, !fir.ref<i8>, i32) -> !fir.llvm_ptr<i8>
+// CHECK: fir.call @_FortranACUFMemAlloc(%[[BYTES_CONV]], %c0{{.*}}, %{{.*}}, %{{.*}}) {cuf.data_attr = #cuf.cuda<device>} : (i64, i32, !fir.ref<i8>, i32) -> !fir.llvm_ptr<i8>
} // end module
diff --git a/flang/test/Fir/CUDA/cuda-allocate.fir b/flang/test/Fir/CUDA/cuda-allocate.fir
index b8457b846716ef..08573110821cc2 100644
--- a/flang/test/Fir/CUDA/cuda-allocate.fir
+++ b/flang/test/Fir/CUDA/cuda-allocate.fir
@@ -15,7 +15,7 @@ func.func @_QPsub1() {
}
// CHECK-LABEL: func.func @_QPsub1()
-// CHECK: %[[DESC_RT_CALL:.*]] = fir.call @_FortranACUFAllocDescriptor(%{{.*}}, %{{.*}}, %{{.*}}) : (i64, !fir.ref<i8>, i32) -> !fir.ref<!fir.box<none>>
+// CHECK: %[[DESC_RT_CALL:.*]] = fir.call @_FortranACUFAllocDescriptor(%{{.*}}, %{{.*}}, %{{.*}}) {cuf.data_attr = #cuf.cuda<device>} : (i64, !fir.ref<i8>, i32) -> !fir.ref<!fir.box<none>>
// CHECK: %[[DESC:.*]] = fir.convert %[[DESC_RT_CALL]] : (!fir.ref<!fir.box<none>>) -> !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>
// CHECK: %[[DECL_DESC:.*]]:2 = hlfir.declare %[[DESC]] {data_attr = #cuf.cuda<device>, fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFsub1Ea"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>)
// CHECK: %[[BOX_NONE:.*]] = fir.convert %[[DECL_DESC]]#1 : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> !fir.ref<!fir.box<none>>
@@ -24,7 +24,7 @@ func.func @_QPsub1() {
// CHECK: %[[BOX_NONE:.*]] = fir.convert %[[DECL_DESC]]#1 : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> !fir.ref<!fir.box<none>>
// CHECK: %{{.*}} = fir.call @_FortranAAllocatableDeallocate(%[[BOX_NONE]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref<!fir.box<none>>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
// CHECK: %[[BOX_NONE:.*]] = fir.convert %[[DECL_DESC]]#1 : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> !fir.ref<!fir.box<none>>
-// CHECK: fir.call @_FortranACUFFreeDescriptor(%[[BOX_NONE]], %{{.*}}, %{{.*}}) : (!fir.ref<!fir.box<none>>, !fir.ref<i8>, i32) -> ()
+// CHECK: fir.call @_FortranACUFFreeDescriptor(%[[BOX_NONE]], %{{.*}}, %{{.*}}) {cuf.data_attr = #cuf.cuda<device>} : (!fir.ref<!fir.box<none>>, !fir.ref<i8>, i32) -> ()
fir.global @_QMmod1Ea {data_attr = #cuf.cuda<device>} : !fir.box<!fir.heap<!fir.array<?xf32>>> {
%0 = fir.zero_bits !fir.heap<!fir.array<?xf32>>
diff --git a/flang/test/Fir/CUDA/cuda-data-transfer.fir b/flang/test/Fir/CUDA/cuda-data-transfer.fir
index 415d0015918bb3..b62c500f4a2d32 100644
--- a/flang/test/Fir/CUDA/cuda-data-transfer.fir
+++ b/flang/test/Fir/CUDA/cuda-data-transfer.fir
@@ -329,7 +329,7 @@ func.func @_QPtest_array_type() {
// CHECK-LABEL: func.func @_QPtest_array_type()
// CHECK: %[[BYTES:.*]] = arith.muli %c10{{.*}}, %c12 : index
// CHECK: %[[CONV_BYTES:.*]] = fir.convert %[[BYTES]] : (index) -> i64
-// CHECK: fir.call @_FortranACUFMemAlloc(%[[CONV_BYTES]], %c0{{.*}}, %{{.*}}, %{{.*}}) : (i64, i32, !fir.ref<i8>, i32) -> !fir.llvm_ptr<i8>
+// CHECK: fir.call @_FortranACUFMemAlloc(%[[CONV_BYTES]], %c0{{.*}}, %{{.*}}, %{{.*}}) {cuf.data_attr = #cuf.cuda<device>} : (i64, i32, !fir.ref<i8>, i32) -> !fir.llvm_ptr<i8>
// CHECK: %[[BYTES:.*]] = arith.muli %c10{{.*}}, %c12{{.*}} : i64
// CHECK: fir.call @_FortranACUFDataTransferPtrPtr(%{{.*}}, %{{.*}}, %[[BYTES]], %c0{{.*}}, %{{.*}}, %{{.*}}) : (!fir.llvm_ptr<i8>, !fir.llvm_ptr<i8>, i64, i32, !fir.ref<i8>, i32) -> ()
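As a rough illustration of the kind of query this propagation enables, here is a minimal sketch of a downstream walk over the converted calls. The walk and the function name findDeviceAllocCalls are hypothetical; cuf::getDataAttrName(), cuf::DataAttributeAttr, and the #cuf.cuda<device> value are the names used in the patch above.

#include "flang/Optimizer/Dialect/CUF/Attributes/CUFAttr.h"
#include "flang/Optimizer/Dialect/FIROps.h"

// Illustrative only, not part of this patch: collect the converted
// runtime calls that operate on device memory.
static void findDeviceAllocCalls(mlir::ModuleOp mod) {
  mod.walk([](fir::CallOp call) {
    auto attr =
        call->getAttrOfType<cuf::DataAttributeAttr>(cuf::getDataAttrName());
    if (attr && attr.getValue() == cuf::DataAttribute::Device) {
      // The call (e.g. _FortranACUFMemAlloc or _FortranACUFMemFree) now
      // carries {cuf.data_attr = #cuf.cuda<device>}; record it here.
    }
  });
}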
Thank you! Seems reasonable to propagate the data attribute.