Skip to content

Commit 652ff20

Browse files
authored
[flang][cuda] Adding atomicadd as a cudadevice intrinsic and converting it to LLVM dialect (#123840)
With these changes, CUF atomic operations are handled as cudadevice intrinsics and are converted straight to the LLVM dialect with the `llvm.atomicrmw` operation. For now I am submitting changes only for `atomicadd`, to gather feedback. If we are to proceed with these changes, I will add support for all other applicable atomic operations following this pattern.
1 parent b46fcb9 commit 652ff20

File tree

5 files changed

+75
-2
lines changed

5 files changed

+75
-2
lines changed

flang/include/flang/Optimizer/Builder/IntrinsicCall.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -185,6 +185,7 @@ struct IntrinsicLibrary {
185185
mlir::Value genAnint(mlir::Type, llvm::ArrayRef<mlir::Value>);
186186
fir::ExtendedValue genAny(mlir::Type, llvm::ArrayRef<fir::ExtendedValue>);
187187
mlir::Value genAtanpi(mlir::Type, llvm::ArrayRef<mlir::Value>);
188+
mlir::Value genAtomicAdd(mlir::Type, llvm::ArrayRef<mlir::Value>);
188189
fir::ExtendedValue
189190
genCommandArgumentCount(mlir::Type, llvm::ArrayRef<fir::ExtendedValue>);
190191
mlir::Value genAsind(mlir::Type, llvm::ArrayRef<mlir::Value>);

flang/lib/Optimizer/Builder/IntrinsicCall.cpp

Lines changed: 25 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -44,14 +44,14 @@
4444
#include "flang/Runtime/iostat-consts.h"
4545
#include "mlir/Dialect/Complex/IR/Complex.h"
4646
#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
47+
#include "mlir/Dialect/LLVMIR/LLVMTypes.h"
4748
#include "mlir/Dialect/Math/IR/Math.h"
4849
#include "mlir/Dialect/Vector/IR/VectorOps.h"
4950
#include "llvm/Support/CommandLine.h"
5051
#include "llvm/Support/Debug.h"
5152
#include "llvm/Support/MathExtras.h"
5253
#include "llvm/Support/raw_ostream.h"
5354
#include <cfenv> // temporary -- only used in genIeeeGetOrSetModesOrStatus
54-
#include <mlir/IR/Value.h>
5555
#include <optional>
5656

5757
#define DEBUG_TYPE "flang-lower-intrinsic"
@@ -147,6 +147,10 @@ static constexpr IntrinsicHandler handlers[]{
147147
{"atan2pi", &I::genAtanpi},
148148
{"atand", &I::genAtand},
149149
{"atanpi", &I::genAtanpi},
150+
{"atomicaddd", &I::genAtomicAdd, {{{"a", asAddr}, {"v", asValue}}}, false},
151+
{"atomicaddf", &I::genAtomicAdd, {{{"a", asAddr}, {"v", asValue}}}, false},
152+
{"atomicaddi", &I::genAtomicAdd, {{{"a", asAddr}, {"v", asValue}}}, false},
153+
{"atomicaddl", &I::genAtomicAdd, {{{"a", asAddr}, {"v", asValue}}}, false},
150154
{"bessel_jn",
151155
&I::genBesselJn,
152156
{{{"n1", asValue}, {"n2", asValue}, {"x", asValue}}},
@@ -2574,6 +2578,26 @@ mlir::Value IntrinsicLibrary::genAtanpi(mlir::Type resultType,
25742578
return builder.create<mlir::arith::MulFOp>(loc, atan, factor);
25752579
}
25762580

2581+
static mlir::Value genAtomBinOp(fir::FirOpBuilder &builder, mlir::Location &loc,
2582+
mlir::LLVM::AtomicBinOp binOp, mlir::Value arg0,
2583+
mlir::Value arg1) {
2584+
auto llvmPointerType = mlir::LLVM::LLVMPointerType::get(builder.getContext());
2585+
arg0 = builder.createConvert(loc, llvmPointerType, arg0);
2586+
return builder.create<mlir::LLVM::AtomicRMWOp>(
2587+
loc, binOp, arg0, arg1, mlir::LLVM::AtomicOrdering::seq_cst);
2588+
}
2589+
2590+
mlir::Value IntrinsicLibrary::genAtomicAdd(mlir::Type resultType,
2591+
llvm::ArrayRef<mlir::Value> args) {
2592+
assert(args.size() == 2);
2593+
2594+
mlir::LLVM::AtomicBinOp binOp =
2595+
mlir::isa<mlir::IntegerType>(args[1].getType())
2596+
? mlir::LLVM::AtomicBinOp::add
2597+
: mlir::LLVM::AtomicBinOp::fadd;
2598+
return genAtomBinOp(builder, loc, binOp, args[0], args[1]);
2599+
}
2600+
25772601
// ASSOCIATED
25782602
fir::ExtendedValue
25792603
IntrinsicLibrary::genAssociated(mlir::Type resultType,

flang/module/cudadevice.f90

Lines changed: 27 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -92,5 +92,31 @@ attributes(device) subroutine threadfence_system()
9292
end function
9393
end interface
9494
public :: __fadd_ru
95-
95+
96+
! Atomic Operations
97+
98+
interface atomicadd
99+
attributes(device) pure integer function atomicaddi(address, val)
100+
!dir$ ignore_tkr (d) address, (d) val
101+
integer, intent(inout) :: address
102+
integer, value :: val
103+
end function
104+
attributes(device) pure real function atomicaddf(address, val)
105+
!dir$ ignore_tkr (d) address, (d) val
106+
real, intent(inout) :: address
107+
real, value :: val
108+
end function
109+
attributes(device) pure real*8 function atomicaddd(address, val)
110+
!dir$ ignore_tkr (d) address, (d) val
111+
real*8, intent(inout) :: address
112+
real*8, value :: val
113+
end function
114+
attributes(device) pure integer(8) function atomicaddl(address, val)
115+
!dir$ ignore_tkr (d) address, (d) val
116+
integer(8), intent(inout) :: address
117+
integer(8), value :: val
118+
end function
119+
end interface
120+
public :: atomicadd
121+
96122
end module

flang/test/Lower/CUDA/cuda-device-proc.cuf

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,10 @@
55
attributes(global) subroutine devsub()
66
implicit none
77
integer :: ret
8+
real(4) :: af
9+
real(8) :: ad
10+
integer(4) :: ai
11+
integer(8) :: al
812

913
call syncthreads()
1014
call syncwarp(1)
@@ -14,6 +18,11 @@ attributes(global) subroutine devsub()
1418
ret = syncthreads_and(1)
1519
ret = syncthreads_count(1)
1620
ret = syncthreads_or(1)
21+
22+
ai = atomicadd(ai, 1_4)
23+
al = atomicadd(al, 1_8)
24+
af = atomicadd(af, 1.0_4)
25+
ad = atomicadd(ad, 1.0_8)
1726
end
1827

1928
! CHECK-LABEL: func.func @_QPdevsub() attributes {cuf.proc_attr = #cuf.cuda_proc<global>}
@@ -25,6 +34,10 @@ end
2534
! CHECK: %{{.*}} = fir.call @llvm.nvvm.barrier0.and(%c1_i32_0) fastmath<contract> : (i32) -> i32
2635
! CHECK: %{{.*}} = fir.call @llvm.nvvm.barrier0.popc(%c1_i32_1) fastmath<contract> : (i32) -> i32
2736
! CHECK: %{{.*}} = fir.call @llvm.nvvm.barrier0.or(%c1_i32_2) fastmath<contract> : (i32) -> i32
37+
! CHECK: %{{.*}} = llvm.atomicrmw add %{{.*}}, %{{.*}} seq_cst : !llvm.ptr, i32
38+
! CHECK: %{{.*}} = llvm.atomicrmw add %{{.*}}, %{{.*}} seq_cst : !llvm.ptr, i64
39+
! CHECK: %{{.*}} = llvm.atomicrmw fadd %{{.*}}, %{{.*}} seq_cst : !llvm.ptr, f32
40+
! CHECK: %{{.*}} = llvm.atomicrmw fadd %{{.*}}, %{{.*}} seq_cst : !llvm.ptr, f64
2841

2942
! CHECK: func.func private @llvm.nvvm.barrier0()
3043
! CHECK: func.func private @__syncwarp(!fir.ref<i32> {cuf.data_attr = #cuf.cuda<device>}) attributes {cuf.proc_attr = #cuf.cuda_proc<device>, fir.bindc_name = "__syncwarp", fir.proc_attrs = #fir.proc_attrs<bind_c>}

flang/test/Semantics/cuf-device-procedures01.cuf

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,8 +28,17 @@ end
2828
! CHECK: threadfence_system (Subroutine): Use from threadfence_system in cudadevice
2929

3030
subroutine host()
31+
real(4) :: af
32+
real(8) :: ad
33+
integer(4) :: ai
34+
integer(8) :: al
3135
call syncthreads()
36+
ai = atomicadd(ai, 1_4)
37+
al = atomicadd(al, 1_8)
38+
af = atomicadd(af, 1.0_4)
39+
ad = atomicadd(ad, 1.0_8)
3240
end subroutine
3341

3442
! CHECK-LABEL: Subprogram scope: host
43+
! CHECK: atomicadd, EXTERNAL: HostAssoc{{$}}
3544
! CHECK: syncthreads, EXTERNAL: HostAssoc{{$}}

0 commit comments

Comments
 (0)