[flang][cuda] Adding atomicadd as a cudadevice intrinsic and converting it LLVM dialect #123840

Renaud-K · 2025-01-21T22:58:46Z

With these changes, CUF atomic operations are handled as cudadevice intrinsics and are converted straight to the LLVM dialect with the llvm.atomicrw operation.

I am only submitting changes for atomicadd to gather feedback. If we are to proceed with these changes I will add support for all other applicable atomic operations following this pattern.

…lect

llvmbot · 2025-01-21T22:59:19Z

@llvm/pr-subscribers-flang-semantics

@llvm/pr-subscribers-flang-fir-hlfir

Author: Renaud Kauffmann (Renaud-K)

Changes

With these changes, CUF atomic operations are handled as cudadevice intrinsics and are converted straight to the LLVM dialect with the llvm.atomicrw operation.

I am only submitting changes for atomicadd to gather feedback. If we are to proceed with these changes I will add support for all other applicable atomic operations following this pattern.

Full diff: https://github.com/llvm/llvm-project/pull/123840.diff

5 Files Affected:

(modified) flang/include/flang/Optimizer/Builder/IntrinsicCall.h (+1)
(modified) flang/lib/Optimizer/Builder/IntrinsicCall.cpp (+25-1)
(modified) flang/module/cudadevice.f90 (+26)
(modified) flang/test/Lower/CUDA/cuda-device-proc.cuf (+13)
(modified) flang/test/Semantics/cuf-device-procedures01.cuf (+9)

diff --git a/flang/include/flang/Optimizer/Builder/IntrinsicCall.h b/flang/include/flang/Optimizer/Builder/IntrinsicCall.h
index 9c9c0609f4fc3c..d5f794a382334f 100644
--- a/flang/include/flang/Optimizer/Builder/IntrinsicCall.h
+++ b/flang/include/flang/Optimizer/Builder/IntrinsicCall.h
@@ -185,6 +185,7 @@ struct IntrinsicLibrary {
   mlir::Value genAnint(mlir::Type, llvm::ArrayRef<mlir::Value>);
   fir::ExtendedValue genAny(mlir::Type, llvm::ArrayRef<fir::ExtendedValue>);
   mlir::Value genAtanpi(mlir::Type, llvm::ArrayRef<mlir::Value>);
+  mlir::Value genAtomAdd(mlir::Type, llvm::ArrayRef<mlir::Value>);
   fir::ExtendedValue
       genCommandArgumentCount(mlir::Type, llvm::ArrayRef<fir::ExtendedValue>);
   mlir::Value genAsind(mlir::Type, llvm::ArrayRef<mlir::Value>);
diff --git a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp
index 6a343645ab8786..0dccd168714e1a 100644
--- a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp
+++ b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp
@@ -44,6 +44,7 @@
 #include "flang/Runtime/iostat-consts.h"
 #include "mlir/Dialect/Complex/IR/Complex.h"
 #include "mlir/Dialect/LLVMIR/LLVMDialect.h"
+#include "mlir/Dialect/LLVMIR/LLVMTypes.h"
 #include "mlir/Dialect/Math/IR/Math.h"
 #include "mlir/Dialect/Vector/IR/VectorOps.h"
 #include "llvm/Support/CommandLine.h"
@@ -51,7 +52,6 @@
 #include "llvm/Support/MathExtras.h"
 #include "llvm/Support/raw_ostream.h"
 #include <cfenv> // temporary -- only used in genIeeeGetOrSetModesOrStatus
-#include <mlir/IR/Value.h>
 #include <optional>
 
 #define DEBUG_TYPE "flang-lower-intrinsic"
@@ -147,6 +147,10 @@ static constexpr IntrinsicHandler handlers[]{
     {"atan2pi", &I::genAtanpi},
     {"atand", &I::genAtand},
     {"atanpi", &I::genAtanpi},
+    {"atomicaddd", &I::genAtomAdd, {{{"addr", asAddr}, {"v", asValue}}}, false},
+    {"atomicaddf", &I::genAtomAdd, {{{"addr", asAddr}, {"v", asValue}}}, false},
+    {"atomicaddi", &I::genAtomAdd, {{{"addr", asAddr}, {"v", asValue}}}, false},
+    {"atomicaddl", &I::genAtomAdd, {{{"addr", asAddr}, {"v", asValue}}}, false},
     {"bessel_jn",
      &I::genBesselJn,
      {{{"n1", asValue}, {"n2", asValue}, {"x", asValue}}},
@@ -2574,6 +2578,26 @@ mlir::Value IntrinsicLibrary::genAtanpi(mlir::Type resultType,
   return builder.create<mlir::arith::MulFOp>(loc, atan, factor);
 }
 
+static mlir::Value genAtomBinOp(fir::FirOpBuilder &builder, mlir::Location &loc,
+                                mlir::LLVM::AtomicBinOp binOp, mlir::Value arg0,
+                                mlir::Value arg1) {
+  auto llvmPointerType = mlir::LLVM::LLVMPointerType::get(builder.getContext());
+  arg0 = builder.createConvert(loc, llvmPointerType, arg0);
+  return builder.create<mlir::LLVM::AtomicRMWOp>(
+      loc, binOp, arg0, arg1, mlir::LLVM::AtomicOrdering::seq_cst);
+}
+
+mlir::Value IntrinsicLibrary::genAtomAdd(mlir::Type resultType,
+                                         llvm::ArrayRef<mlir::Value> args) {
+  assert(args.size() == 2);
+
+  mlir::LLVM::AtomicBinOp binOp =
+      mlir::isa<mlir::IntegerType>(args[1].getType())
+          ? mlir::LLVM::AtomicBinOp::add
+          : mlir::LLVM::AtomicBinOp::fadd;
+  return genAtomBinOp(builder, loc, binOp, args[0], args[1]);
+}
+
 // ASSOCIATED
 fir::ExtendedValue
 IntrinsicLibrary::genAssociated(mlir::Type resultType,
diff --git a/flang/module/cudadevice.f90 b/flang/module/cudadevice.f90
index 3d487fd000a094..2dc0ed1f49bf67 100644
--- a/flang/module/cudadevice.f90
+++ b/flang/module/cudadevice.f90
@@ -92,5 +92,31 @@ attributes(device) subroutine threadfence_system()
     end function
   end interface
   public :: __fadd_ru
+
+  ! Atomic Operations
   
+  interface atomicadd
+    attributes(device) pure integer function atomicaddi(address, val)
+  !dir$ ignore_tkr (rd) address, (d) val
+    integer, intent(inout) :: address
+    integer, value :: val
+    end function
+    attributes(device) pure real function atomicaddf(address, val)
+  !dir$ ignore_tkr (rd) address, (d) val
+    real, intent(inout) :: address
+    real, value :: val
+    end function
+    attributes(device) pure real*8 function atomicaddd(address, val)
+  !dir$ ignore_tkr (rd) address, (d) val
+    real*8, intent(inout) :: address
+    real*8, value :: val
+    end function
+    attributes(device) pure integer(8) function atomicaddl(address, val)
+  !dir$ ignore_tkr (rd) address, (dk) val
+    integer(8), intent(inout) :: address
+    integer(8), value :: val
+    end function
+  end interface 
+public :: atomicadd
+
 end module
diff --git a/flang/test/Lower/CUDA/cuda-device-proc.cuf b/flang/test/Lower/CUDA/cuda-device-proc.cuf
index 2042bbbe19650a..661e5728bf85b8 100644
--- a/flang/test/Lower/CUDA/cuda-device-proc.cuf
+++ b/flang/test/Lower/CUDA/cuda-device-proc.cuf
@@ -5,6 +5,10 @@
 attributes(global) subroutine devsub()
   implicit none
   integer :: ret
+  real(4) :: af
+  real(8) :: ad
+  integer(4) :: ai
+  integer(8) :: al
 
   call syncthreads()
   call syncwarp(1)
@@ -14,6 +18,11 @@ attributes(global) subroutine devsub()
   ret = syncthreads_and(1)
   ret = syncthreads_count(1)
   ret = syncthreads_or(1)
+
+  ai = atomicadd(ai, 1_4)
+  al = atomicadd(al, 1_8)
+  af = atomicadd(af, 1.0_4)
+  ad = atomicadd(ad, 1.0_8)
 end
 
 ! CHECK-LABEL: func.func @_QPdevsub() attributes {cuf.proc_attr = #cuf.cuda_proc<global>}
@@ -25,6 +34,10 @@ end
 ! CHECK: %{{.*}} = fir.call @llvm.nvvm.barrier0.and(%c1_i32_0) fastmath<contract> : (i32) -> i32
 ! CHECK: %{{.*}} = fir.call @llvm.nvvm.barrier0.popc(%c1_i32_1) fastmath<contract> : (i32) -> i32
 ! CHECK: %{{.*}} = fir.call @llvm.nvvm.barrier0.or(%c1_i32_2) fastmath<contract> : (i32) -> i32
+! CHECK: %{{.*}} = llvm.atomicrmw add  %{{.*}}, %{{.*}} seq_cst : !llvm.ptr, i32
+! CHECK: %{{.*}} = llvm.atomicrmw add  %{{.*}}, %{{.*}} seq_cst : !llvm.ptr, i64
+! CHECK: %{{.*}} = llvm.atomicrmw fadd %{{.*}}, %{{.*}} seq_cst : !llvm.ptr, f32
+! CHECK: %{{.*}} = llvm.atomicrmw fadd %{{.*}}, %{{.*}} seq_cst : !llvm.ptr, f64
 
 ! CHECK: func.func private @llvm.nvvm.barrier0()
 ! CHECK: func.func private @__syncwarp(!fir.ref<i32> {cuf.data_attr = #cuf.cuda<device>}) attributes {cuf.proc_attr = #cuf.cuda_proc<device>, fir.bindc_name = "__syncwarp", fir.proc_attrs = #fir.proc_attrs<bind_c>}
diff --git a/flang/test/Semantics/cuf-device-procedures01.cuf b/flang/test/Semantics/cuf-device-procedures01.cuf
index b9918d8a4ae4ce..92ee02bb3c64df 100644
--- a/flang/test/Semantics/cuf-device-procedures01.cuf
+++ b/flang/test/Semantics/cuf-device-procedures01.cuf
@@ -28,8 +28,17 @@ end
 ! CHECK: threadfence_system (Subroutine): Use from threadfence_system in cudadevice
 
 subroutine host()
+  real(4) :: af
+  real(8) :: ad
+  integer(4) :: ai
+  integer(8) :: al
   call syncthreads()
+  ai = atomicadd(ai, 1_4)
+  al = atomicadd(al, 1_8)
+  af = atomicadd(af, 1.0_4)
+  ad = atomicadd(ad, 1.0_8)
 end subroutine
 
 ! CHECK-LABEL: Subprogram scope: host
+! CHECK: atomicadd, EXTERNAL: HostAssoc{{$}}
 ! CHECK: syncthreads, EXTERNAL: HostAssoc{{$}}

clementval

LGTM. Just a nit on the naming.

clementval · 2025-01-21T23:02:23Z

flang/include/flang/Optimizer/Builder/IntrinsicCall.h

@@ -185,6 +185,7 @@ struct IntrinsicLibrary {
  mlir::Value genAnint(mlir::Type, llvm::ArrayRef<mlir::Value>);
  fir::ExtendedValue genAny(mlir::Type, llvm::ArrayRef<fir::ExtendedValue>);
  mlir::Value genAtanpi(mlir::Type, llvm::ArrayRef<mlir::Value>);
+  mlir::Value genAtomAdd(mlir::Type, llvm::ArrayRef<mlir::Value>);


Is there a reason to truncate Atomic to Atom?

The formatting. It will make each line become 3 lines. I found it a lot less readable.

Looks like it fits here. Are you talking about about the implementation or in the table?

As someone that spending a lot of time grepping, I would much rather having the full "atomic" spelled at the cost of extra lines in the table.

jeanPerier

The approach makes sense to me, although I am not expert enough with the stages after lowering to give great feedback regarding atomic.

The way it is plugged here LGTM. Few questions inlined.

jeanPerier · 2025-01-22T09:25:31Z

flang/include/flang/Optimizer/Builder/IntrinsicCall.h

@@ -185,6 +185,7 @@ struct IntrinsicLibrary {
  mlir::Value genAnint(mlir::Type, llvm::ArrayRef<mlir::Value>);
  fir::ExtendedValue genAny(mlir::Type, llvm::ArrayRef<fir::ExtendedValue>);
  mlir::Value genAtanpi(mlir::Type, llvm::ArrayRef<mlir::Value>);
+  mlir::Value genAtomAdd(mlir::Type, llvm::ArrayRef<mlir::Value>);


As someone that spending a lot of time grepping, I would much rather having the full "atomic" spelled at the cost of extra lines in the table.

jeanPerier · 2025-01-22T09:29:44Z

flang/lib/Optimizer/Builder/IntrinsicCall.cpp

+static mlir::Value genAtomBinOp(fir::FirOpBuilder &builder, mlir::Location &loc,
+                                mlir::LLVM::AtomicBinOp binOp, mlir::Value arg0,
+                                mlir::Value arg1) {
+  auto llvmPointerType = mlir::LLVM::LLVMPointerType::get(builder.getContext());


Out of curiosity, do you now if FIR alias analysis is clever enough to go through convert from this LLVM type and deduce what the llvm.atomicrmw is reading/writing not (or mainly, to deduce what it is not reading/writing to)?

It makes sense to me that we interoperate with LLVM pointer type to avoid redefining all the intrinsics at the FIR level, I just wonder if we need to improve AA here.

We always skip through fir.convert

.Case<fir::ConvertOp>([&](auto op) { // Skip ConvertOp's and track further through the operand. v = op->getOperand(0); defOp = v.getDefiningOp(); }) ...

I was trying to make sense of the MemoryEffects of LLVM_AtomicRMWOp but couldn't quite make sense of it:

def LLVM_AtomicRMWOp : LLVM_MemAccessOpBase<"atomicrmw", [ TypesMatchWith<"result #0 and operand #1 have the same type", "val", "res", "$_self">]> {

We talked about these ops in the office hours today and while we need the hooks in place to create the appropriate instructions we may eventually replace them with higher level atomic operations. TDB.

Right, the effect description is not straightforward. If found this patch that explains some of it: feb7bea

From what I understand, it is giving atomic operation a global write effects because the synchronization aspect (e.g., could write some global lock).

jeanPerier · 2025-01-22T09:36:49Z

flang/module/cudadevice.f90


+  interface atomicadd
+    attributes(device) pure integer function atomicaddi(address, val)
+  !dir$ ignore_tkr (rd) address, (d) val


I see that one is allowed to call atomicadd with an address of any rank ("r"). What is the semantic of calling them with an array? Is it that the first element is modified, or that all elements are modified?

Brent mentioned 1 element arrays but could not be definitive. I can remove it and maybe we will learn something.

jeanPerier · 2025-01-22T09:44:06Z

flang/lib/Optimizer/Builder/IntrinsicCall.cpp

@@ -147,6 +147,10 @@ static constexpr IntrinsicHandler handlers[]{
    {"atan2pi", &I::genAtanpi},
    {"atand", &I::genAtand},
    {"atanpi", &I::genAtanpi},
+    {"atomicaddd", &I::genAtomAdd, {{{"addr", asAddr}, {"v", asValue}}}, false},


At some point, we should maybe move the CUF intrinsic module procedures into there own table (a bit like what is done for ppc intrinsics).

This would clarify what are regular Fortran intrinsics expected to be intercepted here, and what are CUF extensions (and if well done, could speed-up the lookup by only looking into the right table based on the intrinsic module).

You can leave that alone, I will try to split the table later (maybe ieee proc could also be taken out).

simoatze · 2025-01-22T18:15:44Z

LGTM, thanks!

The PR follows the earlier #123840 PR for atomic operation support in CUF

The PR follows the earlier llvm/llvm-project#123840 PR for atomic operation support in CUF

Renaud-K added 2 commits January 21, 2025 14:38

Adding atomicadd as a cudadevice intrinsic and converting it LLVM dia…

44a05d0

…lect

Corrected includes added by clangd

18120ce

Renaud-K requested review from clementval and jeanPerier January 21, 2025 22:58

llvmbot added flang Flang issues not falling into any other category flang:fir-hlfir flang:semantics labels Jan 21, 2025

clementval approved these changes Jan 21, 2025

View reviewed changes

Renaud-K force-pushed the cuf-atomic branch from 8c40bd4 to 18120ce Compare January 22, 2025 01:51

jeanPerier changed the title ~~Adding atomicadd as a cudadevice intrinsic and converting it LLVM dialect~~ [Flang] Adding atomicadd as a cudadevice intrinsic and converting it LLVM dialect Jan 22, 2025

jeanPerier approved these changes Jan 22, 2025

View reviewed changes

clementval changed the title ~~[Flang] Adding atomicadd as a cudadevice intrinsic and converting it LLVM dialect~~ [flang][cuda] Adding atomicadd as a cudadevice intrinsic and converting it LLVM dialect Jan 22, 2025

Review feedback

e6b1268

Renaud-K merged commit 652ff20 into llvm:main Jan 23, 2025
8 checks passed

Renaud-K deleted the cuf-atomic branch January 28, 2025 01:58

Renaud-K mentioned this pull request Jan 28, 2025

[flang][cuda] Adding support for more atomic calls #124671

Merged

Renaud-K added a commit that referenced this pull request Jan 28, 2025

[flang][cuda] Adding support for more atomic calls (#124671)

56a0a7f

The PR follows the earlier #123840 PR for atomic operation support in CUF

github-actions bot pushed a commit to arm/arm-toolchain that referenced this pull request Jan 28, 2025

Automerge: [flang][cuda] Adding support for more atomic calls (#124671)

6b5d283

The PR follows the earlier llvm/llvm-project#123840 PR for atomic operation support in CUF

[flang][cuda] Adding atomicadd as a cudadevice intrinsic and converting it LLVM dialect #123840

[flang][cuda] Adding atomicadd as a cudadevice intrinsic and converting it LLVM dialect #123840

Uh oh!

Conversation

Renaud-K commented Jan 21, 2025

Uh oh!

llvmbot commented Jan 21, 2025 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Uh oh!

clementval left a comment

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

jeanPerier Jan 22, 2025 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Choose a reason for hiding this comment

Uh oh!

jeanPerier left a comment

Choose a reason for hiding this comment

Uh oh!

jeanPerier Jan 22, 2025 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Renaud-K Jan 22, 2025 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

simoatze commented Jan 22, 2025

Uh oh!

Uh oh!

Uh oh!

llvmbot commented Jan 21, 2025 •

edited

Loading

jeanPerier Jan 22, 2025 •

edited

Loading

jeanPerier Jan 22, 2025 •

edited

Loading

Renaud-K Jan 22, 2025 •

edited

Loading