llvm · kaviya2510 · Apr 30, 2025 · Dec 6, 2024 · Mar 5, 2025 · Mar 12, 2025
diff --git a/flang/include/flang/Optimizer/Dialect/FIROps.td b/flang/include/flang/Optimizer/Dialect/FIROps.td
@@ -305,7 +305,7 @@ def fir_LoadOp : fir_OneResultOp<"load", [FirAliasTagOpInterface,
   }];
 
   let arguments = (ins AnyReferenceLike:$memref,
-                  OptionalAttr<LLVM_TBAATagArrayAttr>:$tbaa);
+      OptionalAttr<LLVM_TBAATagArrayAttr>:$tbaa, UnitAttr:$nontemporal);
 
   let builders = [OpBuilder<(ins "mlir::Value":$refVal)>,
                   OpBuilder<(ins "mlir::Type":$resTy, "mlir::Value":$refVal)>];
@@ -337,9 +337,8 @@ def fir_StoreOp : fir_Op<"store", [FirAliasTagOpInterface,
     `%p`, is undefined or null.
   }];
 
-  let arguments = (ins AnyType:$value,
-                   AnyReferenceLike:$memref,
-                   OptionalAttr<LLVM_TBAATagArrayAttr>:$tbaa);
+  let arguments = (ins AnyType:$value, AnyReferenceLike:$memref,
+      OptionalAttr<LLVM_TBAATagArrayAttr>:$tbaa, UnitAttr:$nontemporal);
 
   let builders = [OpBuilder<(ins "mlir::Value":$value, "mlir::Value":$memref)>];
 

diff --git a/flang/include/flang/Optimizer/OpenMP/Passes.td b/flang/include/flang/Optimizer/OpenMP/Passes.td
@@ -81,6 +81,13 @@ def DoConcurrentConversionPass : Pass<"omp-do-concurrent-conversion", "mlir::fun
   ];
 }
 
+def LowerNontemporalPass : Pass<"lower-nontemporal", "mlir::func::FuncOp"> {
+  let summary =
+      "Adds nontemporal attribute to loads and stores performed on "
+      "the list items specified in the nontemporal clause of omp.simd.";
+  let dependentDialects = ["mlir::omp::OpenMPDialect"];
+}
+
 // Needs to be scheduled on Module as we create functions in it
 def LowerWorkshare : Pass<"lower-workshare", "::mlir::ModuleOp"> {
   let summary = "Lower workshare construct";

diff --git a/flang/lib/Optimizer/CodeGen/CodeGen.cpp b/flang/lib/Optimizer/CodeGen/CodeGen.cpp
@@ -3569,8 +3569,13 @@ struct StoreOpConversion : public fir::FIROpConversion<fir::StoreOp> {
     } else {
       mlir::LLVM::StoreOp storeOp =
           rewriter.create<mlir::LLVM::StoreOp>(loc, llvmValue, llvmMemref);
+
       if (isVolatile)
         storeOp.setVolatile_(true);
+
+      if (store.getNontemporal())
+        storeOp.setNontemporal(true);
+
       newOp = storeOp;
     }
     if (std::optional<mlir::ArrayAttr> optionalTag = store.getTbaa())

diff --git a/flang/lib/Optimizer/OpenMP/CMakeLists.txt b/flang/lib/Optimizer/OpenMP/CMakeLists.txt
@@ -8,6 +8,7 @@ add_flang_library(FlangOpenMPTransforms
   MapInfoFinalization.cpp
   MarkDeclareTarget.cpp
   LowerWorkshare.cpp
+  LowerNontemporal.cpp
 
   DEPENDS
   FIRDialect
@@ -17,7 +18,7 @@ add_flang_library(FlangOpenMPTransforms
   LINK_LIBS
   FIRAnalysis
   FIRBuilder
-  FIRCodeGen
+  FIRCodeGenDialect
   FIRDialect
   FIRDialectSupport
   FIRSupport

diff --git a/flang/lib/Optimizer/OpenMP/LowerNontemporal.cpp b/flang/lib/Optimizer/OpenMP/LowerNontemporal.cpp
@@ -0,0 +1,88 @@
+//===- LowerNontemporal.cpp -------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Add the nontemporal attribute to the fir.load and fir.store operations that
+// access variables listed in the nontemporal clause of an omp.simd construct.
+//
+//===----------------------------------------------------------------------===//
+
+#include "flang/Optimizer/Dialect/FIRCG/CGOps.h"
+#include "flang/Optimizer/Dialect/FIROpsSupport.h"
+#include "flang/Optimizer/OpenMP/Passes.h"
+#include "mlir/Dialect/OpenMP/OpenMPDialect.h"
+#include "llvm/ADT/TypeSwitch.h"
+
+using namespace mlir;
+
+namespace flangomp {
+#define GEN_PASS_DEF_LOWERNONTEMPORALPASS
+#include "flang/Optimizer/OpenMP/Passes.h.inc"
+} // namespace flangomp
+
+namespace {
+/// Propagates the nontemporal clause of omp.simd to the fir.load and fir.store
+/// operations nested in the construct.
+class LowerNontemporalPass
+    : public flangomp::impl::LowerNontemporalPassBase<LowerNontemporalPass> {
+  /// Follow the defining operations of \p operand through array coordinate
+  /// computations, loads and box address extractions, and return the value
+  /// the chain starts from. The search stops at a block argument or at the
+  /// first operation of any other kind.
+  static mlir::Value getBaseOperand(mlir::Value operand) {
+    while (Operation *defOp = operand.getDefiningOp()) {
+      mlir::Value source =
+          llvm::TypeSwitch<Operation *, mlir::Value>(defOp)
+              .Case<fir::ArrayCoorOp, fir::cg::XArrayCoorOp, fir::LoadOp>(
+                  [](auto op) -> mlir::Value { return op.getMemref(); })
+              .Case<fir::BoxAddrOp>(
+                  [](auto op) -> mlir::Value { return op.getVal(); })
+              .Default([](Operation *) { return mlir::Value{}; });
+      // An operation that is not looked through ends the search.
+      if (!source)
+        break;
+      operand = source;
+    }
+    return operand;
+  }
+
+  /// Set the nontemporal attribute on every fir.load and fir.store nested in
+  /// \p simdOp whose address is derived from one of its nontemporal items.
+  void addNonTemporalAttr(omp::SimdOp simdOp) {
+    auto nontemporalVars = simdOp.getNontemporalVars();
+    if (nontemporalVars.empty())
+      return;
+
+    // True when `memref` resolves to one of the nontemporal list items.
+    auto isNontemporalAccess = [&](mlir::Value memref) {
+      // Skip load and store operations involving boxes (allocatable or pointer
+      // types).
+      if (fir::isAllocatableType(memref.getType()) ||
+          fir::isPointerType(memref.getType()))
+        return false;
+      return llvm::is_contained(nontemporalVars, getBaseOperand(memref));
+    };
+
+    // TODO: Handle the nontemporal clause inside atomic constructs.
+    simdOp->walk([&](Operation *op) {
+      if (auto loadOp = llvm::dyn_cast<fir::LoadOp>(op)) {
+        if (isNontemporalAccess(loadOp.getMemref()))
+          loadOp.setNontemporal(true);
+      } else if (auto storeOp = llvm::dyn_cast<fir::StoreOp>(op)) {
+        if (isNontemporalAccess(storeOp.getMemref()))
+          storeOp.setNontemporal(true);
+      }
+    });
+  }
+
+  /// Process every omp.simd nested in the operation this pass runs on.
+  void runOnOperation() override {
+    Operation *op = getOperation();
+    op->walk([&](omp::SimdOp simdOp) { addNonTemporalAttr(simdOp); });
+  }
+};
+} // namespace
diff --git a/flang/lib/Optimizer/Passes/Pipelines.cpp b/flang/lib/Optimizer/Passes/Pipelines.cpp
@@ -347,6 +347,11 @@ void createDefaultFIRCodeGenPassPipeline(mlir::PassManager &pm,
        config.ApproxFuncFPMath, config.NoSignedZerosFPMath, config.UnsafeFPMath,
        ""}));
 
+  if (config.EnableOpenMP) {
+    pm.addNestedPass<mlir::func::FuncOp>(
+        flangomp::createLowerNontemporalPass());
+  }
+
   fir::addFIRToLLVMPass(pm, config);
 }
 

diff --git a/flang/test/Fir/basic-program.fir b/flang/test/Fir/basic-program.fir
@@ -149,6 +149,7 @@ func.func @_QQmain() {
 // PASSES-NEXT: CompilerGeneratedNamesConversion
 // PASSES-NEXT: 'func.func' Pipeline
 // PASSES-NEXT:  FunctionAttr
+// PASSES-NEXT:  LowerNontemporalPass
 // PASSES-NEXT: FIRToLLVMLowering
 // PASSES-NEXT: ReconcileUnrealizedCasts
 // PASSES-NEXT: LLVMIRLoweringPass
diff --git a/flang/test/Fir/convert-nontemporal-to-llvm.fir b/flang/test/Fir/convert-nontemporal-to-llvm.fir
@@ -0,0 +1,111 @@
+// Test lowering of nontemporal fir.load and fir.store to the LLVM dialect.
+// RUN: fir-opt --fir-to-llvm-ir %s | FileCheck %s --check-prefixes=CHECK-LABEL,CHECK
+
+// CHECK-LABEL:  llvm.func @_QPtest() 
+// CHECK:    %[[CONST_VAL:.*]] = llvm.mlir.constant(1 : i64) : i64
+// CHECK:    %[[VAL1:.*]] = llvm.alloca %[[CONST_VAL]] x i32 {bindc_name = "n"} : (i64) -> !llvm.ptr
+// CHECK:    %[[CONST_VAL1:.*]] = llvm.mlir.constant(1 : i64) : i64
+// CHECK:    %[[VAL2:.*]] = llvm.alloca %[[CONST_VAL1]] x i32 {bindc_name = "i"} : (i64) -> !llvm.ptr
+// CHECK:    %[[CONST_VAL2:.*]] = llvm.mlir.constant(1 : i64) : i64
+// CHECK:    %[[VAL3:.*]] = llvm.alloca %[[CONST_VAL2]] x i32 {bindc_name = "c"} : (i64) -> !llvm.ptr
+// CHECK:    %[[CONST_VAL3:.*]] = llvm.mlir.constant(1 : i64) : i64
+// CHECK:    %[[VAL4:.*]] = llvm.alloca %[[CONST_VAL3]] x i32 {bindc_name = "b"} : (i64) -> !llvm.ptr
+// CHECK:    %[[CONST_VAL4:.*]] = llvm.mlir.constant(1 : i64) : i64
+// CHECK:    %[[VAL5:.*]] = llvm.alloca %[[CONST_VAL4]] x i32 {bindc_name = "a"} : (i64) -> !llvm.ptr
+// CHECK:    %[[CONST_VAL5:.*]] = llvm.mlir.constant(1 : i32) : i32
+// CHECK:    %[[VAL6:.*]] = llvm.load %[[VAL1]] : !llvm.ptr -> i32
+// CHECK:    omp.simd nontemporal(%[[VAL5]], %[[VAL3]] : !llvm.ptr, !llvm.ptr) private(@_QFtestEi_private_i32 %[[VAL2]] -> %arg0 : !llvm.ptr) {
+// CHECK:      omp.loop_nest (%{{.*}}) : i32 = (%[[CONST_VAL5]]) to (%[[VAL6]]) inclusive step (%[[CONST_VAL5]]) {
+// CHECK:        llvm.store %{{.*}}, %{{.*}} : i32, !llvm.ptr
+// CHECK:        %[[VAL8:.*]] = llvm.load %[[VAL5]] {nontemporal} : !llvm.ptr -> i32
+// CHECK:        %[[VAL9:.*]] = llvm.load %[[VAL4]] : !llvm.ptr -> i32
+// CHECK:        %[[VAL10:.*]] = llvm.add %[[VAL8]], %[[VAL9]] : i32
+// CHECK:        llvm.store %[[VAL10]], %[[VAL3]] {nontemporal} : i32, !llvm.ptr
+// CHECK:        omp.yield
+// CHECK:      }
+// CHECK:    }
+
+ func.func @_QPtest() {
+    %c1_i32 = arith.constant 1 : i32
+    %0 = fir.alloca i32 {bindc_name = "a", uniq_name = "_QFtestEa"}
+    %1 = fir.alloca i32 {bindc_name = "b", uniq_name = "_QFtestEb"}
+    %2 = fir.alloca i32 {bindc_name = "c", uniq_name = "_QFtestEc"}
+    %3 = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFtestEi"}
+    %4 = fir.alloca i32 {bindc_name = "n", uniq_name = "_QFtestEn"}
+    %5 = fir.load %4 : !fir.ref<i32>
+    omp.simd nontemporal(%0, %2 : !fir.ref<i32>, !fir.ref<i32>) private(@_QFtestEi_private_i32 %3 -> %arg0 : !fir.ref<i32>) {
+      omp.loop_nest (%arg1) : i32 = (%c1_i32) to (%5) inclusive step (%c1_i32) {
+        fir.store %arg1 to %arg0 : !fir.ref<i32>
+        %6 = fir.load %0 {nontemporal}: !fir.ref<i32>
+        %7 = fir.load %1 : !fir.ref<i32>
+        %8 = arith.addi %6, %7 : i32
+        fir.store %8 to %2 {nontemporal} : !fir.ref<i32>
+        omp.yield
+      }
+    }
+    return
+  }
+
+// CHECK-LABEL:  llvm.func @_QPsimd_nontemporal_allocatable
+// CHECK:    %[[CONST_VAL:.*]] = llvm.mlir.constant(1 : i64) : i64
+// CHECK:    %[[ALLOCA2:.*]] = llvm.alloca %[[CONST_VAL]] x i32 {bindc_name = "i"} : (i64) -> !llvm.ptr
+// CHECK:    %[[IDX_VAL:.*]] = llvm.mlir.constant(1 : i32) : i32
+// CHECK:    %[[CONST_VAL1:.*]] = llvm.mlir.constant(0 : index) : i64
+// CHECK:    %[[END_IDX:.*]] = llvm.mlir.constant(100 : i32) : i32
+// CHECK:    omp.simd nontemporal(%[[ARG0:.*]] : !llvm.ptr) private(@_QFsimd_nontemporal_allocatableEi_private_i32 %[[ALLOCA2]] -> %[[ARG2:.*]] : !llvm.ptr) {
+// CHECK:      omp.loop_nest (%[[ARG3:.*]]) : i32 = (%[[IDX_VAL]]) to (%[[END_IDX]]) inclusive step (%[[IDX_VAL]]) {
+// CHECK:        llvm.store %[[ARG3]], %[[ARG2]] : i32, !llvm.ptr
+// CHECK:        %[[CONST_VAL2:.*]] = llvm.mlir.constant(48 : i32) : i32
+// CHECK:        "llvm.intr.memcpy"(%[[ALLOCA1:.*]], %[[ARG0]], %[[CONST_VAL2]]) <{isVolatile = false}> : (!llvm.ptr, !llvm.ptr, i32) -> ()
+// CHECK:        %[[VAL1:.*]] = llvm.load %[[ARG2]] : !llvm.ptr -> i32
+// CHECK:        %[[VAL2:.*]] = llvm.sext %[[VAL1]] : i32 to i64
+// CHECK:        %[[VAL3:.*]] = llvm.getelementptr %[[ALLOCA1]][0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)>
+// CHECK:        %[[VAL4:.*]] = llvm.load %[[VAL3]] : !llvm.ptr -> !llvm.ptr
+// CHECK:        %[[VAL5:.*]] = llvm.getelementptr %[[ALLOCA1]][0, 7, %[[CONST_VAL1]], 0] : (!llvm.ptr, i64) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)>
+// CHECK:        %[[VAL6:.*]] = llvm.load %[[VAL5]] : !llvm.ptr -> i64
+// CHECK:        %[[VAL7:.*]] = llvm.getelementptr %[[ALLOCA1]][0, 7, %[[CONST_VAL1]], 1] : (!llvm.ptr, i64) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)>
+// CHECK:        %[[VAL8:.*]] = llvm.load %[[VAL7]] : !llvm.ptr -> i64
+// CHECK:        %[[VAL10:.*]] = llvm.mlir.constant(1 : i64) : i64
+// CHECK:        %[[VAL11:.*]] = llvm.mlir.constant(0 : i64) : i64
+// CHECK:        %[[VAL12:.*]] = llvm.sub %[[VAL2]], %[[VAL6]] overflow<nsw> : i64
+// CHECK:        %[[VAL13:.*]] = llvm.mul %[[VAL12]], %[[VAL10]] overflow<nsw> : i64
+// CHECK:        %[[VAL14:.*]] = llvm.mul %[[VAL13]], %[[VAL10]] overflow<nsw> : i64
+// CHECK:        %[[VAL15:.*]] = llvm.add %[[VAL14]], %[[VAL11]] overflow<nsw> : i64
+// CHECK:        %[[VAL16:.*]] = llvm.mul %[[VAL10]], %[[VAL8]] overflow<nsw> : i64
+// CHECK:        %[[VAL17:.*]] = llvm.getelementptr %[[VAL4]][%[[VAL15]]] : (!llvm.ptr, i64) -> !llvm.ptr, i32
+// CHECK:        %[[VAL18:.*]] = llvm.load %[[VAL17]] {nontemporal} : !llvm.ptr -> i32
+// CHECK:        %[[VAL19:.*]] = llvm.load %{{.*}} : !llvm.ptr -> i32
+// CHECK:        %[[VAL20:.*]] = llvm.add %[[VAL18]], %[[VAL19]] : i32
+// CHECK:        llvm.store %[[VAL20]], %[[VAL17]] {nontemporal} : i32, !llvm.ptr
+// CHECK:        omp.yield
+// CHECK:      }
+// CHECK:    }
+// CHECK:    llvm.return
+
+  func.func @_QPsimd_nontemporal_allocatable(%arg0: !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>> {fir.bindc_name = "x"}, %arg1: !fir.ref<i32> {fir.bindc_name = "y"}) {
+   %c100 = arith.constant 100 : index
+   %c1_i32 = arith.constant 1 : i32
+    %c0 = arith.constant 0 : index
+    %c100_i32 = arith.constant 100 : i32
+    %0 = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFsimd_nontemporal_allocatableEi"}
+    %1 = fir.allocmem !fir.array<?xi32>, %c100 {fir.must_be_heap = true, uniq_name = "_QFsimd_nontemporal_allocatableEx.alloc"}
+    %2 = fircg.ext_embox %1(%c100) : (!fir.heap<!fir.array<?xi32>>, index) -> !fir.box<!fir.heap<!fir.array<?xi32>>>
+    fir.store %2 to %arg0 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>
+    omp.simd nontemporal(%arg0 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>) private(@_QFsimd_nontemporal_allocatableEi_private_i32 %0 -> %arg2 : !fir.ref<i32>) {
+      omp.loop_nest (%arg3) : i32 = (%c1_i32) to (%c100_i32) inclusive step (%c1_i32) {
+        fir.store %arg3 to %arg2 : !fir.ref<i32>
+        %7 = fir.load %arg0 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>
+        %8 = fir.load %arg2 : !fir.ref<i32>
+        %9 = fir.convert %8 : (i32) -> i64
+        %10 = fir.box_addr %7 : (!fir.box<!fir.heap<!fir.array<?xi32>>>) -> !fir.heap<!fir.array<?xi32>>
+        %11:3 = fir.box_dims %7, %c0 : (!fir.box<!fir.heap<!fir.array<?xi32>>>, index) -> (index, index, index)
+        %12 = fircg.ext_array_coor %10(%11#1) origin %11#0<%9> : (!fir.heap<!fir.array<?xi32>>, index, index, i64) -> !fir.ref<i32>
+        %13 = fir.load %12 {nontemporal} : !fir.ref<i32> 
+        %14 = fir.load %arg1 : !fir.ref<i32>
+        %15 = arith.addi %13, %14 : i32
+        fir.store %15 to %12 {nontemporal} : !fir.ref<i32>
+        omp.yield
+      }
+    }
+    return
+  }