intel · martygrant · Sep 4, 2024 · Jul 10, 2024 · Jul 11, 2024 · Jul 15, 2024
@@ -6851,6 +6851,10 @@ class OffloadingActionBuilder final {
   /// Offload deps output is then forwarded to active device action builders so
   /// they can add it to the device linker inputs.
   void addDeviceLinkDependenciesFromHost(ActionList &LinkerInputs) {
+    if (isSYCLNativeCPU(C.getArgs())) {
+      // SYCL Native CPU doesn't need deps from clang-offload-deps.
+      return;
+    }
     // Link image for reading dependencies from it.
     auto *LA = C.MakeAction<LinkJobAction>(LinkerInputs,
                                            types::TY_Host_Dependencies_Image);

@@ -10809,7 +10809,7 @@ static bool shouldEmitOnlyKernelsAsEntryPoints(const ToolChain &TC,
                      options::OPT_fsycl_remove_unused_external_funcs, false))
     return false;
   if (isSYCLNativeCPU(TC))
-    return false;
+    return true;
   // When supporting dynamic linking, non-kernels in a device image can be
   // called.
   if (supportDynamicLinking(TCArgs))

@@ -48,7 +48,6 @@
 //CHECK_INVO:{{.*}}clang{{.*}}"-fsycl-is-host"{{.*}}
 //CHECK_INVO:{{.*}}clang{{.*}}"-x" "ir"
 //CHECK_INVO:{{.*}}sycl-post-link{{.*}}"-emit-program-metadata"
-//CHECK_INVO-NOT:{{.*}}sycl-post-link{{.*}}-emit-only-kernels-as-entry-points
 
 // checks that the device and host triple is correct in the generated actions when it is set explicitly
 //CHECK_ACTIONS-AARCH64:        +- 5: offload, "host-sycl (aarch64-unknown-linux-gnu)" {1}, "device-sycl (aarch64-unknown-linux-gnu)" {4}, c++-cpp-output

@@ -145,6 +145,7 @@ DefineGOp1(All, __mux_sub_group_all_i1)
 
 #define DefineFPGOps(Name, MuxName)                                            \
   DefineGOp(float, float, Name, MuxName##32)                                   \
+  DefineGOp(_Float16 , _Float16 , Name, MuxName##16)                           \
   DefineGOp(double, double, Name, MuxName##64)
 
 DefineIntGOps(IAdd,    add_i)
@@ -170,16 +171,23 @@ DefineBitwiseGroupOp(uint32_t, int32_t, i32)
 DefineBitwiseGroupOp(int64_t, int64_t, i64)
 DefineBitwiseGroupOp(uint64_t, int64_t, i64)
 
-#define DefineBroadCastImpl(Type, Sfx, MuxType, IDType)                        \
-  DEVICE_EXTERN_C MuxType __mux_work_group_broadcast_##Sfx(                    \
-      int32_t id, MuxType val, int64_t lidx, int64_t lidy, int64_t lidz);      \
-  DEVICE_EXTERN_C MuxType __mux_sub_group_broadcast_##Sfx(MuxType val,         \
-                                                          int32_t sg_lid);     \
-  DEVICE_EXTERNAL Type __spirv_GroupBroadcast(uint32_t g, Type v,              \
-                                              IDType l) {                      \
-    if (__spv::Scope::Flag::Subgroup == g)                                     \
-      return __mux_sub_group_broadcast_##Sfx(v, l);                            \
-    return Type(); /*todo: add support for other flags as they are tested*/    \
+#define DefineLogicalGroupOp(Type, MuxType, mux_sfx)                          \
+  DefineGOp(Type, MuxType, LogicalOrKHR, logical_or_##mux_sfx)                \
+  DefineGOp(Type, MuxType, LogicalXorKHR, logical_xor_##mux_sfx)              \
+  DefineGOp(Type, MuxType, LogicalAndKHR, logical_and_##mux_sfx)
+
+DefineLogicalGroupOp(bool, bool, i1)
+
+#define DefineBroadCastImpl(Type, Sfx, MuxType, IDType)                       \
+  DEVICE_EXTERN_C MuxType __mux_work_group_broadcast_##Sfx(                   \
+      int32_t id, MuxType val, int64_t lidx, int64_t lidy, int64_t lidz);     \
+  DEVICE_EXTERN_C MuxType __mux_sub_group_broadcast_##Sfx(MuxType val,        \
+                                                          int32_t sg_lid);    \
+  DEVICE_EXTERNAL Type __spirv_GroupBroadcast(uint32_t g, Type v,             \
+                                              IDType l) {                     \
+    if (__spv::Scope::Flag::Subgroup == g)                                    \
+      return __mux_sub_group_broadcast_##Sfx(v, l);                           \
+    return Type(); /*todo: add support for other flags as they are tested*/   \
   }
 
 #define DefineBroadcastMuxType(Type, Sfx, MuxType, IDType)                    \

@@ -12,9 +12,12 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/SYCLLowerIR/PrepareSYCLNativeCPU.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/StringRef.h"
 #include "llvm/IR/BasicBlock.h"
 #include "llvm/IR/Constant.h"
 #include "llvm/IR/DebugInfoMetadata.h"
+#include "llvm/IR/GlobalValue.h"
 #include "llvm/IR/PassManager.h"
 #include "llvm/SYCLLowerIR/SYCLUtils.h"
 
@@ -23,7 +26,6 @@
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/IR/Attributes.h"
 #include "llvm/IR/CallingConv.h"
-#include "llvm/IR/Constants.h"
 #include "llvm/IR/DerivedTypes.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/Instruction.h"
@@ -35,13 +37,13 @@
 #include "llvm/Support/Casting.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Transforms/Utils/Cloning.h"
+#include "llvm/Transforms/Utils/GlobalStatus.h"
 #include "llvm/Transforms/Utils/ValueMapper.h"
 #include <utility>
 #include <vector>
 
 #ifdef NATIVECPU_USE_OCK
 #include "compiler/utils/attributes.h"
-#include "compiler/utils/builtin_info.h"
 #include "compiler/utils/metadata.h"
 #endif
 
@@ -331,31 +333,85 @@ PreservedAnalyses PrepareSYCLNativeCPUPass::run(Module &M,
     UsedBuiltins.push_back({Glob, Entry.second});
   }
 
-  SmallVector<Function *> NewKernels;
-  for (auto &OldF : OldKernels) {
 #ifdef NATIVECPU_USE_OCK
-    auto Name = compiler::utils::getBaseFnNameOrFnName(*OldF);
-    OldF->setName(Name);
-    // if vectorization occurred, at this point we have a wrapper function that
-    // runs the vectorized kernel and peels using the scalar kernel. We make it
-    // so this wrapper steals the original kernel name.
-    std::optional<compiler::utils::LinkMetadataResult> veczR =
-        compiler::utils::parseVeczToOrigFnLinkMetadata(*OldF);
-    if (veczR && veczR.value().first) {
-      auto ScalarF = veczR.value().first;
-      OldF->takeName(ScalarF);
-      ScalarF->setName(OldF->getName() + "_scalar");
-    } else if (Name != OldF->getName()) {
-      auto RealKernel = M.getFunction(Name);
-      if (RealKernel) {
-        // the real kernel was not inlined in the wrapper, steal its name
-        OldF->takeName(RealKernel);
+  {
+    SmallSet<Function *, 5> RemovableFuncs;
+    SmallVector<Function *, 5> WrapperFuncs;
+
+    // Retrieve the wrapper functions created by the WorkItemLoop pass.
+    for (auto &OldF : OldKernels) {
+      std::optional<compiler::utils::LinkMetadataResult> VeczR =
+          compiler::utils::parseVeczToOrigFnLinkMetadata(*OldF);
+      if (VeczR && VeczR.value().first) {
+        WrapperFuncs.push_back(OldF);
       } else {
-        // the real kernel has been inlined, just use the name
-        OldF->setName(Name);
+        auto Name = compiler::utils::getBaseFnNameOrFnName(*OldF);
+        if (Name != OldF->getName()) {
+          WrapperFuncs.push_back(OldF);
+        }
       }
     }
+
+    for (auto &OldF : WrapperFuncs) {
+      // If vectorization occurred, at this point we have a wrapper function
+      // that runs the vectorized kernel and peels using the scalar kernel. We
+      // make it so this wrapper steals the original kernel name.
+      std::optional<compiler::utils::LinkMetadataResult> VeczR =
+          compiler::utils::parseVeczToOrigFnLinkMetadata(*OldF);
+      if (VeczR && VeczR.value().first) {
+        auto ScalarF = VeczR.value().first;
+        OldF->takeName(ScalarF);
+        if (ScalarF->use_empty())
+          RemovableFuncs.insert(ScalarF);
+      } else {
+        // The WorkItemLoops pass created a wrapper function for the original
+        // kernel. If we have a kernel named foo(), the wrapper will be called
+        // foo-wrapper(), and will have the original kernel name retrieved by
+        // getBaseFnNameOrFnName. We set the name of the wrapper function
+        // to the original kernel name and add the original kernel to the
+        // list of functions that can be removed from the module.
+        auto Name = compiler::utils::getBaseFnNameOrFnName(*OldF);
+        Function *OrigF = M.getFunction(Name);
+        if (OrigF != nullptr) {
+          // The original kernel is inlined by the WorkItemLoops
+          // pass if it contained barriers or group collectives, otherwise
+          // we don't want to (and can't) remove it.
+          if (OrigF->use_empty())
+            RemovableFuncs.insert(OrigF);
+          OldF->takeName(OrigF);
+        } else {
+          OldF->setName(Name);
+        }
+      }
+    }
+
+    // Find any left over SYCL_EXTERNAL function that has no more uses
+    std::set<Function *> Kernelset(OldKernels.begin(), OldKernels.end());
+    for (auto &F : M) {
+      if (Kernelset.count(&F) == 0 &&
+          F.hasFnAttribute(sycl::utils::ATTR_SYCL_MODULE_ID) && F.use_empty() &&
+          !F.getName().starts_with("__dpcpp_nativecpu")) {
+        // SYCL_EXTERNAL functions end up in static array of function pointers,
+        // at this point we can remove them from the array and remove the
+        // function if no other uses are left.
+        RemovableFuncs.insert(&F);
+      }
+    }
+
+    // Remove unused functions. This is necessary in case they still contain
+    // calls to group collective functions that haven't been processed by the
+    // work item loops pass, which will lead to linker errors.
+    llvm::erase_if(OldKernels,
+                   [&](Function *F) { return RemovableFuncs.contains(F); });
+
+    for (Function *F : RemovableFuncs) {
+      F->eraseFromParent();
+    }
+  }
 #endif
+
+  SmallVector<Function *> NewKernels;
+  for (auto &OldF : OldKernels) {
     auto *NewF =
         cloneFunctionAndAddParam(OldF, StatePtrType, CurrentStatePointerTLS);
     NewF->takeName(OldF);
@@ -416,54 +472,26 @@ PreservedAnalyses PrepareSYCLNativeCPUPass::run(Module &M,
       OldI->replaceAllUsesWith(NewI);
       OldI->eraseFromParent();
     }
-    for (auto temp : ToRemove2)
-      temp->eraseFromParent();
+    for (auto Temp : ToRemove2)
+      Temp->eraseFromParent();
 
     // Finally, we erase the builtin from the module
     Glob->eraseFromParent();
   }
 
-#ifdef NATIVECPU_USE_OCK
-  // Define __mux_mem_barrier here using the OCK
-  compiler::utils::BuiltinInfo BI;
-  for (auto &F : M) {
-    if (F.getName() == compiler::utils::MuxBuiltins::mem_barrier) {
-      BI.defineMuxBuiltin(compiler::utils::BaseBuiltinID::eMuxBuiltinMemBarrier,
-                          M);
-    }
-  }
-  // if we find calls to mux barrier now, it means that we had SYCL_EXTERNAL
-  // functions that called __mux_work_group_barrier, which didn't get processed
-  // by the WorkItemLoop pass. This means that the actual function call has been
-  // inlined into the kernel, and the call to __mux_work_group_barrier has been
-  // removed in the inlined call, but not in the original function. The original
-  // function will not be executed (since it has been inlined) and so we can
-  // just define __mux_work_group_barrier as a no-op to avoid linker errors.
-  // Todo: currently we can't remove the function here even if it has no uses,
-  // because we may still emit a declaration for it in the offload-wrapper.
-  auto BarrierF =
-      M.getFunction(compiler::utils::MuxBuiltins::work_group_barrier);
-  if (BarrierF && BarrierF->isDeclaration()) {
-    IRBuilder<> Builder(M.getContext());
-    auto BB = BasicBlock::Create(M.getContext(), "noop", BarrierF);
-    Builder.SetInsertPoint(BB);
-    Builder.CreateRetVoid();
-  }
-#endif
-
-  // removing unused builtins
+  // Removing unused builtins
   SmallVector<Function *> UnusedLibBuiltins;
   for (auto &F : M) {
     if (IsUnusedBuiltinOrPrivateDef(F)) {
       UnusedLibBuiltins.push_back(&F);
     }
   }
-  for (Function *f : UnusedLibBuiltins) {
-    f->eraseFromParent();
+  for (Function *F : UnusedLibBuiltins) {
+    F->eraseFromParent();
     ModuleChanged = true;
   }
-  for (auto it = M.begin(); it != M.end();) {
-    auto Curr = it++;
+  for (auto It = M.begin(); It != M.end();) {
+    auto Curr = It++;
     Function &F = *Curr;
     if (F.getNumUses() == 0 && F.isDeclaration() &&
         F.getName().starts_with("__mux_")) {

@@ -0,0 +1,119 @@
+// REQUIRES: native_cpu_ock
+
+// Tests that no linker errors occur when group collective functions are used
+// in conjuction with SYCL_EXTERNAL.
+
+// RUN: %clangxx -fsycl -fsycl-targets=native_cpu -DFILE1 -c -o %t1.o %s
+// RUN: %clangxx -fsycl -fsycl-targets=native_cpu -DFILE2 -c -o %t2.o %s
+// RUN: llvm-ar crv %t1.a %t1.o
+// RUN: %clangxx -fsycl -fsycl-targets=native_cpu %t2.o %t1.a -o %t.out
+// RUN: env ONEAPI_DEVICE_SELECTOR=native_cpu:cpu %t.out
+
+/*
+    test performs a lattice reduction.
+    sycl::vec<float> is sensitive to .get_size() vs .size() in SYCL headers
+    (ie, byte size versus vector size)
+*/
+
+#include <sycl/detail/core.hpp>
+#include <sycl/group_algorithm.hpp>
+#include <sycl/usm.hpp>
+
+using namespace sycl;
+
+#define NX 32
+#define NZ 2
+#define NV 8
+using vecn = sycl::vec<float, NV>; // 8 floats
+#ifdef FILE1
+
+SYCL_EXTERNAL void groupSum(vecn *r, const vecn &in, const int k,
+                            sycl::group<2> &grp, const int i) {
+
+  vecn tin = (i == k ? in : vecn(0));
+  auto out = reduce_over_group(grp, tin, sycl::plus<>());
+  if (i == k && grp.get_local_id()[1] == 0)
+    r[k] = out;
+}
+#endif
+
+#ifdef FILE2
+SYCL_EXTERNAL void groupSum(vecn *r, const vecn &in, const int k,
+                            sycl::group<2> &grp, const int i);
+void test(queue q, float *r, float *x,
+          int n) { // r is 16 floats, x is 256 floats. n is 256
+
+  sycl::range<2> globalSize(NZ, NX); // 2,32
+  sycl::range<2> localSize(1, NX);   // 1,8       so 16 iterations
+  sycl::nd_range<2> range{globalSize, localSize};
+
+  q.submit([&](sycl::handler &h) {
+    h.parallel_for<>(range, [=](sycl::nd_item<2> ndi) {
+      int i = ndi.get_global_id(1);
+      int k = ndi.get_global_id(0);
+
+      auto vx = reinterpret_cast<vecn *>(x);
+      auto vr = reinterpret_cast<vecn *>(r);
+
+      auto myg = ndi.get_group();
+
+      for (int iz = 0; iz < NZ; iz++) { // loop over Z (2)
+        groupSum(vr, vx[k * NX + i], k, myg, iz);
+      }
+    });
+  });
+  q.wait();
+}
+
+int main() {
+
+  queue q{default_selector_v};
+  auto dev = q.get_device();
+  std::cout << "Device: " << dev.get_info<info::device::name>() << std::endl;
+
+  auto ctx = q.get_context();
+  int n = NX * NZ * NV; // 16 * 8 * 2 => 256
+  auto *x = (float *)sycl::malloc_shared(n * sizeof(float), dev,
+                                         ctx); // 256 * sizeof(float)
+  auto *r = (float *)sycl::malloc_shared(
+      NZ * NV * sizeof(float), dev, ctx); // 2 * 8 => 16   ( * sizeof(float) )
+
+  for (int i = 0; i < n; i++) {
+    x[i] = i;
+  }
+
+  q.wait();
+
+  test(q, r, x, n);
+
+  int fails = 0;
+  for (int k = 0; k < NZ; k++) {
+    float s[NV] = {0};
+    for (int i = 0; i < NX; i++) {
+      for (int j = 0; j < NV; j++) {
+        s[j] += x[(k * NX + i) * NV + j];
+      }
+    }
+    for (int j = 0; j < NV; j++) {
+      auto d = s[j] - r[k * NV + j];
+      if (std::abs(d) > 1e-10) {
+        printf("partial fail ");
+        printf("%i\t%i\t%g\t%g\n", k, j, s[j], r[k * NV + j]);
+        fails++;
+      } else {
+        printf("partial pass ");
+        printf("%i\t%i\t%g\t%g\n", k, j, s[j], r[k * NV + j]);
+      }
+    }
+  }
+
+  if (fails == 0) {
+    printf("test passed!\n");
+  } else {
+    printf("test failed!\n");
+  }
+  free(x, ctx);
+  free(r, ctx);
+  return fails;
+}
+#endif