Skip to content

[SYCL][NATIVECPU] Fix linker errors for WorkGroup collective functions #15144

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 14 commits into from
Sep 4, 2024
Merged
4 changes: 4 additions & 0 deletions clang/lib/Driver/Driver.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -6851,6 +6851,10 @@ class OffloadingActionBuilder final {
/// Offload deps output is then forwarded to active device action builders so
/// they can add it to the device linker inputs.
void addDeviceLinkDependenciesFromHost(ActionList &LinkerInputs) {
if (isSYCLNativeCPU(C.getArgs())) {
// SYCL Native CPU doesn't need deps from clang-offload-deps.
return;
}
// Link image for reading dependencies from it.
auto *LA = C.MakeAction<LinkJobAction>(LinkerInputs,
types::TY_Host_Dependencies_Image);
Expand Down
2 changes: 1 addition & 1 deletion clang/lib/Driver/ToolChains/Clang.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -10809,7 +10809,7 @@ static bool shouldEmitOnlyKernelsAsEntryPoints(const ToolChain &TC,
options::OPT_fsycl_remove_unused_external_funcs, false))
return false;
if (isSYCLNativeCPU(TC))
return false;
return true;
// When supporting dynamic linking, non-kernels in a device image can be
// called.
if (supportDynamicLinking(TCArgs))
Expand Down
1 change: 0 additions & 1 deletion clang/test/Driver/sycl-native-cpu-fsycl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,6 @@
//CHECK_INVO:{{.*}}clang{{.*}}"-fsycl-is-host"{{.*}}
//CHECK_INVO:{{.*}}clang{{.*}}"-x" "ir"
//CHECK_INVO:{{.*}}sycl-post-link{{.*}}"-emit-program-metadata"
//CHECK_INVO-NOT:{{.*}}sycl-post-link{{.*}}-emit-only-kernels-as-entry-points

// checks that the device and host triple is correct in the generated actions when it is set explicitly
//CHECK_ACTIONS-AARCH64: +- 5: offload, "host-sycl (aarch64-unknown-linux-gnu)" {1}, "device-sycl (aarch64-unknown-linux-gnu)" {4}, c++-cpp-output
Expand Down
28 changes: 18 additions & 10 deletions libdevice/nativecpu_utils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -145,6 +145,7 @@ DefineGOp1(All, __mux_sub_group_all_i1)

#define DefineFPGOps(Name, MuxName) \
DefineGOp(float, float, Name, MuxName##32) \
DefineGOp(_Float16 , _Float16 , Name, MuxName##16) \
DefineGOp(double, double, Name, MuxName##64)

DefineIntGOps(IAdd, add_i)
Expand All @@ -170,16 +171,23 @@ DefineBitwiseGroupOp(uint32_t, int32_t, i32)
DefineBitwiseGroupOp(int64_t, int64_t, i64)
DefineBitwiseGroupOp(uint64_t, int64_t, i64)

#define DefineBroadCastImpl(Type, Sfx, MuxType, IDType) \
DEVICE_EXTERN_C MuxType __mux_work_group_broadcast_##Sfx( \
int32_t id, MuxType val, int64_t lidx, int64_t lidy, int64_t lidz); \
DEVICE_EXTERN_C MuxType __mux_sub_group_broadcast_##Sfx(MuxType val, \
int32_t sg_lid); \
DEVICE_EXTERNAL Type __spirv_GroupBroadcast(uint32_t g, Type v, \
IDType l) { \
if (__spv::Scope::Flag::Subgroup == g) \
return __mux_sub_group_broadcast_##Sfx(v, l); \
return Type(); /*todo: add support for other flags as they are tested*/ \
#define DefineLogicalGroupOp(Type, MuxType, mux_sfx) \
DefineGOp(Type, MuxType, LogicalOrKHR, logical_or_##mux_sfx) \
DefineGOp(Type, MuxType, LogicalXorKHR, logical_xor_##mux_sfx) \
DefineGOp(Type, MuxType, LogicalAndKHR, logical_and_##mux_sfx)

DefineLogicalGroupOp(bool, bool, i1)

#define DefineBroadCastImpl(Type, Sfx, MuxType, IDType) \
DEVICE_EXTERN_C MuxType __mux_work_group_broadcast_##Sfx( \
int32_t id, MuxType val, int64_t lidx, int64_t lidy, int64_t lidz); \
DEVICE_EXTERN_C MuxType __mux_sub_group_broadcast_##Sfx(MuxType val, \
int32_t sg_lid); \
DEVICE_EXTERNAL Type __spirv_GroupBroadcast(uint32_t g, Type v, \
IDType l) { \
if (__spv::Scope::Flag::Subgroup == g) \
return __mux_sub_group_broadcast_##Sfx(v, l); \
return Type(); /*todo: add support for other flags as they are tested*/ \
}

#define DefineBroadcastMuxType(Type, Sfx, MuxType, IDType) \
Expand Down
142 changes: 85 additions & 57 deletions llvm/lib/SYCLNativeCPUUtils/PrepareSYCLNativeCPU.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -12,9 +12,12 @@
//===----------------------------------------------------------------------===//

#include "llvm/SYCLLowerIR/PrepareSYCLNativeCPU.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/PassManager.h"
#include "llvm/SYCLLowerIR/SYCLUtils.h"

Expand All @@ -23,7 +26,6 @@
#include "llvm/ADT/SmallVector.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instruction.h"
Expand All @@ -35,13 +37,13 @@
#include "llvm/Support/Casting.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Transforms/Utils/Cloning.h"
#include "llvm/Transforms/Utils/GlobalStatus.h"
#include "llvm/Transforms/Utils/ValueMapper.h"
#include <utility>
#include <vector>

#ifdef NATIVECPU_USE_OCK
#include "compiler/utils/attributes.h"
#include "compiler/utils/builtin_info.h"
#include "compiler/utils/metadata.h"
#endif

Expand Down Expand Up @@ -331,31 +333,85 @@ PreservedAnalyses PrepareSYCLNativeCPUPass::run(Module &M,
UsedBuiltins.push_back({Glob, Entry.second});
}

SmallVector<Function *> NewKernels;
for (auto &OldF : OldKernels) {
#ifdef NATIVECPU_USE_OCK
auto Name = compiler::utils::getBaseFnNameOrFnName(*OldF);
OldF->setName(Name);
// if vectorization occurred, at this point we have a wrapper function that
// runs the vectorized kernel and peels using the scalar kernel. We make it
// so this wrapper steals the original kernel name.
std::optional<compiler::utils::LinkMetadataResult> veczR =
compiler::utils::parseVeczToOrigFnLinkMetadata(*OldF);
if (veczR && veczR.value().first) {
auto ScalarF = veczR.value().first;
OldF->takeName(ScalarF);
ScalarF->setName(OldF->getName() + "_scalar");
} else if (Name != OldF->getName()) {
auto RealKernel = M.getFunction(Name);
if (RealKernel) {
// the real kernel was not inlined in the wrapper, steal its name
OldF->takeName(RealKernel);
{
SmallSet<Function *, 5> RemovableFuncs;
SmallVector<Function *, 5> WrapperFuncs;

// Retrieve the wrapper functions created by the WorkItemLoop pass.
for (auto &OldF : OldKernels) {
std::optional<compiler::utils::LinkMetadataResult> VeczR =
compiler::utils::parseVeczToOrigFnLinkMetadata(*OldF);
if (VeczR && VeczR.value().first) {
WrapperFuncs.push_back(OldF);
} else {
// the real kernel has been inlined, just use the name
OldF->setName(Name);
auto Name = compiler::utils::getBaseFnNameOrFnName(*OldF);
if (Name != OldF->getName()) {
WrapperFuncs.push_back(OldF);
}
}
}

for (auto &OldF : WrapperFuncs) {
// If vectorization occurred, at this point we have a wrapper function
// that runs the vectorized kernel and peels using the scalar kernel. We
// make it so this wrapper steals the original kernel name.
std::optional<compiler::utils::LinkMetadataResult> VeczR =
compiler::utils::parseVeczToOrigFnLinkMetadata(*OldF);
if (VeczR && VeczR.value().first) {
auto ScalarF = VeczR.value().first;
OldF->takeName(ScalarF);
if (ScalarF->use_empty())
RemovableFuncs.insert(ScalarF);
} else {
// The WorkItemLoops pass created a wrapper function for the original
// kernel. If we have a kernel named foo(), the wrapper will be called
// foo-wrapper(), and will have the original kernel name retrieved by
// getBaseFnNameOrFnName. We set the name of the wrapper function
// to the original kernel name and add the original kernel to the
// list of functions that can be removed from the module.
auto Name = compiler::utils::getBaseFnNameOrFnName(*OldF);
Function *OrigF = M.getFunction(Name);
if (OrigF != nullptr) {
// The original kernel is inlined by the WorkItemLoops
// pass if it contained barriers or group collectives, otherwise
// we don't want to (and can't) remove it.
if (OrigF->use_empty())
RemovableFuncs.insert(OrigF);
OldF->takeName(OrigF);
} else {
OldF->setName(Name);
}
}
}

// Find any left over SYCL_EXTERNAL function that has no more uses
std::set<Function *> Kernelset(OldKernels.begin(), OldKernels.end());
for (auto &F : M) {
if (Kernelset.count(&F) == 0 &&
F.hasFnAttribute(sycl::utils::ATTR_SYCL_MODULE_ID) && F.use_empty() &&
!F.getName().starts_with("__dpcpp_nativecpu")) {
// SYCL_EXTERNAL functions end up in static array of function pointers,
// at this point we can remove them from the array and remove the
// function if no other uses are left.
RemovableFuncs.insert(&F);
}
}

// Remove unused functions. This is necessary in case they still contain
// calls to group collective functions that haven't been processed by the
// work item loops pass, which will lead to linker errors.
llvm::erase_if(OldKernels,
[&](Function *F) { return RemovableFuncs.contains(F); });

for (Function *F : RemovableFuncs) {
F->eraseFromParent();
}
}
#endif

SmallVector<Function *> NewKernels;
for (auto &OldF : OldKernels) {
auto *NewF =
cloneFunctionAndAddParam(OldF, StatePtrType, CurrentStatePointerTLS);
NewF->takeName(OldF);
Expand Down Expand Up @@ -416,54 +472,26 @@ PreservedAnalyses PrepareSYCLNativeCPUPass::run(Module &M,
OldI->replaceAllUsesWith(NewI);
OldI->eraseFromParent();
}
for (auto temp : ToRemove2)
temp->eraseFromParent();
for (auto Temp : ToRemove2)
Temp->eraseFromParent();

// Finally, we erase the builtin from the module
Glob->eraseFromParent();
}

#ifdef NATIVECPU_USE_OCK
// Define __mux_mem_barrier here using the OCK
compiler::utils::BuiltinInfo BI;
for (auto &F : M) {
if (F.getName() == compiler::utils::MuxBuiltins::mem_barrier) {
BI.defineMuxBuiltin(compiler::utils::BaseBuiltinID::eMuxBuiltinMemBarrier,
M);
}
}
// if we find calls to mux barrier now, it means that we had SYCL_EXTERNAL
// functions that called __mux_work_group_barrier, which didn't get processed
// by the WorkItemLoop pass. This means that the actual function call has been
// inlined into the kernel, and the call to __mux_work_group_barrier has been
// removed in the inlined call, but not in the original function. The original
// function will not be executed (since it has been inlined) and so we can
// just define __mux_work_group_barrier as a no-op to avoid linker errors.
// Todo: currently we can't remove the function here even if it has no uses,
// because we may still emit a declaration for it in the offload-wrapper.
auto BarrierF =
M.getFunction(compiler::utils::MuxBuiltins::work_group_barrier);
if (BarrierF && BarrierF->isDeclaration()) {
IRBuilder<> Builder(M.getContext());
auto BB = BasicBlock::Create(M.getContext(), "noop", BarrierF);
Builder.SetInsertPoint(BB);
Builder.CreateRetVoid();
}
#endif

// removing unused builtins
// Removing unused builtins
SmallVector<Function *> UnusedLibBuiltins;
for (auto &F : M) {
if (IsUnusedBuiltinOrPrivateDef(F)) {
UnusedLibBuiltins.push_back(&F);
}
}
for (Function *f : UnusedLibBuiltins) {
f->eraseFromParent();
for (Function *F : UnusedLibBuiltins) {
F->eraseFromParent();
ModuleChanged = true;
}
for (auto it = M.begin(); it != M.end();) {
auto Curr = it++;
for (auto It = M.begin(); It != M.end();) {
auto Curr = It++;
Function &F = *Curr;
if (F.getNumUses() == 0 && F.isDeclaration() &&
F.getName().starts_with("__mux_")) {
Expand Down
119 changes: 119 additions & 0 deletions sycl/test/native_cpu/reduce_multi_tu.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,119 @@
// REQUIRES: native_cpu_ock

// Tests that no linker errors occur when group collective functions are used
// in conjuction with SYCL_EXTERNAL.

// RUN: %clangxx -fsycl -fsycl-targets=native_cpu -DFILE1 -c -o %t1.o %s
// RUN: %clangxx -fsycl -fsycl-targets=native_cpu -DFILE2 -c -o %t2.o %s
// RUN: llvm-ar crv %t1.a %t1.o
// RUN: %clangxx -fsycl -fsycl-targets=native_cpu %t2.o %t1.a -o %t.out
// RUN: env ONEAPI_DEVICE_SELECTOR=native_cpu:cpu %t.out

/*
test performs a lattice reduction.
sycl::vec<float> is sensitive to .get_size() vs .size() in SYCL headers
(ie, byte size versus vector size)
*/

#include <sycl/detail/core.hpp>
#include <sycl/group_algorithm.hpp>
#include <sycl/usm.hpp>

using namespace sycl;

#define NX 32
#define NZ 2
#define NV 8
using vecn = sycl::vec<float, NV>; // 8 floats
#ifdef FILE1

SYCL_EXTERNAL void groupSum(vecn *r, const vecn &in, const int k,
sycl::group<2> &grp, const int i) {

vecn tin = (i == k ? in : vecn(0));
auto out = reduce_over_group(grp, tin, sycl::plus<>());
if (i == k && grp.get_local_id()[1] == 0)
r[k] = out;
}
#endif

#ifdef FILE2
SYCL_EXTERNAL void groupSum(vecn *r, const vecn &in, const int k,
sycl::group<2> &grp, const int i);
void test(queue q, float *r, float *x,
int n) { // r is 16 floats, x is 256 floats. n is 256

sycl::range<2> globalSize(NZ, NX); // 2,32
sycl::range<2> localSize(1, NX); // 1,8 so 16 iterations
sycl::nd_range<2> range{globalSize, localSize};

q.submit([&](sycl::handler &h) {
h.parallel_for<>(range, [=](sycl::nd_item<2> ndi) {
int i = ndi.get_global_id(1);
int k = ndi.get_global_id(0);

auto vx = reinterpret_cast<vecn *>(x);
auto vr = reinterpret_cast<vecn *>(r);

auto myg = ndi.get_group();

for (int iz = 0; iz < NZ; iz++) { // loop over Z (2)
groupSum(vr, vx[k * NX + i], k, myg, iz);
}
});
});
q.wait();
}

int main() {

queue q{default_selector_v};
auto dev = q.get_device();
std::cout << "Device: " << dev.get_info<info::device::name>() << std::endl;

auto ctx = q.get_context();
int n = NX * NZ * NV; // 16 * 8 * 2 => 256
auto *x = (float *)sycl::malloc_shared(n * sizeof(float), dev,
ctx); // 256 * sizeof(float)
auto *r = (float *)sycl::malloc_shared(
NZ * NV * sizeof(float), dev, ctx); // 2 * 8 => 16 ( * sizeof(float) )

for (int i = 0; i < n; i++) {
x[i] = i;
}

q.wait();

test(q, r, x, n);

int fails = 0;
for (int k = 0; k < NZ; k++) {
float s[NV] = {0};
for (int i = 0; i < NX; i++) {
for (int j = 0; j < NV; j++) {
s[j] += x[(k * NX + i) * NV + j];
}
}
for (int j = 0; j < NV; j++) {
auto d = s[j] - r[k * NV + j];
if (std::abs(d) > 1e-10) {
printf("partial fail ");
printf("%i\t%i\t%g\t%g\n", k, j, s[j], r[k * NV + j]);
fails++;
} else {
printf("partial pass ");
printf("%i\t%i\t%g\t%g\n", k, j, s[j], r[k * NV + j]);
}
}
}

if (fails == 0) {
printf("test passed!\n");
} else {
printf("test failed!\n");
}
free(x, ctx);
free(r, ctx);
return fails;
}
#endif
Loading
Loading