|
12 | 12 | //===----------------------------------------------------------------------===//
|
13 | 13 |
|
14 | 14 | #include "llvm/SYCLLowerIR/PrepareSYCLNativeCPU.h"
|
| 15 | +#include "llvm/ADT/SmallSet.h" |
| 16 | +#include "llvm/ADT/StringRef.h" |
15 | 17 | #include "llvm/IR/BasicBlock.h"
|
16 | 18 | #include "llvm/IR/Constant.h"
|
17 | 19 | #include "llvm/IR/DebugInfoMetadata.h"
|
| 20 | +#include "llvm/IR/GlobalValue.h" |
18 | 21 | #include "llvm/IR/PassManager.h"
|
19 | 22 | #include "llvm/SYCLLowerIR/SYCLUtils.h"
|
20 | 23 |
|
|
23 | 26 | #include "llvm/ADT/SmallVector.h"
|
24 | 27 | #include "llvm/IR/Attributes.h"
|
25 | 28 | #include "llvm/IR/CallingConv.h"
|
26 |
| -#include "llvm/IR/Constants.h" |
27 | 29 | #include "llvm/IR/DerivedTypes.h"
|
28 | 30 | #include "llvm/IR/IRBuilder.h"
|
29 | 31 | #include "llvm/IR/Instruction.h"
|
|
35 | 37 | #include "llvm/Support/Casting.h"
|
36 | 38 | #include "llvm/Support/ErrorHandling.h"
|
37 | 39 | #include "llvm/Transforms/Utils/Cloning.h"
|
| 40 | +#include "llvm/Transforms/Utils/GlobalStatus.h" |
38 | 41 | #include "llvm/Transforms/Utils/ValueMapper.h"
|
39 | 42 | #include <utility>
|
40 | 43 | #include <vector>
|
41 | 44 |
|
42 | 45 | #ifdef NATIVECPU_USE_OCK
|
43 | 46 | #include "compiler/utils/attributes.h"
|
44 |
| -#include "compiler/utils/builtin_info.h" |
45 | 47 | #include "compiler/utils/metadata.h"
|
46 | 48 | #endif
|
47 | 49 |
|
@@ -331,31 +333,85 @@ PreservedAnalyses PrepareSYCLNativeCPUPass::run(Module &M,
|
331 | 333 | UsedBuiltins.push_back({Glob, Entry.second});
|
332 | 334 | }
|
333 | 335 |
|
334 |
| - SmallVector<Function *> NewKernels; |
335 |
| - for (auto &OldF : OldKernels) { |
336 | 336 | #ifdef NATIVECPU_USE_OCK
|
337 |
| - auto Name = compiler::utils::getBaseFnNameOrFnName(*OldF); |
338 |
| - OldF->setName(Name); |
339 |
| - // if vectorization occurred, at this point we have a wrapper function that |
340 |
| - // runs the vectorized kernel and peels using the scalar kernel. We make it |
341 |
| - // so this wrapper steals the original kernel name. |
342 |
| - std::optional<compiler::utils::LinkMetadataResult> veczR = |
343 |
| - compiler::utils::parseVeczToOrigFnLinkMetadata(*OldF); |
344 |
| - if (veczR && veczR.value().first) { |
345 |
| - auto ScalarF = veczR.value().first; |
346 |
| - OldF->takeName(ScalarF); |
347 |
| - ScalarF->setName(OldF->getName() + "_scalar"); |
348 |
| - } else if (Name != OldF->getName()) { |
349 |
| - auto RealKernel = M.getFunction(Name); |
350 |
| - if (RealKernel) { |
351 |
| - // the real kernel was not inlined in the wrapper, steal its name |
352 |
| - OldF->takeName(RealKernel); |
| 337 | + { |
| 338 | + SmallSet<Function *, 5> RemovableFuncs; |
| 339 | + SmallVector<Function *, 5> WrapperFuncs; |
| 340 | + |
| 341 | + // Retrieve the wrapper functions created by the WorkItemLoops pass. |
| 342 | + for (auto &OldF : OldKernels) { |
| 343 | + std::optional<compiler::utils::LinkMetadataResult> VeczR = |
| 344 | + compiler::utils::parseVeczToOrigFnLinkMetadata(*OldF); |
| 345 | + if (VeczR && VeczR.value().first) { |
| 346 | + WrapperFuncs.push_back(OldF); |
353 | 347 | } else {
|
354 |
| - // the real kernel has been inlined, just use the name |
355 |
| - OldF->setName(Name); |
| 348 | + auto Name = compiler::utils::getBaseFnNameOrFnName(*OldF); |
| 349 | + if (Name != OldF->getName()) { |
| 350 | + WrapperFuncs.push_back(OldF); |
| 351 | + } |
356 | 352 | }
|
357 | 353 | }
|
| 354 | + |
| 355 | + for (auto &OldF : WrapperFuncs) { |
| 356 | + // If vectorization occurred, at this point we have a wrapper function |
| 357 | + // that runs the vectorized kernel and peels using the scalar kernel. We |
| 358 | + // make it so this wrapper steals the original kernel name. |
| 359 | + std::optional<compiler::utils::LinkMetadataResult> VeczR = |
| 360 | + compiler::utils::parseVeczToOrigFnLinkMetadata(*OldF); |
| 361 | + if (VeczR && VeczR.value().first) { |
| 362 | + auto ScalarF = VeczR.value().first; |
| 363 | + OldF->takeName(ScalarF); |
| 364 | + if (ScalarF->use_empty()) |
| 365 | + RemovableFuncs.insert(ScalarF); |
| 366 | + } else { |
| 367 | + // The WorkItemLoops pass created a wrapper function for the original |
| 368 | + // kernel. If we have a kernel named foo(), the wrapper will be called |
| 369 | + // foo-wrapper(), and will have the original kernel name retrieved by |
| 370 | + // getBaseFnNameOrFnName. We set the name of the wrapper function |
| 371 | + // to the original kernel name and add the original kernel to the |
| 372 | + // list of functions that can be removed from the module. |
| 373 | + auto Name = compiler::utils::getBaseFnNameOrFnName(*OldF); |
| 374 | + Function *OrigF = M.getFunction(Name); |
| 375 | + if (OrigF != nullptr) { |
| 376 | + // The original kernel is inlined by the WorkItemLoops |
| 377 | + // pass if it contained barriers or group collectives, otherwise |
| 378 | + // we don't want to (and can't) remove it. |
| 379 | + if (OrigF->use_empty()) |
| 380 | + RemovableFuncs.insert(OrigF); |
| 381 | + OldF->takeName(OrigF); |
| 382 | + } else { |
| 383 | + OldF->setName(Name); |
| 384 | + } |
| 385 | + } |
| 386 | + } |
| 387 | + |
| 388 | + // Find any leftover SYCL_EXTERNAL function that has no more uses. |
| 389 | + std::set<Function *> Kernelset(OldKernels.begin(), OldKernels.end()); |
| 390 | + for (auto &F : M) { |
| 391 | + if (Kernelset.count(&F) == 0 && |
| 392 | + F.hasFnAttribute(sycl::utils::ATTR_SYCL_MODULE_ID) && F.use_empty() && |
| 393 | + !F.getName().starts_with("__dpcpp_nativecpu")) { |
| 394 | + // SYCL_EXTERNAL functions end up in a static array of function |
| 395 | + // pointers; at this point we can remove them from the array and remove |
| 396 | + // the function if no other uses are left. |
| 397 | + RemovableFuncs.insert(&F); |
| 398 | + } |
| 399 | + } |
| 400 | + |
| 401 | + // Remove unused functions. This is necessary in case they still contain |
| 402 | + // calls to group collective functions that haven't been processed by the |
| 403 | + // work item loops pass, which will lead to linker errors. |
| 404 | + llvm::erase_if(OldKernels, |
| 405 | + [&](Function *F) { return RemovableFuncs.contains(F); }); |
| 406 | + |
| 407 | + for (Function *F : RemovableFuncs) { |
| 408 | + F->eraseFromParent(); |
| 409 | + } |
| 410 | + } |
358 | 411 | #endif
|
| 412 | + |
| 413 | + SmallVector<Function *> NewKernels; |
| 414 | + for (auto &OldF : OldKernels) { |
359 | 415 | auto *NewF =
|
360 | 416 | cloneFunctionAndAddParam(OldF, StatePtrType, CurrentStatePointerTLS);
|
361 | 417 | NewF->takeName(OldF);
|
@@ -416,54 +472,26 @@ PreservedAnalyses PrepareSYCLNativeCPUPass::run(Module &M,
|
416 | 472 | OldI->replaceAllUsesWith(NewI);
|
417 | 473 | OldI->eraseFromParent();
|
418 | 474 | }
|
419 |
| - for (auto temp : ToRemove2) |
420 |
| - temp->eraseFromParent(); |
| 475 | + for (auto Temp : ToRemove2) |
| 476 | + Temp->eraseFromParent(); |
421 | 477 |
|
422 | 478 | // Finally, we erase the builtin from the module
|
423 | 479 | Glob->eraseFromParent();
|
424 | 480 | }
|
425 | 481 |
|
426 |
| -#ifdef NATIVECPU_USE_OCK |
427 |
| - // Define __mux_mem_barrier here using the OCK |
428 |
| - compiler::utils::BuiltinInfo BI; |
429 |
| - for (auto &F : M) { |
430 |
| - if (F.getName() == compiler::utils::MuxBuiltins::mem_barrier) { |
431 |
| - BI.defineMuxBuiltin(compiler::utils::BaseBuiltinID::eMuxBuiltinMemBarrier, |
432 |
| - M); |
433 |
| - } |
434 |
| - } |
435 |
| - // if we find calls to mux barrier now, it means that we had SYCL_EXTERNAL |
436 |
| - // functions that called __mux_work_group_barrier, which didn't get processed |
437 |
| - // by the WorkItemLoop pass. This means that the actual function call has been |
438 |
| - // inlined into the kernel, and the call to __mux_work_group_barrier has been |
439 |
| - // removed in the inlined call, but not in the original function. The original |
440 |
| - // function will not be executed (since it has been inlined) and so we can |
441 |
| - // just define __mux_work_group_barrier as a no-op to avoid linker errors. |
442 |
| - // Todo: currently we can't remove the function here even if it has no uses, |
443 |
| - // because we may still emit a declaration for it in the offload-wrapper. |
444 |
| - auto BarrierF = |
445 |
| - M.getFunction(compiler::utils::MuxBuiltins::work_group_barrier); |
446 |
| - if (BarrierF && BarrierF->isDeclaration()) { |
447 |
| - IRBuilder<> Builder(M.getContext()); |
448 |
| - auto BB = BasicBlock::Create(M.getContext(), "noop", BarrierF); |
449 |
| - Builder.SetInsertPoint(BB); |
450 |
| - Builder.CreateRetVoid(); |
451 |
| - } |
452 |
| -#endif |
453 |
| - |
454 |
| - // removing unused builtins |
| 482 | + // Removing unused builtins |
455 | 483 | SmallVector<Function *> UnusedLibBuiltins;
|
456 | 484 | for (auto &F : M) {
|
457 | 485 | if (IsUnusedBuiltinOrPrivateDef(F)) {
|
458 | 486 | UnusedLibBuiltins.push_back(&F);
|
459 | 487 | }
|
460 | 488 | }
|
461 |
| - for (Function *f : UnusedLibBuiltins) { |
462 |
| - f->eraseFromParent(); |
| 489 | + for (Function *F : UnusedLibBuiltins) { |
| 490 | + F->eraseFromParent(); |
463 | 491 | ModuleChanged = true;
|
464 | 492 | }
|
465 |
| - for (auto it = M.begin(); it != M.end();) { |
466 |
| - auto Curr = it++; |
| 493 | + for (auto It = M.begin(); It != M.end();) { |
| 494 | + auto Curr = It++; |
467 | 495 | Function &F = *Curr;
|
468 | 496 | if (F.getNumUses() == 0 && F.isDeclaration() &&
|
469 | 497 | F.getName().starts_with("__mux_")) {
|
|
0 commit comments