Skip to content

[sycl-post-link] Enable ESIMD-specific lowering #3195

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 3 commits into from
Feb 15, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
66 changes: 66 additions & 0 deletions llvm/test/tools/sycl-post-link/sycl-esimd/basic-esimd-lower.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
; This is a basic test for Lowering ESIMD constructs after splitting.
; This test also implicitly checks that input module is not reused
; for ESIMD kernels in any case.

; No lowering
; RUN: sycl-post-link -split-esimd -S %s -o %t.table
; RUN: FileCheck %s -input-file=%t_esimd_0.ll --check-prefixes CHECK-NO-LOWERING

; Default lowering
; RUN: sycl-post-link -split-esimd -lower-esimd -S %s -o %t.table
; RUN: FileCheck %s -input-file=%t_esimd_0.ll --check-prefixes CHECK-O2

; -O2 lowering
; RUN: sycl-post-link -split-esimd -lower-esimd -O2 -S %s -o %t.table
; RUN: FileCheck %s -input-file=%t_esimd_0.ll --check-prefixes CHECK-O2

; -O0 lowering
; RUN: sycl-post-link -split-esimd -lower-esimd -O0 -S %s -o %t.table
; RUN: FileCheck %s -input-file=%t_esimd_0.ll --check-prefixes CHECK-O0

target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024"
target triple = "spir64-unknown-linux-sycldevice"

declare dso_local spir_func i64 @_Z28__spirv_GlobalInvocationId_xv()

define dso_local spir_kernel void @ESIMD_kernel() #0 !sycl_explicit_simd !3 {
entry:
%call = tail call spir_func i64 @_Z28__spirv_GlobalInvocationId_xv()
ret void
}

attributes #0 = { "sycl-module-id"="a.cpp" }

!llvm.module.flags = !{!0}
!opencl.spir.version = !{!1}
!spirv.Source = !{!2}

!0 = !{i32 1, !"wchar_size", i32 4}
!1 = !{i32 1, i32 2}
!2 = !{i32 0, i32 100000}
!3 = !{}

; By default, no lowering is performed
; CHECK-NO-LOWERING: declare dso_local spir_func i64 @_Z28__spirv_GlobalInvocationId_xv()
; CHECK-NO-LOWERING: define dso_local spir_kernel void @ESIMD_kernel()
; CHECK-NO-LOWERING: entry:
; CHECK-NO-LOWERING: %call = tail call spir_func i64 @_Z28__spirv_GlobalInvocationId_xv()
; CHECK-NO-LOWERING: ret void
; CHECK-NO-LOWERING: }

; With -O0, we only lower ESIMD code, but no other optimizations
; CHECK-O0: declare dso_local spir_func i64 @_Z28__spirv_GlobalInvocationId_xv()
; CHECK-O0: define dso_local spir_kernel void @ESIMD_kernel() #1 !sycl_explicit_simd !3 !intel_reqd_sub_group_size !4 {
; CHECK-O0: entry:
; CHECK-O0: call <3 x i32> @llvm.genx.local.id.v3i32()
; CHECK-O0: call <3 x i32> @llvm.genx.local.size.v3i32()
; CHECK-O0: call i32 @llvm.genx.group.id.x()
; CHECK-O0: ret void
; CHECK-O0: }

; With -O2, unused call was optimized away
; CHECK-O2: declare dso_local spir_func i64 @_Z28__spirv_GlobalInvocationId_xv()
; CHECK-O2: define dso_local spir_kernel void @ESIMD_kernel()
; CHECK-O2: entry:
; CHECK-O2: ret void
; CHECK-O2: }
13 changes: 13 additions & 0 deletions llvm/tools/sycl-post-link/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -5,13 +5,26 @@ set(LLVM_LINK_COMPONENTS
IRReader
Support
TransformUtils
SYCLLowerIR
)

get_property(LLVMGenXIntrinsics_SOURCE_DIR GLOBAL PROPERTY LLVMGenXIntrinsics_SOURCE_PROP)
get_property(LLVMGenXIntrinsics_BINARY_DIR GLOBAL PROPERTY LLVMGenXIntrinsics_BINARY_PROP)

include_directories(
${LLVMGenXIntrinsics_SOURCE_DIR}/GenXIntrinsics/include
${LLVMGenXIntrinsics_BINARY_DIR}/GenXIntrinsics/include)

add_llvm_tool(sycl-post-link
sycl-post-link.cpp
SPIRKernelParamOptInfo.cpp
SpecConstants.cpp

ADDITIONAL_HEADER_DIRS
${LLVMGenXIntrinsics_SOURCE_DIR}/GenXIntrinsics/include
${LLVMGenXIntrinsics_BINARY_DIR}/GenXIntrinsics/include

DEPENDS
intrinsics_gen
LLVMGenXIntrinsics
)
116 changes: 112 additions & 4 deletions llvm/tools/sycl-post-link/sycl-post-link.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -19,13 +19,15 @@
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/Triple.h"
#include "llvm/Bitcode/BitcodeWriterPass.h"
#include "llvm/GenXIntrinsics/GenXSPIRVWriterAdaptor.h"
#include "llvm/IR/IRPrintingPasses.h"
#include "llvm/IR/InstIterator.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/LegacyPassManager.h"
#include "llvm/IR/Module.h"
#include "llvm/IRReader/IRReader.h"
#include "llvm/SYCLLowerIR/LowerESIMD.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/InitLLVM.h"
#include "llvm/Support/Path.h"
Expand All @@ -35,6 +37,8 @@
#include "llvm/Support/WithColor.h"
#include "llvm/Transforms/IPO.h"
#include "llvm/Transforms/IPO/GlobalDCE.h"
#include "llvm/Transforms/InstCombine/InstCombine.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Utils/Cloning.h"

#include <memory>
Expand Down Expand Up @@ -92,6 +96,45 @@ static cl::opt<bool> SplitEsimd{"split-esimd",
cl::desc("Split SYCL and ESIMD kernels"),
cl::cat(PostLinkCat)};

// TODO Design note: sycl-post-link should probably separate different kinds of
// its functionality on logical and source level:
// - LLVM IR module splitting
// - Running LLVM IR passes on resulting modules
// - Generating additional files (like spec constants, dead arg info,...)
// The tool itself could be just a "driver" creating needed pipelines from the
// above actions. This could help make the tool structure clearer and more
// maintainable.

static cl::opt<bool> LowerEsimd{
"lower-esimd", cl::desc("Lower ESIMD constructs"), cl::cat(PostLinkCat)};

static cl::opt<bool>
OptLevelO0("O0", cl::desc("Optimization level 0. Similar to clang -O0"),
cl::cat(PostLinkCat));

static cl::opt<bool>
OptLevelO1("O1", cl::desc("Optimization level 1. Similar to clang -O1"),
cl::cat(PostLinkCat));

static cl::opt<bool>
OptLevelO2("O2", cl::desc("Optimization level 2. Similar to clang -O2"),
cl::cat(PostLinkCat));

static cl::opt<bool> OptLevelOs(
"Os",
cl::desc(
"Like -O2 with extra optimizations for size. Similar to clang -Os"),
cl::cat(PostLinkCat));

static cl::opt<bool> OptLevelOz(
"Oz",
cl::desc("Like -Os but reduces code size further. Similar to clang -Oz"),
cl::cat(PostLinkCat));

static cl::opt<bool>
OptLevelO3("O3", cl::desc("Optimization level 3. Similar to clang -O3"),
cl::cat(PostLinkCat));
Comment on lines +111 to +136
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

How these are expected to be used? Will they be propagated to the tool based on the same options passed to clang driver?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, I will submit the patch for that later and put the link here.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Here is the driver portion that will pass the optimization level to sycl-post-link: #3207.


enum IRSplitMode {
SPLIT_PER_TU, // one module per translation unit
SPLIT_PER_KERNEL, // one module per kernel
Expand Down Expand Up @@ -627,6 +670,59 @@ static string_vector saveResultSymbolsLists(string_vector &ResSymbolsLists,
} \
}

// Helper function for creating Inliner pass.
// The approach is taken from opt tool.
static Pass *createFunctionInliningPassHelper() {
if (OptLevelO0)
return createFunctionInliningPass(0, 0, false);

if (OptLevelO1)
return createFunctionInliningPass(1, 0, false);

if (OptLevelO2)
return createFunctionInliningPass(2, 0, false);

if (OptLevelOs)
return createFunctionInliningPass(2, 1, false);

if (OptLevelOz)
return createFunctionInliningPass(2, 2, false);

if (OptLevelO3)
return createFunctionInliningPass(3, 0, false);

return createFunctionInliningPass();
}

// When ESIMD code was separated from the regular SYCL code,
// we can safely process ESIMD part.
Comment on lines +697 to +698
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is probably an architecture question/concern: why can't we leave a single optimization pipeline which is executed before sycl-post-link invocation?

I understand that IGC VC backend most likely expects LLVM IR in some particular form to do its job right, but can't it be improved? It seems a bit strange to me that we have to maintain two optimization pipelines in two different places. Moreover, I guess that the first pipeline before sycl-post-link will still be executed to optimize regular non-ESIMD code, right?

Note: I'm okay if we have to launch a few more passes in order to launch some ESIMD-specific ones, but I would like to have a single optimization pipeline (potentially customized compared to the upstream one) for general-purpose optimizations.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@AlexeySachkov, yes, those are ESIMD-specific transformations that should be applied to ESIMD code only, otherwise, it will break regular SYCL code. So, it should be done after splitting. Or we should clone the code that is shared between ESIMD and regular SYCL kernels and then do the ESIMD lowering. Splitting first and then transform, eliminates the need to clone shared code, so I guess this is definitely an advantage.

Yes, we have general-purposes clean-up passes after ESIMD lowering. So, yes, we have 2 sets of general-purpose passes running on ESIMD code. I believe we have to have the second set after ESIMD lowering. @cmc-rep can confirm.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

those are ESIMD-specific transformations that should be applied to ESIMD code only, otherwise, it will break regular SYCL code.

I'm not against ESIMD-specific transformation, I'm concerned about additional optimization passes we launch in sycl-post-link

Yes, we have general-purposes clean-up passes after ESIMD lowering. So, yes, we have 2 sets of general-purpose passes running on ESIMD code. I believe we have to have the second set after ESIMD lowering. @cmc-rep can confirm.

To me it seems better if those post-processing is moved into vc backend as pre-processing passes, because this is a thing specific to a single backend, not even to a language constructs.

// TODO: support options like -debug-pass, -print-[before|after], and others
static void LowerEsimdConstructs(Module &M) {
legacy::PassManager MPM;
MPM.add(createSYCLLowerESIMDPass());
if (!OptLevelO0) {
// Inlining and SROA passes are required to make
// ESIMD/accessor_gather_scatter.cpp test work.
MPM.add(createFunctionInliningPassHelper());
MPM.add(createSROAPass());
}
MPM.add(createESIMDLowerVecArgPass());
MPM.add(createESIMDLowerLoadStorePass());
if (!OptLevelO0) {
MPM.add(createSROAPass());
MPM.add(createEarlyCSEPass(true));
MPM.add(createInstructionCombiningPass());
MPM.add(createDeadCodeEliminationPass());
MPM.add(createFunctionInliningPassHelper());
MPM.add(createSROAPass());
MPM.add(createEarlyCSEPass(true));
MPM.add(createInstructionCombiningPass());
MPM.add(createDeadCodeEliminationPass());
}
MPM.add(createGenXSPIRVWriterAdaptorPass());
MPM.run(M);
}

using TableFiles = std::map<StringRef, string_vector>;

static TableFiles processOneModule(std::unique_ptr<Module> M, bool IsEsimd,
Expand All @@ -635,6 +731,9 @@ static TableFiles processOneModule(std::unique_ptr<Module> M, bool IsEsimd,
if (!M)
return TblFiles;

if (IsEsimd && LowerEsimd)
LowerEsimdConstructs(*M);

std::map<StringRef, std::vector<Function *>> GlobalsSet;

bool DoSplit = SplitMode.getNumOccurrences() > 0;
Expand Down Expand Up @@ -687,11 +786,16 @@ static TableFiles processOneModule(std::unique_ptr<Module> M, bool IsEsimd,
ResultModules.push_back(std::move(M));

{
// reuse input module if there were no spec constants and no splitting
// Reuse input module with only regular SYCL kernels if there were
// no spec constants and no splitting.
// We cannot reuse input module for ESIMD code since it was transformed.
bool CanReuseInputModule = !SpecConstsMet && (ResultModules.size() == 1) &&
!SyclAndEsimdKernels && !IsEsimd;
string_vector Files =
SpecConstsMet || (ResultModules.size() > 1) || SyclAndEsimdKernels
? saveResultModules(ResultModules, IsEsimd ? "esimd_" : "")
: string_vector{InputFilename};
CanReuseInputModule
? string_vector{InputFilename}
: saveResultModules(ResultModules, IsEsimd ? "esimd_" : "");

// "Code" column is always output
std::copy(Files.begin(), Files.end(),
std::back_inserter(TblFiles[COL_CODE]));
Expand Down Expand Up @@ -818,6 +922,10 @@ int main(int argc, char **argv) {
"- Specialization constant intrinsic transformer. Replaces symbolic\n"
" ID-based intrinsics to integer ID-based ones to make them friendly\n"
" for the SPIRV translator\n"
"When the tool splits input module into regular SYCL and ESIMD kernels,\n"
"it performs a set of specific lowering and transformation passes on\n"
"ESIMD module, which is enabled by the '-lower-esimd' option. Regular\n"
"optimization level options are supported, e.g. -O[0|1|2|3|s|z].\n"
"Normally, the tool generates a number of files and \"file table\"\n"
"file listing all generated files in a table manner. For example, if\n"
"the input file 'example.bc' contains two kernels, then the command\n"
Expand Down