-
Notifications
You must be signed in to change notification settings - Fork 787
[SYCL] Extend -fsycl-device-obj to dump assembly #17390
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
This allows one to compile SYCL device code to plain PTX directly: -fsycl-targets=nvptx64-nvidia-cuda -fsycl-device-obj=ptx -S
Please note that the original patch (represented as the first commit in this PR) was written by @stefanatwork, and it would be good to keep the attribution of work correct. |
Fixes: #8797 |
What is the expectation when a fat object created with |
@jchlanda, what do you mean? Please, take a look at #17113 or #16834. Do they look like satisfying your requirement? |
Hi @bader |
hi @mdtoguchi This is meant to provide a convenient way of inspecting kernel's assembly/PTX without going to the effort of saving temps. I personally would always use it alongside Q.submit([&](sycl::handler &cgh) {
sycl::accessor AccA(BufA, cgh, sycl::write_only);
cgh.parallel_for(sycl::range<1>{32},
[=](sycl::id<1> idx) { AccA[idx] = 42; });
}).wait_and_throw();
//
// Generated by LLVM NVPTX Back-End
//
.version 8.5
.target sm_50
.address_size 64
// .weak _ZTSN4sycl3_V16detail18RoundedRangeKernelINS0_4itemILi1ELb1EEELi1EZZ4mainENKUlRNS0_7handlerEE_clES6_EUlNS0_2idILi1EEEE_EE // -- Begin function _ZTSN4sycl3_V16detail18RoundedRangeKernelINS0_4itemILi1ELb1EEELi1EZZ4mainENKUlRNS0_7handlerEE_clES6_EUlNS0_2idILi1EEEE_EE
.extern .func (.param .b64 func_retval0) _Z28__spirv_GlobalInvocationId_xv
()
;
.extern .func (.param .b64 func_retval0) _Z20__spirv_GlobalSize_xv
()
;
.extern .func (.param .b64 func_retval0) _Z22__spirv_GlobalOffset_xv
()
;
// @_ZTSN4sycl3_V16detail18RoundedRangeKernelINS0_4itemILi1ELb1EEELi1EZZ4mainENKUlRNS0_7handlerEE_clES6_EUlNS0_2idILi1EEEE_EE
.weak .entry _ZTSN4sycl3_V16detail18RoundedRangeKernelINS0_4itemILi1ELb1EEELi1EZZ4mainENKUlRNS0_7handlerEE_clES6_EUlNS0_2idILi1EEEE_EE(
.param .align 8 .b8 _ZTSN4sycl3_V16detail18RoundedRangeKernelINS0_4itemILi1ELb1EEELi1EZZ4mainENKUlRNS0_7handlerEE_clES6_EUlNS0_2idILi1EEEE_EE_param_0[8],
.param .u64 .ptr .global .align 4 _ZTSN4sycl3_V16detail18RoundedRangeKernelINS0_4itemILi1ELb1EEELi1EZZ4mainENKUlRNS0_7handlerEE_clES6_EUlNS0_2idILi1EEEE_EE_param_1,
.param .align 8 .b8 _ZTSN4sycl3_V16detail18RoundedRangeKernelINS0_4itemILi1ELb1EEELi1EZZ4mainENKUlRNS0_7handlerEE_clES6_EUlNS0_2idILi1EEEE_EE_param_2[8]
)
{
.reg .pred %p<3>;
.reg .b32 %r<2>;
.reg .b64 %rd<21>;
// %bb.0: // %entry
ld.param.u64 %rd1, [_ZTSN4sycl3_V16detail18RoundedRangeKernelINS0_4itemILi1ELb1EEELi1EZZ4mainENKUlRNS0_7handlerEE_clES6_EUlNS0_2idILi1EEEE_EE_param_0];
{ // callseq 0, 0
.param .b64 retval0;
call.uni (retval0),
_Z28__spirv_GlobalInvocationId_xv,
(
);
ld.param.b64 %rd20, [retval0];
} // callseq 0
{ // callseq 1, 0
.param .b64 retval0;
call.uni (retval0),
_Z20__spirv_GlobalSize_xv,
(
);
ld.param.b64 %rd13, [retval0];
} // callseq 1
{ // callseq 2, 0
.param .b64 retval0;
call.uni (retval0),
_Z22__spirv_GlobalOffset_xv,
(
);
ld.param.b64 %rd14, [retval0];
} // callseq 2
setp.ge.u64 %p1, %rd20, %rd1;
@%p1 bra $L__BB0_3;
// %bb.1: // %for.body.i.preheader
ld.param.u64 %rd11, [_ZTSN4sycl3_V16detail18RoundedRangeKernelINS0_4itemILi1ELb1EEELi1EZZ4mainENKUlRNS0_7handlerEE_clES6_EUlNS0_2idILi1EEEE_EE_param_1];
ld.param.u64 %rd2, [_ZTSN4sycl3_V16detail18RoundedRangeKernelINS0_4itemILi1ELb1EEELi1EZZ4mainENKUlRNS0_7handlerEE_clES6_EUlNS0_2idILi1EEEE_EE_param_2];
shl.b64 %rd16, %rd20, 2;
shl.b64 %rd17, %rd2, 2;
add.s64 %rd18, %rd16, %rd17;
add.s64 %rd19, %rd11, %rd18;
shl.b64 %rd6, %rd13, 2;
$L__BB0_2: // %for.body.i
// =>This Inner Loop Header: Depth=1
mov.b32 %r1, 42;
st.global.u32 [%rd19], %r1;
add.s64 %rd20, %rd20, %rd13;
add.s64 %rd19, %rd19, %rd6;
setp.lt.u64 %p2, %rd20, %rd1;
@%p2 bra $L__BB0_2;
$L__BB0_3: // %_ZNK4sycl3_V16detail18RoundedRangeKernelINS0_4itemILi1ELb1EEELi1EZZ4mainENKUlRNS0_7handlerEE_clES6_EUlNS0_2idILi1EEEE_EclES4_.exit
ret;
// -- End function
}
// .weak _ZTSZZ4mainENKUlRN4sycl3_V17handlerEE_clES2_EUlNS0_2idILi1EEEE_ // -- Begin function _ZTSZZ4mainENKUlRN4sycl3_V17handlerEE_clES2_EUlNS0_2idILi1EEEE_
.weak .entry _ZTSZZ4mainENKUlRN4sycl3_V17handlerEE_clES2_EUlNS0_2idILi1EEEE_(
.param .u64 .ptr .global .align 4 _ZTSZZ4mainENKUlRN4sycl3_V17handlerEE_clES2_EUlNS0_2idILi1EEEE__param_0,
.param .align 8 .b8 _ZTSZZ4mainENKUlRN4sycl3_V17handlerEE_clES2_EUlNS0_2idILi1EEEE__param_1[8]
) // @_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE_clES2_EUlNS0_2idILi1EEEE_
{
.reg .b32 %r<2>;
.reg .b64 %rd<13>;
// %bb.0: // %entry
ld.param.u64 %rd1, [_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE_clES2_EUlNS0_2idILi1EEEE__param_0];
ld.param.u64 %rd2, [_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE_clES2_EUlNS0_2idILi1EEEE__param_1];
shl.b64 %rd3, %rd2, 2;
add.s64 %rd4, %rd1, %rd3;
{ // callseq 3, 0
.param .b64 retval0;
call.uni (retval0),
_Z28__spirv_GlobalInvocationId_xv,
(
);
ld.param.b64 %rd5, [retval0];
} // callseq 3
{ // callseq 4, 0
.param .b64 retval0;
call.uni (retval0),
_Z20__spirv_GlobalSize_xv,
(
);
ld.param.b64 %rd7, [retval0];
} // callseq 4
{ // callseq 5, 0
.param .b64 retval0;
call.uni (retval0),
_Z22__spirv_GlobalOffset_xv,
(
);
ld.param.b64 %rd9, [retval0];
} // callseq 5
shl.b64 %rd11, %rd5, 2;
add.s64 %rd12, %rd4, %rd11;
mov.b32 %r1, 42;
st.global.u32 [%rd12], %r1;
ret;
// -- End function
} Which is what I'd expect to see. # __CLANG_OFFLOAD_BUNDLE____START__ sycl-nvptx64-nvidia-cuda-sm_50
... the same PTX as above ...
# __CLANG_OFFLOAD_BUNDLE____END__ sycl-nvptx64-nvidia-cuda-sm_50
# __CLANG_OFFLOAD_BUNDLE____START__ host-x86_64-unknown-linux-gnu
.file "ptx_outpu.cpp"
.text
.globl main # -- Begin function main
... the rest of X86 asm ...
# __CLANG_OFFLOAD_BUNDLE____END__ host-x86_64-unknown-linux-gnu And finally, when targeting both host and device, but the |
I understand that. Both mentioned PRs have commits done by multiple authors. My understanding is that git log for squashed commit will look like this:
Does it work for you? |
Yeap, works for me. Thank you. |
@jchlanda, thanks for the usage information. Your usage model has some overlap with the use of As for the consumption, I'm more concerned about usage of something like: |
@mdtoguchi, so it seems that
Seems like |
@intel/dpcpp-doc-reviewers @mdtoguchi is there anything else you'd like me to do for this PR? Thank you. |
ping @intel/dpcpp-doc-reviewers @mdtoguchi this is ready to review |
@intel/dpcpp-doc-reviewers could you please have a look at this PR? |
@@ -348,7 +348,8 @@ and not recommended to use in production environment. | |||
**`-fsycl-device-obj=<arg>`** [EXPERIMENTAL] | |||
|
|||
Specify format of device code stored in the resulting object. The <arg> can | |||
be one of the following: "spirv" - SPIR-V is emitted, "llvmir" - LLVM-IR | |||
be one of the following: "spirv" - SPIR-V, "asm" - assembly output when | |||
possible (PTX, when targetting Nvidia devices) , or "llvmir" - LLVM-IR |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
possible (PTX, when targetting Nvidia devices) , or "llvmir" - LLVM-IR | |
possible (PTX, when targeting NVIDIA devices), or "llvmir" - LLVM-IR |
@intel/llvm-gatekeepers this is ready to merge |
@intel/llvm-gatekeepers this should be ready to go. Thank you. |
This patch adds
asm
value to-fsycl-device-obj
to enable dumping assembly (or PTX) of kernels.