Skip to content

[NVPTX] Fix internal indirect call prototypes not obeying the ABI #100131

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Jul 23, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 4 additions & 11 deletions libc/config/gpu/entrypoints.txt
Original file line number Diff line number Diff line change
@@ -1,13 +1,3 @@
if(LIBC_TARGET_ARCHITECTURE_IS_AMDGPU)
set(extra_entrypoints
# stdio.h entrypoints
libc.src.stdio.snprintf
libc.src.stdio.sprintf
libc.src.stdio.vsnprintf
libc.src.stdio.vsprintf
)
endif()

set(TARGET_LIBC_ENTRYPOINTS
# assert.h entrypoints
libc.src.assert.__assert_fail
Expand Down Expand Up @@ -186,13 +176,16 @@ set(TARGET_LIBC_ENTRYPOINTS
libc.src.errno.errno

# stdio.h entrypoints
${extra_entrypoints}
libc.src.stdio.clearerr
libc.src.stdio.fclose
libc.src.stdio.printf
libc.src.stdio.vprintf
libc.src.stdio.fprintf
libc.src.stdio.vfprintf
libc.src.stdio.snprintf
libc.src.stdio.sprintf
libc.src.stdio.vsnprintf
libc.src.stdio.vsprintf
libc.src.stdio.feof
libc.src.stdio.ferror
libc.src.stdio.fflush
Expand Down
5 changes: 3 additions & 2 deletions llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1429,7 +1429,6 @@ std::string NVPTXTargetLowering::getPrototype(

bool first = true;

const Function *F = CB.getFunction();
unsigned NumArgs = VAInfo ? VAInfo->first : Args.size();
for (unsigned i = 0, OIdx = 0; i != NumArgs; ++i, ++OIdx) {
Type *Ty = Args[i].Ty;
Expand Down Expand Up @@ -1471,10 +1470,12 @@ std::string NVPTXTargetLowering::getPrototype(
continue;
}

// Indirect calls need strict ABI alignment so we disable optimizations by
// not providing a function to optimize.
Type *ETy = Args[i].IndirectType;
Align InitialAlign = Outs[OIdx].Flags.getNonZeroByValAlign();
Align ParamByValAlign =
getFunctionByValParamAlign(F, ETy, InitialAlign, DL);
getFunctionByValParamAlign(/*F=*/nullptr, ETy, InitialAlign, DL);
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The comment above needs to mantion that it's the nullptr here that forces strict ABI-compliant alignment. Otherwise it's not clear which part of the code controls this.

Ideally, it would be even nicer to make "ABI-compliant alignment" a parameter.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This doesn't really make sense to me as a fix. Indirect calls shouldn't require special handling vs. direct

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The NVPTX backend thinks that internal functions can have their ABI modified, so they up-scale alignment on things. This doesn't work for these indirect calls, which seem to strictly require the ABI.


O << ".param .align " << ParamByValAlign.value() << " .b8 ";
O << "_";
Expand Down
94 changes: 94 additions & 0 deletions llvm/test/CodeGen/NVPTX/indirect_byval.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc < %s -march=nvptx64 -mcpu=sm_52 -mattr=+ptx64 | FileCheck %s
; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_52 -mattr=+ptx64 | %ptxas-verify %}

target triple = "nvptx64-nvidia-cuda"

%struct.S = type { i8 }
%struct.U = type { i64 }

@ptr = external global ptr, align 8

define internal i32 @foo() {
; CHECK-LABEL: foo(
; CHECK: {
; CHECK-NEXT: .local .align 1 .b8 __local_depot0[2];
; CHECK-NEXT: .reg .b64 %SP;
; CHECK-NEXT: .reg .b64 %SPL;
; CHECK-NEXT: .reg .b16 %rs<2>;
; CHECK-NEXT: .reg .b32 %r<3>;
; CHECK-NEXT: .reg .b64 %rd<3>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0: // %entry
; CHECK-NEXT: mov.u64 %SPL, __local_depot0;
; CHECK-NEXT: cvta.local.u64 %SP, %SPL;
; CHECK-NEXT: ld.global.u64 %rd1, [ptr];
; CHECK-NEXT: ld.u8 %rs1, [%SP+1];
; CHECK-NEXT: add.u64 %rd2, %SP, 0;
; CHECK-NEXT: { // callseq 0, 0
; CHECK-NEXT: .param .align 1 .b8 param0[1];
; CHECK-NEXT: st.param.b8 [param0+0], %rs1;
; CHECK-NEXT: .param .b64 param1;
; CHECK-NEXT: st.param.b64 [param1+0], %rd2;
; CHECK-NEXT: .param .b32 retval0;
; CHECK-NEXT: prototype_0 : .callprototype (.param .b32 _) _ (.param .align 1 .b8 _[1], .param .b64 _);
; CHECK-NEXT: call (retval0),
; CHECK-NEXT: %rd1,
; CHECK-NEXT: (
; CHECK-NEXT: param0,
; CHECK-NEXT: param1
; CHECK-NEXT: )
; CHECK-NEXT: , prototype_0;
; CHECK-NEXT: ld.param.b32 %r1, [retval0+0];
; CHECK-NEXT: } // callseq 0
; CHECK-NEXT: st.param.b32 [func_retval0+0], %r1;
; CHECK-NEXT: ret;
entry:
%s = alloca %struct.S, align 1
%agg.tmp = alloca %struct.S, align 1
%0 = load ptr, ptr @ptr, align 8
%call = call i32 %0(ptr byval(%struct.S) align 1 %agg.tmp, ptr noundef %s)
ret i32 %call
}

define internal i32 @bar() {
; CHECK-LABEL: bar(
; CHECK: // @bar
; CHECK-NEXT: {
; CHECK-NEXT: .local .align 8 .b8 __local_depot1[16];
; CHECK-NEXT: .reg .b64 %SP;
; CHECK-NEXT: .reg .b64 %SPL;
; CHECK-NEXT: .reg .b32 %r<3>;
; CHECK-NEXT: .reg .b64 %rd<4>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0: // %entry
; CHECK-NEXT: mov.u64 %SPL, __local_depot1;
; CHECK-NEXT: cvta.local.u64 %SP, %SPL;
; CHECK-NEXT: ld.global.u64 %rd1, [ptr];
; CHECK-NEXT: ld.u64 %rd2, [%SP+8];
; CHECK-NEXT: add.u64 %rd3, %SP, 0;
; CHECK-NEXT: { // callseq 1, 0
; CHECK-NEXT: .param .align 8 .b8 param0[8];
; CHECK-NEXT: st.param.b64 [param0+0], %rd2;
; CHECK-NEXT: .param .b64 param1;
; CHECK-NEXT: st.param.b64 [param1+0], %rd3;
; CHECK-NEXT: .param .b32 retval0;
; CHECK-NEXT: prototype_1 : .callprototype (.param .b32 _) _ (.param .align 8 .b8 _[8], .param .b64 _);
; CHECK-NEXT: call (retval0),
; CHECK-NEXT: %rd1,
; CHECK-NEXT: (
; CHECK-NEXT: param0,
; CHECK-NEXT: param1
; CHECK-NEXT: )
; CHECK-NEXT: , prototype_1;
; CHECK-NEXT: ld.param.b32 %r1, [retval0+0];
; CHECK-NEXT: } // callseq 1
; CHECK-NEXT: st.param.b32 [func_retval0+0], %r1;
; CHECK-NEXT: ret;
entry:
%s = alloca %struct.U, align 8
%agg.tmp = alloca %struct.U, align 8
%0 = load ptr, ptr @ptr, align 8
%call = call noundef i32 %0(ptr byval(%struct.U) align 8 %agg.tmp, ptr %s)
ret i32 %call
}
Loading