Skip to content

Commit d77c620

Browse files
committed
[clang][AMDGPU]: Don't use byval for struct arguments in function ABI
Summary: Byval requires allocating additional stack space, and always requires an implicit copy to be inserted in codegen, where it can be difficult to optimize. In this work, we use byref/IndirectAliased promotion method instead of byval with the implicit copy semantics. Reviewers: arsenm Differential Revision: https://reviews.llvm.org/D155986
1 parent 9e3d9c9 commit d77c620

File tree

10 files changed

+363
-36
lines changed

10 files changed

+363
-36
lines changed

clang/docs/ReleaseNotes.rst

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -193,6 +193,10 @@ Target Specific Changes
193193

194194
AMDGPU Support
195195
^^^^^^^^^^^^^^
196+
- Use pass-by-reference (byref) in stead of pass-by-value (byval) for struct
197+
arguments in C ABI. Callee is responsible for allocating stack memory and
198+
copying the value of the struct if modified. Note that AMDGPU backend still
199+
supports byval for struct arguments.
196200

197201
X86 Support
198202
^^^^^^^^^^^

clang/lib/CodeGen/CGCall.cpp

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2156,7 +2156,8 @@ static bool DetermineNoUndef(QualType QTy, CodeGenTypes &Types,
21562156
const llvm::DataLayout &DL, const ABIArgInfo &AI,
21572157
bool CheckCoerce = true) {
21582158
llvm::Type *Ty = Types.ConvertTypeForMem(QTy);
2159-
if (AI.getKind() == ABIArgInfo::Indirect)
2159+
if (AI.getKind() == ABIArgInfo::Indirect ||
2160+
AI.getKind() == ABIArgInfo::IndirectAliased)
21602161
return true;
21612162
if (AI.getKind() == ABIArgInfo::Extend)
21622163
return true;
@@ -5126,20 +5127,23 @@ RValue CodeGenFunction::EmitCall(const CGFunctionInfo &CallInfo,
51265127
auto LV = I->getKnownLValue();
51275128
auto AS = LV.getAddressSpace();
51285129

5129-
if (!ArgInfo.getIndirectByVal() ||
5130+
bool isByValOrRef =
5131+
ArgInfo.isIndirectAliased() || ArgInfo.getIndirectByVal();
5132+
5133+
if (!isByValOrRef ||
51305134
(LV.getAlignment() < getContext().getTypeAlignInChars(I->Ty))) {
51315135
NeedCopy = true;
51325136
}
51335137
if (!getLangOpts().OpenCL) {
5134-
if ((ArgInfo.getIndirectByVal() &&
5138+
if ((isByValOrRef &&
51355139
(AS != LangAS::Default &&
51365140
AS != CGM.getASTAllocaAddressSpace()))) {
51375141
NeedCopy = true;
51385142
}
51395143
}
51405144
// For OpenCL even if RV is located in default or alloca address space
51415145
// we don't want to perform address space cast for it.
5142-
else if ((ArgInfo.getIndirectByVal() &&
5146+
else if ((isByValOrRef &&
51435147
Addr.getType()->getAddressSpace() != IRFuncTy->
51445148
getParamType(FirstIRArg)->getPointerAddressSpace())) {
51455149
NeedCopy = true;

clang/lib/CodeGen/Targets/AMDGPU.cpp

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -248,6 +248,12 @@ ABIArgInfo AMDGPUABIInfo::classifyArgumentType(QualType Ty,
248248
return ABIArgInfo::getDirect();
249249
}
250250
}
251+
252+
// Use pass-by-reference in stead of pass-by-value for struct arguments in
253+
// function ABI.
254+
return ABIArgInfo::getIndirectAliased(
255+
getContext().getTypeAlignInChars(Ty),
256+
getContext().getTargetAddressSpace(LangAS::opencl_private));
251257
}
252258

253259
// Otherwise just do the default thing.

clang/test/CodeGenCUDA/kernel-args.cu

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -9,14 +9,14 @@ struct A {
99
float *p;
1010
};
1111

12-
// AMDGCN: define{{.*}} amdgpu_kernel void @_Z6kernel1A(ptr addrspace(4) byref(%struct.A) align 8 %{{.+}})
12+
// AMDGCN: define{{.*}} amdgpu_kernel void @_Z6kernel1A(ptr addrspace(4) noundef byref(%struct.A) align 8 %{{.+}})
1313
// NVPTX: define{{.*}} void @_Z6kernel1A(ptr noundef byval(%struct.A) align 8 %x)
1414
__global__ void kernel(A x) {
1515
}
1616

1717
class Kernel {
1818
public:
19-
// AMDGCN: define{{.*}} amdgpu_kernel void @_ZN6Kernel12memberKernelE1A(ptr addrspace(4) byref(%struct.A) align 8 %{{.+}})
19+
// AMDGCN: define{{.*}} amdgpu_kernel void @_ZN6Kernel12memberKernelE1A(ptr addrspace(4) noundef byref(%struct.A) align 8 %{{.+}})
2020
// NVPTX: define{{.*}} void @_ZN6Kernel12memberKernelE1A(ptr noundef byval(%struct.A) align 8 %x)
2121
static __global__ void memberKernel(A x){}
2222
template<typename T> static __global__ void templateMemberKernel(T x) {}
@@ -30,11 +30,11 @@ void launch(void*);
3030

3131
void test() {
3232
Kernel K;
33-
// AMDGCN: define{{.*}} amdgpu_kernel void @_Z14templateKernelI1AEvT_(ptr addrspace(4) byref(%struct.A) align 8 %{{.+}}
33+
// AMDGCN: define{{.*}} amdgpu_kernel void @_Z14templateKernelI1AEvT_(ptr addrspace(4) noundef byref(%struct.A) align 8 %{{.+}}
3434
// NVPTX: define{{.*}} void @_Z14templateKernelI1AEvT_(ptr noundef byval(%struct.A) align 8 %x)
3535
launch((void*)templateKernel<A>);
3636

37-
// AMDGCN: define{{.*}} amdgpu_kernel void @_ZN6Kernel20templateMemberKernelI1AEEvT_(ptr addrspace(4) byref(%struct.A) align 8 %{{.+}}
37+
// AMDGCN: define{{.*}} amdgpu_kernel void @_ZN6Kernel20templateMemberKernelI1AEEvT_(ptr addrspace(4) noundef byref(%struct.A) align 8 %{{.+}}
3838
// NVPTX: define{{.*}} void @_ZN6Kernel20templateMemberKernelI1AEEvT_(ptr noundef byval(%struct.A) align 8 %x)
3939
launch((void*)Kernel::templateMemberKernel<A>);
4040
}

clang/test/CodeGenCXX/amdgcn-func-arg.cpp

Lines changed: 10 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -19,14 +19,13 @@ void func_with_ref_arg(A &a);
1919
void func_with_ref_arg(B &b);
2020

2121
// CHECK-LABEL: @_Z22func_with_indirect_arg1A(
22-
// CHECK-SAME: ptr addrspace(5) noundef [[ARG:%.*]])
2322
// CHECK-NEXT: entry:
24-
// CHECK-NEXT: [[INDIRECT_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
23+
// CHECK-NEXT: [[A_INDIRECT_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
2524
// CHECK-NEXT: [[P:%.*]] = alloca ptr, align 8, addrspace(5)
26-
// CHECK-NEXT: [[INDIRECT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[INDIRECT_ADDR]] to ptr
25+
// CHECK-NEXT: [[A_INDIRECT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_INDIRECT_ADDR]] to ptr
2726
// CHECK-NEXT: [[P_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[P]] to ptr
28-
// CHECK-NEXT: store ptr addrspace(5) [[ARG]], ptr [[INDIRECT_ADDR_ASCAST]]
29-
// CHECK-NEXT: [[A_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A:%.*]] to ptr
27+
// CHECK-NEXT: store ptr addrspace(5) [[A:%.*]], ptr [[A_INDIRECT_ADDR_ASCAST]], align 8
28+
// CHECK-NEXT: [[A_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A]] to ptr
3029
// CHECK-NEXT: store ptr [[A_ASCAST]], ptr [[P_ASCAST]], align 8
3130
// CHECK-NEXT: ret void
3231
//
@@ -73,10 +72,12 @@ void test_indirect_arg_global() {
7372

7473
// CHECK-LABEL: @_Z19func_with_byval_arg1B(
7574
// CHECK-NEXT: entry:
75+
// CHECK-NEXT: [[COERCE:%.*]] = alloca [[CLASS_B:%.*]], align 4, addrspace(5)
7676
// CHECK-NEXT: [[P:%.*]] = alloca ptr, align 8, addrspace(5)
77+
// CHECK-NEXT: [[B:%.*]] = addrspacecast ptr addrspace(5) [[COERCE]] to ptr
7778
// CHECK-NEXT: [[P_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[P]] to ptr
78-
// CHECK-NEXT: [[B_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[B:%.*]] to ptr
79-
// CHECK-NEXT: store ptr [[B_ASCAST]], ptr [[P_ASCAST]], align 8
79+
// CHECK-NEXT: call void @llvm.memcpy.p0.p5.i64(ptr align 4 [[B]], ptr addrspace(5) align 4 [[TMP0:%.*]], i64 400, i1 false)
80+
// CHECK-NEXT: store ptr [[B]], ptr [[P_ASCAST]], align 8
8081
// CHECK-NEXT: ret void
8182
//
8283
void func_with_byval_arg(B b) {
@@ -91,7 +92,7 @@ void func_with_byval_arg(B b) {
9192
// CHECK-NEXT: [[AGG_TMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[AGG_TMP]] to ptr
9293
// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[AGG_TMP_ASCAST]], ptr align 4 [[B_ASCAST]], i64 400, i1 false)
9394
// CHECK-NEXT: [[AGG_TMP_ASCAST_ASCAST:%.*]] = addrspacecast ptr [[AGG_TMP_ASCAST]] to ptr addrspace(5)
94-
// CHECK-NEXT: call void @_Z19func_with_byval_arg1B(ptr addrspace(5) noundef byval([[CLASS_B]]) align 4 [[AGG_TMP_ASCAST_ASCAST]])
95+
// CHECK-NEXT: call void @_Z19func_with_byval_arg1B(ptr addrspace(5) noundef byref([[CLASS_B]]) align 4 [[AGG_TMP_ASCAST_ASCAST]])
9596
// CHECK-NEXT: call void @_Z17func_with_ref_argR1B(ptr noundef nonnull align 4 dereferenceable(400) [[B_ASCAST]])
9697
// CHECK-NEXT: ret void
9798
//
@@ -107,7 +108,7 @@ void test_byval_arg_auto() {
107108
// CHECK-NEXT: [[AGG_TMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[AGG_TMP]] to ptr
108109
// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[AGG_TMP_ASCAST]], ptr align 4 addrspacecast (ptr addrspace(1) @g_b to ptr), i64 400, i1 false)
109110
// CHECK-NEXT: [[AGG_TMP_ASCAST_ASCAST:%.*]] = addrspacecast ptr [[AGG_TMP_ASCAST]] to ptr addrspace(5)
110-
// CHECK-NEXT: call void @_Z19func_with_byval_arg1B(ptr addrspace(5) noundef byval([[CLASS_B]]) align 4 [[AGG_TMP_ASCAST_ASCAST]])
111+
// CHECK-NEXT: call void @_Z19func_with_byval_arg1B(ptr addrspace(5) noundef byref([[CLASS_B]]) align 4 [[AGG_TMP_ASCAST_ASCAST]])
111112
// CHECK-NEXT: call void @_Z17func_with_ref_argR1B(ptr noundef nonnull align 4 dereferenceable(400) addrspacecast (ptr addrspace(1) @g_b to ptr))
112113
// CHECK-NEXT: ret void
113114
//

clang/test/CodeGenOpenCL/addr-space-struct-arg.cl

Lines changed: 15 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -66,7 +66,9 @@ kernel void ker(global Mat3X3 *in, global Mat4X4 *out) {
6666
}
6767

6868
// X86-LABEL: define{{.*}} void @foo_large(ptr noalias sret(%struct.Mat64X64) align 4 %agg.result, ptr noundef byval(%struct.Mat32X32) align 4 %in)
69-
// AMDGCN-LABEL: define{{.*}} void @foo_large(ptr addrspace(5) noalias sret(%struct.Mat64X64) align 4 %agg.result, ptr addrspace(5) noundef byval(%struct.Mat32X32) align 4 %in)
69+
// AMDGCN-LABEL: define{{.*}} void @foo_large(ptr addrspace(5) noalias sret(%struct.Mat64X64) align 4 %agg.result, ptr addrspace(5) noundef byref(%struct.Mat32X32) align 4 %{{.*}}
70+
// AMDGCN: %in = alloca %struct.Mat32X32, align 4, addrspace(5)
71+
// AMDGCN-NEXT: call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) align 4 %in, ptr addrspace(5) align 4 %{{.*}}, i64 4096, i1 false)
7072
Mat64X64 __attribute__((noinline)) foo_large(Mat32X32 in) {
7173
Mat64X64 out;
7274
return out;
@@ -88,7 +90,9 @@ void FuncOneMember(struct StructOneMember u) {
8890
u.x = (int2)(0, 0);
8991
}
9092

91-
// AMDGCN-LABEL: define{{.*}} void @FuncOneLargeMember(ptr addrspace(5) noundef byval(%struct.LargeStructOneMember) align 8 %u)
93+
// AMDGCN-LABEL: define{{.*}} void @FuncOneLargeMember(ptr addrspace(5) noundef byref(%struct.LargeStructOneMember) align 8 %{{.*}}
94+
// AMDGCN: %u = alloca %struct.LargeStructOneMember, align 8, addrspace(5)
95+
// AMDGCN: call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) align 8 %u, ptr addrspace(5) align 8 %{{.*}}, i64 800, i1 false)
9296
// AMDGCN-NOT: addrspacecast
9397
// AMDGCN: store <2 x i32> %{{.*}}, ptr addrspace(5)
9498
void FuncOneLargeMember(struct LargeStructOneMember u) {
@@ -98,7 +102,7 @@ void FuncOneLargeMember(struct LargeStructOneMember u) {
98102
// AMDGCN20-LABEL: define{{.*}} void @test_indirect_arg_globl()
99103
// AMDGCN20: %[[byval_temp:.*]] = alloca %struct.LargeStructOneMember, align 8, addrspace(5)
100104
// AMDGCN20: call void @llvm.memcpy.p5.p1.i64(ptr addrspace(5) align 8 %[[byval_temp]], ptr addrspace(1) align 8 @g_s, i64 800, i1 false)
101-
// AMDGCN20: call void @FuncOneLargeMember(ptr addrspace(5) noundef byval(%struct.LargeStructOneMember) align 8 %[[byval_temp]])
105+
// AMDGCN20: call void @FuncOneLargeMember(ptr addrspace(5) noundef byref(%struct.LargeStructOneMember) align 8 %[[byval_temp]])
102106
#if (__OPENCL_C_VERSION__ == 200) || (__OPENCL_C_VERSION__ >= 300 && defined(__opencl_c_program_scope_global_variables))
103107
void test_indirect_arg_globl(void) {
104108
FuncOneLargeMember(g_s);
@@ -108,7 +112,7 @@ void test_indirect_arg_globl(void) {
108112
// AMDGCN-LABEL: define{{.*}} amdgpu_kernel void @test_indirect_arg_local()
109113
// AMDGCN: %[[byval_temp:.*]] = alloca %struct.LargeStructOneMember, align 8, addrspace(5)
110114
// AMDGCN: call void @llvm.memcpy.p5.p3.i64(ptr addrspace(5) align 8 %[[byval_temp]], ptr addrspace(3) align 8 @test_indirect_arg_local.l_s, i64 800, i1 false)
111-
// AMDGCN: call void @FuncOneLargeMember(ptr addrspace(5) noundef byval(%struct.LargeStructOneMember) align 8 %[[byval_temp]])
115+
// AMDGCN: call void @FuncOneLargeMember(ptr addrspace(5) noundef byref(%struct.LargeStructOneMember) align 8 %[[byval_temp]])
112116
kernel void test_indirect_arg_local(void) {
113117
local struct LargeStructOneMember l_s;
114118
FuncOneLargeMember(l_s);
@@ -117,7 +121,7 @@ kernel void test_indirect_arg_local(void) {
117121
// AMDGCN-LABEL: define{{.*}} void @test_indirect_arg_private()
118122
// AMDGCN: %[[p_s:.*]] = alloca %struct.LargeStructOneMember, align 8, addrspace(5)
119123
// AMDGCN-NOT: @llvm.memcpy
120-
// AMDGCN-NEXT: call void @FuncOneLargeMember(ptr addrspace(5) noundef byval(%struct.LargeStructOneMember) align 8 %[[p_s]])
124+
// AMDGCN-NEXT: call void @FuncOneLargeMember(ptr addrspace(5) noundef byref(%struct.LargeStructOneMember) align 8 %[[p_s]])
121125
void test_indirect_arg_private(void) {
122126
struct LargeStructOneMember p_s;
123127
FuncOneLargeMember(p_s);
@@ -142,7 +146,7 @@ kernel void KernelOneMemberSpir(global struct StructOneMember* u) {
142146
// AMDGCN-LABEL: define{{.*}} amdgpu_kernel void @KernelLargeOneMember(
143147
// AMDGCN: %[[U:.*]] = alloca %struct.LargeStructOneMember, align 8, addrspace(5)
144148
// AMDGCN: store %struct.LargeStructOneMember %u.coerce, ptr addrspace(5) %[[U]], align 8
145-
// AMDGCN: call void @FuncOneLargeMember(ptr addrspace(5) noundef byval(%struct.LargeStructOneMember) align 8 %[[U]])
149+
// AMDGCN: call void @FuncOneLargeMember(ptr addrspace(5) noundef byref(%struct.LargeStructOneMember) align 8 %[[U]])
146150
kernel void KernelLargeOneMember(struct LargeStructOneMember u) {
147151
FuncOneLargeMember(u);
148152
}
@@ -152,7 +156,10 @@ void FuncTwoMember(struct StructTwoMember u) {
152156
u.y = (int2)(0, 0);
153157
}
154158

155-
// AMDGCN-LABEL: define{{.*}} void @FuncLargeTwoMember(ptr addrspace(5) noundef byval(%struct.LargeStructTwoMember) align 8 %u)
159+
// AMDGCN-LABEL: define dso_local void @FuncLargeTwoMember
160+
// AMDGCN-SAME: (ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTTWOMEMBER:%.*]]) align 8 [[TMP0:%.*]])
161+
// AMDGCN: %[[U:.*]] = alloca %struct.LargeStructTwoMember, align 8, addrspace(5)
162+
// AMDGCN: call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) align 8 %[[U]], ptr addrspace(5) align 8 [[TMP0]], i64 480, i1 false)
156163
void FuncLargeTwoMember(struct LargeStructTwoMember u) {
157164
u.y[0] = (int2)(0, 0);
158165
}
@@ -171,7 +178,7 @@ kernel void KernelTwoMember(struct StructTwoMember u) {
171178
// AMDGCN-SAME: (%struct.LargeStructTwoMember %[[u_coerce:.*]])
172179
// AMDGCN: %[[u:.*]] = alloca %struct.LargeStructTwoMember, align 8, addrspace(5)
173180
// AMDGCN: store %struct.LargeStructTwoMember %[[u_coerce]], ptr addrspace(5) %[[u]]
174-
// AMDGCN: call void @FuncLargeTwoMember(ptr addrspace(5) noundef byval(%struct.LargeStructTwoMember) align 8 %[[u]])
181+
// AMDGCN: call void @FuncLargeTwoMember(ptr addrspace(5) noundef byref(%struct.LargeStructTwoMember) align 8 %[[u]])
175182
kernel void KernelLargeTwoMember(struct LargeStructTwoMember u) {
176183
FuncLargeTwoMember(u);
177184
}

0 commit comments

Comments
 (0)