Skip to content

Commit 4b50ec4

Browse files
authored
[Clang] Avoid Using byval for ndrange_t when emitting __enqueue_kernel_basic (#116435)
AMDGPU disabled the use of `byval` for struct argument passing in commit d77c620. However, when emitting `__enqueue_kernel_basic`, Clang still adds the `byval` attribute by default. Emitting the `byval` attribute by default in this context doesn’t seem like a good idea, as argument-passing conventions are highly target-dependent, and assumptions here could lead to issues. This PR removes the addition of the `byval` attribute, aligning the behavior with other `__enqueue_kernel_*` functions.
1 parent 34ebfab commit 4b50ec4

File tree

4 files changed

+18
-25
lines changed

4 files changed

+18
-25
lines changed

clang/lib/CodeGen/CGBuiltin.cpp

Lines changed: 2 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -5985,15 +5985,8 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID,
59855985
llvm::Value *Block =
59865986
Builder.CreatePointerCast(Info.BlockArg, GenericVoidPtrTy);
59875987

5988-
AttrBuilder B(Builder.getContext());
5989-
B.addByValAttr(NDRangeL.getAddress().getElementType());
5990-
llvm::AttributeList ByValAttrSet =
5991-
llvm::AttributeList::get(CGM.getModule().getContext(), 3U, B);
5992-
5993-
auto RTCall =
5994-
EmitRuntimeCall(CGM.CreateRuntimeFunction(FTy, Name, ByValAttrSet),
5995-
{Queue, Flags, Range, Kernel, Block});
5996-
RTCall->setAttributes(ByValAttrSet);
5988+
auto RTCall = EmitRuntimeCall(CGM.CreateRuntimeFunction(FTy, Name),
5989+
{Queue, Flags, Range, Kernel, Block});
59975990
return RValue::get(RTCall);
59985991
}
59995992
assert(NumArgs >= 5 && "Invalid enqueue_kernel signature");

clang/test/CodeGenOpenCL/amdgpu-enqueue-kernel.cl

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -140,7 +140,7 @@ kernel void test_target_features_kernel(global int *i) {
140140
// NOCPU-NEXT: [[BLOCK_CAPTURED1:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, ptr addrspace(1), i8 }>, ptr [[BLOCK_ASCAST]], i32 0, i32 4
141141
// NOCPU-NEXT: [[TMP3:%.*]] = load i8, ptr [[B_ADDR_ASCAST]], align 1
142142
// NOCPU-NEXT: store i8 [[TMP3]], ptr [[BLOCK_CAPTURED1]], align 8
143-
// NOCPU-NEXT: [[TMP4:%.*]] = call i32 @__enqueue_kernel_basic(ptr addrspace(1) [[TMP0]], i32 [[TMP1]], ptr byval([[STRUCT_NDRANGE_T]]) [[TMP_ASCAST]], ptr @__test_block_invoke_kernel, ptr [[BLOCK_ASCAST]])
143+
// NOCPU-NEXT: [[TMP4:%.*]] = call i32 @__enqueue_kernel_basic(ptr addrspace(1) [[TMP0]], i32 [[TMP1]], ptr [[TMP_ASCAST]], ptr @__test_block_invoke_kernel, ptr [[BLOCK_ASCAST]])
144144
// NOCPU-NEXT: [[TMP5:%.*]] = load ptr addrspace(1), ptr [[DEFAULT_QUEUE_ASCAST]], align 8
145145
// NOCPU-NEXT: [[TMP6:%.*]] = load i32, ptr [[FLAGS_ASCAST]], align 4
146146
// NOCPU-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[TMP2_ASCAST]], ptr align 4 [[NDRANGE_ASCAST]], i64 4, i1 false)
@@ -162,7 +162,7 @@ kernel void test_target_features_kernel(global int *i) {
162162
// NOCPU-NEXT: [[BLOCK_CAPTURED10:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr [[BLOCK3_ASCAST]], i32 0, i32 5
163163
// NOCPU-NEXT: [[TMP10:%.*]] = load i64, ptr [[D_ADDR_ASCAST]], align 8
164164
// NOCPU-NEXT: store i64 [[TMP10]], ptr [[BLOCK_CAPTURED10]], align 8
165-
// NOCPU-NEXT: [[TMP11:%.*]] = call i32 @__enqueue_kernel_basic(ptr addrspace(1) [[TMP5]], i32 [[TMP6]], ptr byval([[STRUCT_NDRANGE_T]]) [[TMP2_ASCAST]], ptr @__test_block_invoke_2_kernel, ptr [[BLOCK3_ASCAST]])
165+
// NOCPU-NEXT: [[TMP11:%.*]] = call i32 @__enqueue_kernel_basic(ptr addrspace(1) [[TMP5]], i32 [[TMP6]], ptr [[TMP2_ASCAST]], ptr @__test_block_invoke_2_kernel, ptr [[BLOCK3_ASCAST]])
166166
// NOCPU-NEXT: [[TMP12:%.*]] = load ptr addrspace(1), ptr [[DEFAULT_QUEUE_ASCAST]], align 8
167167
// NOCPU-NEXT: [[TMP13:%.*]] = load i32, ptr [[FLAGS_ASCAST]], align 4
168168
// NOCPU-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[TMP11_ASCAST]], ptr align 4 [[NDRANGE_ASCAST]], i64 4, i1 false)
@@ -204,7 +204,7 @@ kernel void test_target_features_kernel(global int *i) {
204204
// NOCPU-NEXT: [[TMP23:%.*]] = load i32, ptr [[FLAGS_ASCAST]], align 4
205205
// NOCPU-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[TMP27_ASCAST]], ptr align 4 [[NDRANGE_ASCAST]], i64 4, i1 false)
206206
// NOCPU-NEXT: [[TMP24:%.*]] = load ptr, ptr [[BLOCK20_ASCAST]], align 8
207-
// NOCPU-NEXT: [[TMP25:%.*]] = call i32 @__enqueue_kernel_basic(ptr addrspace(1) [[TMP22]], i32 [[TMP23]], ptr byval([[STRUCT_NDRANGE_T]]) [[TMP27_ASCAST]], ptr @__test_block_invoke_4_kernel, ptr [[BLOCK21_ASCAST]])
207+
// NOCPU-NEXT: [[TMP25:%.*]] = call i32 @__enqueue_kernel_basic(ptr addrspace(1) [[TMP22]], i32 [[TMP23]], ptr [[TMP27_ASCAST]], ptr @__test_block_invoke_4_kernel, ptr [[BLOCK21_ASCAST]])
208208
// NOCPU-NEXT: ret void
209209
//
210210
//
@@ -365,7 +365,7 @@ kernel void test_target_features_kernel(global int *i) {
365365
// NOCPU-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr [[DEFAULT_QUEUE_ASCAST]], align 8
366366
// NOCPU-NEXT: [[TMP2:%.*]] = load i32, ptr [[FLAGS_ASCAST]], align 4
367367
// NOCPU-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[TMP_ASCAST]], ptr align 4 [[NDRANGE_ASCAST]], i64 4, i1 false)
368-
// NOCPU-NEXT: [[TMP3:%.*]] = call i32 @__enqueue_kernel_basic(ptr addrspace(1) [[TMP1]], i32 [[TMP2]], ptr byval([[STRUCT_NDRANGE_T]]) [[TMP_ASCAST]], ptr @__test_target_features_kernel_block_invoke_kernel, ptr addrspacecast (ptr addrspace(1) @__block_literal_global to ptr))
368+
// NOCPU-NEXT: [[TMP3:%.*]] = call i32 @__enqueue_kernel_basic(ptr addrspace(1) [[TMP1]], i32 [[TMP2]], ptr [[TMP_ASCAST]], ptr @__test_target_features_kernel_block_invoke_kernel, ptr addrspacecast (ptr addrspace(1) @__block_literal_global to ptr))
369369
// NOCPU-NEXT: ret void
370370
//
371371
//
@@ -486,7 +486,7 @@ kernel void test_target_features_kernel(global int *i) {
486486
// GFX900-NEXT: [[BLOCK_CAPTURED1:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, ptr addrspace(1), i8 }>, ptr [[BLOCK_ASCAST]], i32 0, i32 4
487487
// GFX900-NEXT: [[TMP3:%.*]] = load i8, ptr [[B_ADDR_ASCAST]], align 1, !tbaa [[TBAA13]]
488488
// GFX900-NEXT: store i8 [[TMP3]], ptr [[BLOCK_CAPTURED1]], align 8, !tbaa [[TBAA13]]
489-
// GFX900-NEXT: [[TMP4:%.*]] = call i32 @__enqueue_kernel_basic(ptr addrspace(1) [[TMP0]], i32 [[TMP1]], ptr byval([[STRUCT_NDRANGE_T]]) [[TMP_ASCAST]], ptr @__test_block_invoke_kernel, ptr [[BLOCK_ASCAST]])
489+
// GFX900-NEXT: [[TMP4:%.*]] = call i32 @__enqueue_kernel_basic(ptr addrspace(1) [[TMP0]], i32 [[TMP1]], ptr [[TMP_ASCAST]], ptr @__test_block_invoke_kernel, ptr [[BLOCK_ASCAST]])
490490
// GFX900-NEXT: [[TMP5:%.*]] = load ptr addrspace(1), ptr [[DEFAULT_QUEUE_ASCAST]], align 8, !tbaa [[TBAA16]]
491491
// GFX900-NEXT: [[TMP6:%.*]] = load i32, ptr [[FLAGS_ASCAST]], align 4, !tbaa [[TBAA14]]
492492
// GFX900-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[TMP2_ASCAST]], ptr align 4 [[NDRANGE_ASCAST]], i64 4, i1 false), !tbaa.struct [[TBAA_STRUCT18]]
@@ -508,7 +508,7 @@ kernel void test_target_features_kernel(global int *i) {
508508
// GFX900-NEXT: [[BLOCK_CAPTURED10:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr [[BLOCK3_ASCAST]], i32 0, i32 5
509509
// GFX900-NEXT: [[TMP10:%.*]] = load i64, ptr [[D_ADDR_ASCAST]], align 8, !tbaa [[TBAA3]]
510510
// GFX900-NEXT: store i64 [[TMP10]], ptr [[BLOCK_CAPTURED10]], align 8, !tbaa [[TBAA3]]
511-
// GFX900-NEXT: [[TMP11:%.*]] = call i32 @__enqueue_kernel_basic(ptr addrspace(1) [[TMP5]], i32 [[TMP6]], ptr byval([[STRUCT_NDRANGE_T]]) [[TMP2_ASCAST]], ptr @__test_block_invoke_2_kernel, ptr [[BLOCK3_ASCAST]])
511+
// GFX900-NEXT: [[TMP11:%.*]] = call i32 @__enqueue_kernel_basic(ptr addrspace(1) [[TMP5]], i32 [[TMP6]], ptr [[TMP2_ASCAST]], ptr @__test_block_invoke_2_kernel, ptr [[BLOCK3_ASCAST]])
512512
// GFX900-NEXT: [[TMP12:%.*]] = load ptr addrspace(1), ptr [[DEFAULT_QUEUE_ASCAST]], align 8, !tbaa [[TBAA16]]
513513
// GFX900-NEXT: [[TMP13:%.*]] = load i32, ptr [[FLAGS_ASCAST]], align 4, !tbaa [[TBAA14]]
514514
// GFX900-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[TMP11_ASCAST]], ptr align 4 [[NDRANGE_ASCAST]], i64 4, i1 false), !tbaa.struct [[TBAA_STRUCT18]]
@@ -553,7 +553,7 @@ kernel void test_target_features_kernel(global int *i) {
553553
// GFX900-NEXT: [[TMP23:%.*]] = load i32, ptr [[FLAGS_ASCAST]], align 4, !tbaa [[TBAA14]]
554554
// GFX900-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[TMP27_ASCAST]], ptr align 4 [[NDRANGE_ASCAST]], i64 4, i1 false), !tbaa.struct [[TBAA_STRUCT18]]
555555
// GFX900-NEXT: [[TMP24:%.*]] = load ptr, ptr [[BLOCK20_ASCAST]], align 8, !tbaa [[TBAA13]]
556-
// GFX900-NEXT: [[TMP25:%.*]] = call i32 @__enqueue_kernel_basic(ptr addrspace(1) [[TMP22]], i32 [[TMP23]], ptr byval([[STRUCT_NDRANGE_T]]) [[TMP27_ASCAST]], ptr @__test_block_invoke_4_kernel, ptr [[BLOCK21_ASCAST]])
556+
// GFX900-NEXT: [[TMP25:%.*]] = call i32 @__enqueue_kernel_basic(ptr addrspace(1) [[TMP22]], i32 [[TMP23]], ptr [[TMP27_ASCAST]], ptr @__test_block_invoke_4_kernel, ptr [[BLOCK21_ASCAST]])
557557
// GFX900-NEXT: call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[BLOCK20]]) #[[ATTR8]]
558558
// GFX900-NEXT: call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) [[NDRANGE]]) #[[ATTR8]]
559559
// GFX900-NEXT: call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) [[FLAGS]]) #[[ATTR8]]
@@ -709,7 +709,7 @@ kernel void test_target_features_kernel(global int *i) {
709709
// GFX900-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr [[DEFAULT_QUEUE_ASCAST]], align 8, !tbaa [[TBAA16]]
710710
// GFX900-NEXT: [[TMP2:%.*]] = load i32, ptr [[FLAGS_ASCAST]], align 4, !tbaa [[TBAA14]]
711711
// GFX900-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[TMP_ASCAST]], ptr align 4 [[NDRANGE_ASCAST]], i64 4, i1 false), !tbaa.struct [[TBAA_STRUCT18]]
712-
// GFX900-NEXT: [[TMP3:%.*]] = call i32 @__enqueue_kernel_basic(ptr addrspace(1) [[TMP1]], i32 [[TMP2]], ptr byval([[STRUCT_NDRANGE_T]]) [[TMP_ASCAST]], ptr @__test_target_features_kernel_block_invoke_kernel, ptr addrspacecast (ptr addrspace(1) @__block_literal_global to ptr))
712+
// GFX900-NEXT: [[TMP3:%.*]] = call i32 @__enqueue_kernel_basic(ptr addrspace(1) [[TMP1]], i32 [[TMP2]], ptr [[TMP_ASCAST]], ptr @__test_target_features_kernel_block_invoke_kernel, ptr addrspacecast (ptr addrspace(1) @__block_literal_global to ptr))
713713
// GFX900-NEXT: call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) [[NDRANGE]]) #[[ATTR8]]
714714
// GFX900-NEXT: call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) [[FLAGS]]) #[[ATTR8]]
715715
// GFX900-NEXT: call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[DEFAULT_QUEUE]]) #[[ATTR8]]

clang/test/CodeGenOpenCL/cl20-device-side-enqueue-attributes.cl

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -55,7 +55,7 @@ kernel void device_side_enqueue(global float *a, global float *b, int i) {
5555
// SPIR32-NEXT: [[TMP4:%.*]] = load ptr addrspace(1), ptr [[B_ADDR]], align 4
5656
// SPIR32-NEXT: store ptr addrspace(1) [[TMP4]], ptr [[BLOCK_CAPTURED2]], align 4
5757
// SPIR32-NEXT: [[TMP5:%.*]] = addrspacecast ptr [[BLOCK]] to ptr addrspace(4)
58-
// SPIR32-NEXT: [[TMP6:%.*]] = call spir_func i32 @__enqueue_kernel_basic(target("spirv.Queue") [[TMP0]], i32 [[TMP1]], ptr byval([[STRUCT_NDRANGE_T]]) [[TMP]], ptr addrspace(4) addrspacecast (ptr @__device_side_enqueue_block_invoke_kernel to ptr addrspace(4)), ptr addrspace(4) [[TMP5]])
58+
// SPIR32-NEXT: [[TMP6:%.*]] = call spir_func i32 @__enqueue_kernel_basic(target("spirv.Queue") [[TMP0]], i32 [[TMP1]], ptr [[TMP]], ptr addrspace(4) addrspacecast (ptr @__device_side_enqueue_block_invoke_kernel to ptr addrspace(4)), ptr addrspace(4) [[TMP5]])
5959
// SPIR32-NEXT: ret void
6060
//
6161
//
@@ -126,7 +126,7 @@ kernel void device_side_enqueue(global float *a, global float *b, int i) {
126126
// STRICTFP-NEXT: [[TMP4:%.*]] = load ptr addrspace(1), ptr [[B_ADDR]], align 4
127127
// STRICTFP-NEXT: store ptr addrspace(1) [[TMP4]], ptr [[BLOCK_CAPTURED2]], align 4
128128
// STRICTFP-NEXT: [[TMP5:%.*]] = addrspacecast ptr [[BLOCK]] to ptr addrspace(4)
129-
// STRICTFP-NEXT: [[TMP6:%.*]] = call spir_func i32 @__enqueue_kernel_basic(target("spirv.Queue") [[TMP0]], i32 [[TMP1]], ptr byval([[STRUCT_NDRANGE_T]]) [[TMP]], ptr addrspace(4) addrspacecast (ptr @__device_side_enqueue_block_invoke_kernel to ptr addrspace(4)), ptr addrspace(4) [[TMP5]])
129+
// STRICTFP-NEXT: [[TMP6:%.*]] = call spir_func i32 @__enqueue_kernel_basic(target("spirv.Queue") [[TMP0]], i32 [[TMP1]], ptr [[TMP]], ptr addrspace(4) addrspacecast (ptr @__device_side_enqueue_block_invoke_kernel to ptr addrspace(4)), ptr addrspace(4) [[TMP5]]) #[[ATTR5]]
130130
// STRICTFP-NEXT: ret void
131131
//
132132
//

clang/test/CodeGenOpenCL/cl20-device-side-enqueue.cl

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -89,8 +89,8 @@ kernel void device_side_enqueue(global int *a, global int *b, int i) {
8989
// COMMON: store ptr addrspace(4) addrspacecast (ptr [[INVL1:@__device_side_enqueue_block_invoke[^ ]*]] to ptr addrspace(4)), ptr %block.invoke
9090
// COMMON: [[BL_I8:%[0-9]+]] ={{.*}} addrspacecast ptr %block to ptr addrspace(4)
9191
// COMMON-LABEL: call {{(spir_func )?}}i32 @__enqueue_kernel_basic(
92-
// SPIR-SAME: target("spirv.Queue") [[DEF_Q]], i32 [[FLAGS]], ptr byval(%struct.ndrange_t) [[NDR]]{{([0-9]+)?}},
93-
// X86-SAME: ptr [[DEF_Q]], i32 [[FLAGS]], ptr byval(%struct.ndrange_t) [[NDR]]{{([0-9]+)?}},
92+
// SPIR-SAME: target("spirv.Queue") [[DEF_Q]], i32 [[FLAGS]], ptr [[NDR]]{{([0-9]+)?}},
93+
// X86-SAME: ptr [[DEF_Q]], i32 [[FLAGS]], ptr [[NDR]]{{([0-9]+)?}},
9494
// COMMON-SAME: ptr addrspace(4) addrspacecast (ptr [[INVLK1:[^ ]+_kernel]] to ptr addrspace(4)),
9595
// COMMON-SAME: ptr addrspace(4) [[BL_I8]])
9696
enqueue_kernel(default_queue, flags, ndrange,
@@ -333,8 +333,8 @@ kernel void device_side_enqueue(global int *a, global int *b, int i) {
333333
// X86: [[DEF_Q:%[0-9]+]] = load ptr, ptr %default_queue
334334
// COMMON: [[FLAGS:%[0-9]+]] = load i32, ptr %flags
335335
// COMMON-LABEL: call {{(spir_func )?}}i32 @__enqueue_kernel_basic(
336-
// SPIR-SAME: target("spirv.Queue") [[DEF_Q]], i32 [[FLAGS]], ptr byval(%struct.ndrange_t) [[NDR]]{{([0-9]+)?}},
337-
// X86-SAME: ptr [[DEF_Q]], i32 [[FLAGS]], ptr byval(%struct.ndrange_t) [[NDR]]{{([0-9]+)?}},
336+
// SPIR-SAME: target("spirv.Queue") [[DEF_Q]], i32 [[FLAGS]], ptr [[NDR]]{{([0-9]+)?}},
337+
// X86-SAME: ptr [[DEF_Q]], i32 [[FLAGS]], ptr [[NDR]]{{([0-9]+)?}},
338338
// COMMON-SAME: ptr addrspace(4) addrspacecast (ptr [[INVGK8:[^ ]+_kernel]] to ptr addrspace(4)),
339339
// COMMON-SAME: ptr addrspace(4) addrspacecast (ptr addrspace(1) [[BLG8]] to ptr addrspace(4)))
340340
enqueue_kernel(default_queue, flags, ndrange, block_A);
@@ -382,8 +382,8 @@ kernel void device_side_enqueue(global int *a, global int *b, int i) {
382382
// COMMON: [[FLAGS:%[0-9]+]] = load i32, ptr %flags
383383
// COMMON: [[BL_I8:%[0-9]+]] ={{.*}} addrspacecast ptr {{.*}} to ptr addrspace(4)
384384
// COMMON-LABEL: call {{(spir_func )?}}i32 @__enqueue_kernel_basic(
385-
// SPIR-SAME: target("spirv.Queue") [[DEF_Q]], i32 [[FLAGS]], ptr byval(%struct.ndrange_t) [[NDR]]{{([0-9]+)?}},
386-
// X86-SAME: ptr [[DEF_Q]], i32 [[FLAGS]], ptr byval(%struct.ndrange_t) [[NDR]]{{([0-9]+)?}},
385+
// SPIR-SAME: target("spirv.Queue") [[DEF_Q]], i32 [[FLAGS]], ptr [[NDR]]{{([0-9]+)?}},
386+
// X86-SAME: ptr [[DEF_Q]], i32 [[FLAGS]], ptr [[NDR]]{{([0-9]+)?}},
387387
// COMMON-SAME: ptr addrspace(4) addrspacecast (ptr [[INVLK3:[^ ]+_kernel]] to ptr addrspace(4)),
388388
// COMMON-SAME: ptr addrspace(4) [[BL_I8]])
389389
enqueue_kernel(default_queue, flags, ndrange, block_C);

0 commit comments

Comments
 (0)