@@ -988,6 +988,77 @@ merge: ; preds = %second, %first
988
988
ret void
989
989
}
990
990
991
+ ; Kernel entry point that passes its own byval i32 argument straight through
+ ; to @device_func, again as byval. The IR checks below expect the argument to
+ ; be copied into a fresh alloca (via an addrspace(101) cast and a 4-byte
+ ; memcpy) and the call to use that copy; the PTX checks expect a ld.param.u32
+ ; followed by a st.local.u32 and a st.param.b32 of the same register.
+ ; NOTE(review): the check lines look autogenerated; presumably they should be
+ ; regenerated rather than hand-edited -- confirm against the file header.
+ define ptx_kernel void @test_forward_byval_arg (ptr byval (i32 ) align 4 %input ) {
992
+ ; COMMON-LABEL: define ptx_kernel void @test_forward_byval_arg(
993
+ ; COMMON-SAME: ptr byval(i32) align 4 [[INPUT:%.*]]) #[[ATTR3]] {
994
+ ; COMMON-NEXT: [[INPUT1:%.*]] = alloca i32, align 4
995
+ ; COMMON-NEXT: [[INPUT2:%.*]] = addrspacecast ptr [[INPUT]] to ptr addrspace(101)
996
+ ; COMMON-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[INPUT1]], ptr addrspace(101) align 4 [[INPUT2]], i64 4, i1 false)
997
+ ; COMMON-NEXT: call void @device_func(ptr byval(i32) align 4 [[INPUT1]])
998
+ ; COMMON-NEXT: ret void
999
+ ;
1000
+ ; PTX-LABEL: test_forward_byval_arg(
1001
+ ; PTX: {
1002
+ ; PTX-NEXT: .local .align 4 .b8 __local_depot17[4];
1003
+ ; PTX-NEXT: .reg .b64 %SP;
1004
+ ; PTX-NEXT: .reg .b64 %SPL;
1005
+ ; PTX-NEXT: .reg .b32 %r<2>;
1006
+ ; PTX-NEXT: .reg .b64 %rd<3>;
1007
+ ; PTX-EMPTY:
1008
+ ; PTX-NEXT: // %bb.0:
1009
+ ; PTX-NEXT: mov.b64 %SPL, __local_depot17;
1010
+ ; PTX-NEXT: add.u64 %rd2, %SPL, 0;
1011
+ ; PTX-NEXT: ld.param.u32 %r1, [test_forward_byval_arg_param_0];
1012
+ ; PTX-NEXT: st.local.u32 [%rd2], %r1;
1013
+ ; PTX-NEXT: { // callseq 2, 0
1014
+ ; PTX-NEXT: .param .align 4 .b8 param0[4];
1015
+ ; PTX-NEXT: st.param.b32 [param0], %r1;
1016
+ ; PTX-NEXT: call.uni
1017
+ ; PTX-NEXT: device_func,
1018
+ ; PTX-NEXT: (
1019
+ ; PTX-NEXT: param0
1020
+ ; PTX-NEXT: );
1021
+ ; PTX-NEXT: } // callseq 2
1022
+ ; PTX-NEXT: ret;
1023
+ call void @device_func (ptr byval (i32 ) align 4 %input )
1024
+ ret void
1025
+ }
1026
+
1027
+ ; Non-kernel function taking a byval i32 pointer; its body calls @device_func
+ ; (itself) with that same argument and then returns. Under the LOWER-ARGS
+ ; prefix the argument is expected to be forwarded unchanged, while under the
+ ; COPY prefix a local alloca copy is expected to be made first and passed.
+ ; NOTE(review): the self-call has no terminating condition; presumably this
+ ; function only exists to give the kernel above a defined callee -- confirm.
+ define void @device_func (ptr byval (i32 ) align 4 %input ) {
1028
+ ; LOWER-ARGS-LABEL: define void @device_func(
1029
+ ; LOWER-ARGS-SAME: ptr byval(i32) align 4 [[INPUT:%.*]]) #[[ATTR3]] {
1030
+ ; LOWER-ARGS-NEXT: call void @device_func(ptr byval(i32) align 4 [[INPUT]])
1031
+ ; LOWER-ARGS-NEXT: ret void
1032
+ ;
1033
+ ; COPY-LABEL: define void @device_func(
1034
+ ; COPY-SAME: ptr byval(i32) align 4 [[INPUT:%.*]]) #[[ATTR3]] {
1035
+ ; COPY-NEXT: [[INPUT1:%.*]] = alloca i32, align 4
1036
+ ; COPY-NEXT: [[INPUT2:%.*]] = addrspacecast ptr [[INPUT]] to ptr addrspace(101)
1037
+ ; COPY-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[INPUT1]], ptr addrspace(101) align 4 [[INPUT2]], i64 4, i1 false)
1038
+ ; COPY-NEXT: call void @device_func(ptr byval(i32) align 4 [[INPUT1]])
1039
+ ; COPY-NEXT: ret void
1040
+ ;
1041
+ ; PTX-LABEL: device_func(
1042
+ ; PTX: {
1043
+ ; PTX-NEXT: .reg .b32 %r<2>;
1044
+ ; PTX-NEXT: .reg .b64 %rd<3>;
1045
+ ; PTX-EMPTY:
1046
+ ; PTX-NEXT: // %bb.0:
1047
+ ; PTX-NEXT: ld.param.u32 %r1, [device_func_param_0];
1048
+ ; PTX-NEXT: { // callseq 3, 0
1049
+ ; PTX-NEXT: .param .align 4 .b8 param0[4];
1050
+ ; PTX-NEXT: st.param.b32 [param0], %r1;
1051
+ ; PTX-NEXT: call.uni
1052
+ ; PTX-NEXT: device_func,
1053
+ ; PTX-NEXT: (
1054
+ ; PTX-NEXT: param0
1055
+ ; PTX-NEXT: );
1056
+ ; PTX-NEXT: } // callseq 3
1057
+ ; PTX-NEXT: ret;
1058
+ call void @device_func (ptr byval (i32 ) align 4 %input )
1059
+ ret void
1060
+ }
1061
+
991
1062
attributes #0 = { mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) "no-trapping-math" ="true" "target-cpu" ="sm_60" "target-features" ="+ptx78,+sm_60" "uniform-work-group-size" ="true" }
992
1063
attributes #1 = { nocallback nofree nounwind willreturn memory(argmem: readwrite) }
993
1064
attributes #2 = { nocallback nofree nounwind willreturn memory(argmem: write) }
0 commit comments