Skip to content

[SYCL][FPGA] Fix invalid memory copying of struct using fpga_reg #3865

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 6 additions & 10 deletions clang/lib/CodeGen/CGBuiltin.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -17856,22 +17856,18 @@ RValue CodeGenFunction::EmitIntelFPGARegBuiltin(const CallExpr *E,
ReturnValueSlot ReturnValue) {
const Expr *PtrArg = E->getArg(0);
QualType ArgType = PtrArg->getType();
llvm::Value *V = nullptr;
StringRef AnnotStr = "__builtin_intel_fpga_reg";

if (ArgType->isStructureOrClassType() || ArgType->isUnionType()) {
RValue RV = EmitAnyExpr(PtrArg);
Address A = EmitIntelFPGAFieldAnnotations(E->getExprLoc(),
RV.getAggregateAddress(),
AnnotStr);
llvm::Type *VTy = ReturnValue.getValue().getPointer()->getType();
uint64_t SizeVal = CGM.getDataLayout().getTypeAllocSize(VTy);
Builder.CreateMemCpy(ReturnValue.getValue(), A, SizeVal, false);
if (ArgType->isRecordType()) {
Address DstAddr = ReturnValue.getValue();
EmitAnyExprToMem(PtrArg, DstAddr, ArgType.getQualifiers(), true);
Address A =
EmitIntelFPGAFieldAnnotations(E->getExprLoc(), DstAddr, AnnotStr);
return RValue::getAggregate(A);
}

// if scalar type
V = EmitScalarExpr(PtrArg);
llvm::Value *V = EmitScalarExpr(PtrArg);

// llvm.annotation does not accept anything but integer types.
llvm::Type *OrigVType = V->getType();
Expand Down
61 changes: 21 additions & 40 deletions clang/test/CodeGenSYCL/intel-fpga-reg.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,9 @@
struct st {
int a;
float b;
char c;
};
// CHECK: [[T_ST:%struct[a-zA-Z0-9_.]*.st]] = type { i32, float }
// CHECK: [[T_ST:%struct[a-zA-Z0-9_.]*.st]] = type { i32, float, i8 }

union un {
int a;
Expand Down Expand Up @@ -67,41 +68,31 @@ void structs() {
// CHECK-NEXT: [[S1_ASCAST:%.*]] = addrspacecast [[T_ST]]* [[S1]] to [[T_ST]] addrspace(4)*
// CHECK-NEXT: [[S2:%.*]] = alloca [[T_ST]], align 4
// CHECK-NEXT: [[S2_ASCAST:%.*]] = addrspacecast [[T_ST]]* [[S2]] to [[T_ST]] addrspace(4)*
// CHECK-NEXT: [[AGG_TEMP:%.*]] = alloca [[T_ST]], align 4
// CHECK-NEXT: [[AGG_TEMP_ASCAST:%.*]] = addrspacecast [[T_ST]]* [[AGG_TEMP]] to [[T_ST]] addrspace(4)*
// CHECK-NEXT: [[S3:%.*]] = alloca [[T_ST]], align 4
// CHECK-NEXT: [[S3_ASCAST:%.*]] = addrspacecast [[T_ST]]* [[S3]] to [[T_ST]] addrspace(4)*
// CHECK-NEXT: [[REF_TMP:%.*]] = alloca [[T_ST]], align 4
// CHECK-NEXT: [[REF_TMP_ASCAST:%.*]] = addrspacecast [[T_ST]]* [[REF_TMP]] to [[T_ST]] addrspace(4)*
// CHECK-NEXT: [[AGG_TEMP2:%.*]] = alloca [[T_ST]], align 4
// CHECK-NEXT: [[AGG_TEMP2_ASCAST:%.*]] = addrspacecast [[T_ST]]* [[AGG_TEMP2]] to [[T_ST]] addrspace(4)*
struct st s1;

struct st s2 = __builtin_intel_fpga_reg(s1);
// CHECK: [[TMP_S1:%.*]] = bitcast [[T_ST]] addrspace(4)* [[AGG_TEMP_ASCAST]] to i8 addrspace(4)*
// CHECK: [[TMP_S1:%.*]] = bitcast [[T_ST]] addrspace(4)* [[S2_ASCAST]] to i8 addrspace(4)*
// CHECK-NEXT: [[TMP_S2:%.*]] = bitcast [[T_ST]] addrspace(4)* [[S1_ASCAST]] to i8 addrspace(4)*
// CHECK-NEXT: call void @llvm.memcpy.p4i8.p4i8.i64(i8 addrspace(4)* align 4 [[TMP_S1]], i8 addrspace(4)* align 4 [[TMP_S2]], i64 8, i1 false)
// CHECK-NEXT: [[TMP_S3:%.*]] = bitcast [[T_ST]] addrspace(4)* [[AGG_TEMP_ASCAST]] to i8 addrspace(4)*
// CHECK-NEXT: call void @llvm.memcpy.p4i8.p4i8.i64(i8 addrspace(4)* align 4 [[TMP_S1]], i8 addrspace(4)* align 4 [[TMP_S2]], i64 12, i1 false)
// CHECK-NEXT: [[TMP_S3:%.*]] = bitcast [[T_ST]] addrspace(4)* [[S2_ASCAST]] to i8 addrspace(4)*
// CHECK-NEXT: [[TMP_S4:%.*]] = call i8 addrspace(4)* @llvm.ptr.annotation.p4i8(i8 addrspace(4)* [[TMP_S3]], [[BIFR_STR]]
// CHECK-NEXT: [[TMP_S5:%.*]] = bitcast i8 addrspace(4)* [[TMP_S4]] to [[T_ST]] addrspace(4)*
// CHECK-NEXT: [[TMP_S6:%.*]] = bitcast [[T_ST]] addrspace(4)* [[S2_ASCAST]] to i8 addrspace(4)*
// CHECK-NEXT: [[TMP_S7:%.*]] = bitcast [[T_ST]] addrspace(4)* [[TMP_S5]] to i8 addrspace(4)*
// CHECK-NEXT: call void @llvm.memcpy.p4i8.p4i8.i64(i8 addrspace(4)* align 4 [[TMP_S6]], i8 addrspace(4)* align 4 [[TMP_S7]], i64 8, i1 false)

struct st s3;
s3 = __builtin_intel_fpga_reg(s2);
// CHECK: [[TMP_S8:%.*]] = bitcast [[T_ST]] addrspace(4)* [[AGG_TEMP2_ASCAST]] to i8 addrspace(4)*
// CHECK-NEXT: [[TMP_S9:%.*]] = bitcast [[T_ST]] addrspace(4)* [[S2_ASCAST]] to i8 addrspace(4)*
// CHECK-NEXT: call void @llvm.memcpy.p4i8.p4i8.i64(i8 addrspace(4)* align 4 [[TMP_S8]], i8 addrspace(4)* align 4 [[TMP_S9]], i64 8, i1 false)
// CHECK-NEXT: [[TMP_S10:%.*]] = bitcast [[T_ST]] addrspace(4)* [[AGG_TEMP2_ASCAST]] to i8 addrspace(4)*
// CHECK-NEXT: [[TMP_S11:%.*]] = call i8 addrspace(4)* @llvm.ptr.annotation.p4i8(i8 addrspace(4)* [[TMP_S10]], [[BIFR_STR]]
// CHECK-NEXT: [[TMP_S12:%.*]] = bitcast i8 addrspace(4)* [[TMP_S11]] to [[T_ST]] addrspace(4)*
// CHECK-NEXT: [[TMP_S13:%.*]] = bitcast [[T_ST]] addrspace(4)* [[REF_TMP_ASCAST]] to i8 addrspace(4)*
// CHECK-NEXT: [[TMP_S14:%.*]] = bitcast [[T_ST]] addrspace(4)* [[TMP_S12]] to i8 addrspace(4)*
// CHECK-NEXT: call void @llvm.memcpy.p4i8.p4i8.i64(i8 addrspace(4)* align 4 [[TMP_S13]], i8 addrspace(4)* align 4 [[TMP_S14]], i64 8, i1 false)
// CHECK-NEXT: [[TMP_S15:%.*]] = bitcast [[T_ST]] addrspace(4)* [[S3_ASCAST]] to i8 addrspace(4)*
// CHECK-NEXT: [[TMP_S16:%.*]] = bitcast [[T_ST]] addrspace(4)* [[REF_TMP_ASCAST]] to i8 addrspace(4)*
// CHECK-NEXT: call void @llvm.memcpy.p4i8.p4i8.i64(i8 addrspace(4)* align 4 [[TMP_S15]], i8 addrspace(4)* align 4 [[TMP_S16]], i64 8, i1 false)
// CHECK: [[TMP_S6:%.*]] = bitcast [[T_ST]] addrspace(4)* [[REF_TMP_ASCAST]] to i8 addrspace(4)*
// CHECK-NEXT: [[TMP_S7:%.*]] = bitcast [[T_ST]] addrspace(4)* [[S2_ASCAST]] to i8 addrspace(4)*
// CHECK-NEXT: call void @llvm.memcpy.p4i8.p4i8.i64(i8 addrspace(4)* align 4 [[TMP_S6]], i8 addrspace(4)* align 4 [[TMP_S7]], i64 12, i1 false)
// CHECK-NEXT: [[TMP_S8:%.*]] = bitcast [[T_ST]] addrspace(4)* [[REF_TMP_ASCAST]] to i8 addrspace(4)*
// CHECK-NEXT: [[TMP_S9:%.*]] = call i8 addrspace(4)* @llvm.ptr.annotation.p4i8(i8 addrspace(4)* [[TMP_S8]], [[BIFR_STR]]
// CHECK-NEXT: [[TMP_S10:%.*]] = bitcast i8 addrspace(4)* [[TMP_S9]] to [[T_ST]] addrspace(4)*
// CHECK-NEXT: [[TMP_S11:%.*]] = bitcast [[T_ST]] addrspace(4)* [[S3_ASCAST]] to i8 addrspace(4)*
// CHECK-NEXT: [[TMP_S12:%.*]] = bitcast [[T_ST]] addrspace(4)* [[REF_TMP_ASCAST]] to i8 addrspace(4)*
// CHECK-NEXT: call void @llvm.memcpy.p4i8.p4i8.i64(i8 addrspace(4)* align 4 [[TMP_S11]], i8 addrspace(4)* align 4 [[TMP_S12]], i64 12, i1 false)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why are two memcpy's still needed here?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

AST for the following instruction: struct st s2 = __builtin_intel_fpga_reg(s1); is:

DeclStmt 0x555565a32cf8
`-VarDecl 0x555565a326d0  used s2 'struct st':'struct st' cinit
  `-ExprWithCleanups 0x555565a32ce0 'struct st':'struct st'
    `-CXXConstructExpr 0x555565a32cb0 'struct st':'struct st' 'void (struct st &&) noexcept' elidable
      `-MaterializeTemporaryExpr 0x555565a329e8 'struct st':'struct st' xvalue
        `-CallExpr 0x555565a329c0 'struct st':'struct st'
          |-ImplicitCastExpr 0x555565a329a8 'void (*)(...) noexcept' <BuiltinFnToFnPtr>
          | `-DeclRefExpr 0x555565a32910 '<builtin fn type>' Function 0x555565a327f0 '__builtin_intel_fpga_reg' 'void (...) noexcept'
          `-DeclRefExpr 0x555565a32930 'struct st':'struct st' lvalue Var 0x555565a0b090 's1' 'struct st':'struct st'

AST for the following instruction: s3 = __builtin_intel_fpga_reg(s2); is:

ExprWithCleanups 0x555565a34a78 'struct st' lvalue
`-CXXOperatorCallExpr 0x555565a34a40 'struct st' lvalue '='
  |-ImplicitCastExpr 0x555565a34a28 'struct st &(*)(struct st &&) noexcept' <FunctionToPointerDecay>
  | `-DeclRefExpr 0x555565a331d0 'struct st &(struct st &&) noexcept' lvalue CXXMethod 0x555565a33028 'operator=' 'struct st &(struct st &&) noexcept'
  |-DeclRefExpr 0x555565a32dd8 'struct st':'struct st' lvalue Var 0x555565a32d30 's3' 'struct st':'struct st'
  `-MaterializeTemporaryExpr 0x555565a331b8 'struct st':'struct st' xvalue
    `-CallExpr 0x555565a32e50 'struct st':'struct st'
      |-ImplicitCastExpr 0x555565a32e38 'void (*)(...) noexcept' <BuiltinFnToFnPtr>
      | `-DeclRefExpr 0x555565a32df8 '<builtin fn type>' Function 0x555565a327f0 '__builtin_intel_fpga_reg' 'void (...) noexcept'
      `-DeclRefExpr 0x555565a32e18 'struct st':'struct st' lvalue Var 0x555565a326d0 's2' 'struct st':'struct st'

In second case clang generates temporary variable ref.tmp to store value of s2 and call __builtin_intel_fpga_reg with ref.tmp argument instead of s2. I suppose this is generic code generation pattern in clang, but I don't have much llvm/clang knowledge to eliminate generation of temporary variable in this particular case for __builtin_intel_fpga_reg intrinsic.

}

void unions() {
Expand All @@ -111,45 +102,35 @@ void unions() {
// CHECK-NEXT: [[U2_ASCAST:%.*]] = addrspacecast [[T_UN]]* [[U2]] to [[T_UN]] addrspace(4)*
// CHECK-NEXT: [[REF_TMP2:%.*]] = alloca [[T_UN]], align 4
// CHECK-NEXT: [[REF_TMP2_ASCAST:%.*]] = addrspacecast [[T_UN]]* [[REF_TMP2]] to [[T_UN]] addrspace(4)*
// CHECK-NEXT: [[AGG_TEMP3:%.*]] = alloca [[T_UN]], align 4
// CHECK-NEXT: [[AGG_TEMP3_ASCAST:%.*]] = addrspacecast [[T_UN]]* [[AGG_TEMP3]] to [[T_UN]] addrspace(4)*
union un u1;
union un u2;

u2 = __builtin_intel_fpga_reg(u1);
// CHECK: [[TMP_U1:%.*]] = bitcast [[T_UN]] addrspace(4)* [[AGG_TEMP3_ASCAST]] to i8 addrspace(4)*
// CHECK: [[TMP_U1:%.*]] = bitcast [[T_UN]] addrspace(4)* [[REF_TMP2_ASCAST]] to i8 addrspace(4)*
// CHECK-NEXT: [[TMP_U2:%.*]] = bitcast [[T_UN]] addrspace(4)* [[U1_ASCAST]] to i8 addrspace(4)*
// CHECK-NEXT: call void @llvm.memcpy.p4i8.p4i8.i64(i8 addrspace(4)* align 4 [[TMP_U1]], i8 addrspace(4)* align 4 [[TMP_U2]], i64 4, i1 false)
// CHECK-NEXT: [[TMP_U3:%.*]] = bitcast [[T_UN]] addrspace(4)* [[AGG_TEMP3_ASCAST]] to i8 addrspace(4)*
// CHECK-NEXT: [[TMP_U3:%.*]] = bitcast [[T_UN]] addrspace(4)* [[REF_TMP2_ASCAST]] to i8 addrspace(4)*
// CHECK-NEXT: [[TMP_U4:%.*]] = call i8 addrspace(4)* @llvm.ptr.annotation.p4i8(i8 addrspace(4)* [[TMP_U3]], [[BIFR_STR]]
// CHECK-NEXT: [[TMP_U5:%.*]] = bitcast i8 addrspace(4)* [[TMP_U4]] to [[T_UN]] addrspace(4)*
// CHECK-NEXT: [[TMP_U6:%.*]] = bitcast [[T_UN]] addrspace(4)* [[REF_TMP2_ASCAST]] to i8 addrspace(4)*
// CHECK-NEXT: [[TMP_U7:%.*]] = bitcast [[T_UN]] addrspace(4)* [[TMP_U5]] to i8 addrspace(4)*
// CHECK-NEXT: call void @llvm.memcpy.p4i8.p4i8.i64(i8 addrspace(4)* align 4 [[TMP_U6]], i8 addrspace(4)* align 4 [[TMP_U7]], i64 8, i1 false)
// CHECK-NEXT: [[TMP_U8:%.*]] = bitcast [[T_UN]] addrspace(4)* [[U2_ASCAST]] to i8 addrspace(4)*
// CHECK-NEXT: [[TMP_U9:%.*]] = bitcast [[T_UN]] addrspace(4)* [[REF_TMP2_ASCAST]] to i8 addrspace(4)*
// CHECK-NEXT: call void @llvm.memcpy.p4i8.p4i8.i64(i8 addrspace(4)* align 4 [[TMP_U8]], i8 addrspace(4)* align 4 [[TMP_U9]], i64 4, i1 false)
// CHECK-NEXT: [[TMP_U6:%.*]] = bitcast [[T_UN]] addrspace(4)* [[U2_ASCAST]] to i8 addrspace(4)*
// CHECK-NEXT: [[TMP_U7:%.*]] = bitcast [[T_UN]] addrspace(4)* [[REF_TMP2_ASCAST]] to i8 addrspace(4)*
// CHECK-NEXT: call void @llvm.memcpy.p4i8.p4i8.i64(i8 addrspace(4)* align 4 [[TMP_U6]], i8 addrspace(4)* align 4 [[TMP_U7]], i64 4, i1 false)
}

void classes() {
// CHECK: [[CA:%.*]] = alloca [[T_CL:%.*]], align 4
// CHECK-NEXT: [[CA_ASCAST:%.*]] = addrspacecast [[T_CL]]* [[CA]] to [[T_CL]] addrspace(4)*
// CHECK-NEXT: [[CB:%.*]] = alloca [[T_CL]], align 4
// CHECK-NEXT: [[CB_ASCAST:%.*]] = addrspacecast [[T_CL]]* [[CB]] to [[T_CL]] addrspace(4)*
// CHECK-NEXT: [[AGG_TEMP5:%.*]] = alloca [[T_CL]], align 4
// CHECK-NEXT: [[AGG_TEMP5_ASCAST:%.*]] = addrspacecast [[T_CL]]*
A ca(213);

A cb = __builtin_intel_fpga_reg(ca);
// CHECK: [[TMP_C1:%.*]] = bitcast [[T_CL]] addrspace(4)* [[AGG_TEMP5_ASCAST]] to i8 addrspace(4)*
// CHECK: [[TMP_C1:%.*]] = bitcast [[T_CL]] addrspace(4)* [[CB_ASCAST]] to i8 addrspace(4)*
// CHECK-NEXT: [[TMP_C2:%.*]] = bitcast [[T_CL]] addrspace(4)* [[CA_ASCAST]] to i8 addrspace(4)*
// CHECK-NEXT: call void @llvm.memcpy.p4i8.p4i8.i64(i8 addrspace(4)* align 4 [[TMP_C1]], i8 addrspace(4)* align 4 [[TMP_C2]], i64 4, i1 false)
// CHECK-NEXT: [[TMP_C3:%.*]] = bitcast [[T_CL]] addrspace(4)* [[AGG_TEMP5_ASCAST]] to i8 addrspace(4)*
// CHECK-NEXT: [[TMP_C3:%.*]] = bitcast [[T_CL]] addrspace(4)* [[CB_ASCAST]] to i8 addrspace(4)*
// CHECK-NEXT: [[TMP_C4:%.*]] = call i8 addrspace(4)* @llvm.ptr.annotation.p4i8(i8 addrspace(4)* [[TMP_C3]], [[BIFR_STR]]
// CHECK-NEXT: [[TMP_C5:%.*]] = bitcast i8 addrspace(4)* [[TMP_C4]] to [[T_CL]] addrspace(4)*
// CHECK-NEXT: [[TMP_C6:%.*]] = bitcast [[T_CL]] addrspace(4)* [[CB_ASCAST]] to i8 addrspace(4)*
// CHECK-NEXT: [[TMP_C7:%.*]] = bitcast [[T_CL]] addrspace(4)* [[TMP_C5]] to i8 addrspace(4)*
// CHECK-NEXT: call void @llvm.memcpy.p4i8.p4i8.i64(i8 addrspace(4)* align 4 [[TMP_C6]], i8 addrspace(4)* align 4 [[TMP_C7]], i64 8, i1 false)
}

void pointers() {
Expand Down