Skip to content

Commit fb33af0

Browse files
authored
[NVPTX] Remove nvvm.ldg.global.* intrinsics (#112834)
Remove these intrinsics which can be better represented by load instructions with `!invariant.load` metadata: - llvm.nvvm.ldg.global.i - llvm.nvvm.ldg.global.f - llvm.nvvm.ldg.global.p
1 parent 7b3da7b commit fb33af0

File tree

11 files changed

+197
-248
lines changed

11 files changed

+197
-248
lines changed

clang/lib/CodeGen/CGBuiltin.cpp

Lines changed: 30 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -20492,8 +20492,8 @@ static NVPTXMmaInfo getNVPTXMmaInfo(unsigned BuiltinID) {
2049220492
#undef MMA_VARIANTS_B1_XOR
2049320493
}
2049420494

20495-
static Value *MakeLdgLdu(unsigned IntrinsicID, CodeGenFunction &CGF,
20496-
const CallExpr *E) {
20495+
static Value *MakeLdu(unsigned IntrinsicID, CodeGenFunction &CGF,
20496+
const CallExpr *E) {
2049720497
Value *Ptr = CGF.EmitScalarExpr(E->getArg(0));
2049820498
QualType ArgType = E->getArg(0)->getType();
2049920499
clang::CharUnits Align = CGF.CGM.getNaturalPointeeTypeAlignment(ArgType);
@@ -20503,6 +20503,21 @@ static Value *MakeLdgLdu(unsigned IntrinsicID, CodeGenFunction &CGF,
2050320503
{Ptr, ConstantInt::get(CGF.Builder.getInt32Ty(), Align.getQuantity())});
2050420504
}
2050520505

20506+
static Value *MakeLdg(CodeGenFunction &CGF, const CallExpr *E) {
20507+
Value *Ptr = CGF.EmitScalarExpr(E->getArg(0));
20508+
QualType ArgType = E->getArg(0)->getType();
20509+
clang::CharUnits AlignV = CGF.CGM.getNaturalPointeeTypeAlignment(ArgType);
20510+
llvm::Type *ElemTy = CGF.ConvertTypeForMem(ArgType->getPointeeType());
20511+
20512+
// Use addrspace(1) for NVPTX ADDRESS_SPACE_GLOBAL
20513+
auto *ASC = CGF.Builder.CreateAddrSpaceCast(Ptr, CGF.Builder.getPtrTy(1));
20514+
auto *LD = CGF.Builder.CreateAlignedLoad(ElemTy, ASC, AlignV.getAsAlign());
20515+
MDNode *MD = MDNode::get(CGF.Builder.getContext(), {});
20516+
LD->setMetadata(LLVMContext::MD_invariant_load, MD);
20517+
20518+
return LD;
20519+
}
20520+
2050620521
static Value *MakeScopedAtomic(unsigned IntrinsicID, CodeGenFunction &CGF,
2050720522
const CallExpr *E) {
2050820523
Value *Ptr = CGF.EmitScalarExpr(E->getArg(0));
@@ -20536,9 +20551,11 @@ static Value *MakeHalfType(unsigned IntrinsicID, unsigned BuiltinID,
2053620551
return nullptr;
2053720552
}
2053820553

20539-
if (IntrinsicID == Intrinsic::nvvm_ldg_global_f ||
20540-
IntrinsicID == Intrinsic::nvvm_ldu_global_f)
20541-
return MakeLdgLdu(IntrinsicID, CGF, E);
20554+
if (BuiltinID == NVPTX::BI__nvvm_ldg_h || BuiltinID == NVPTX::BI__nvvm_ldg_h2)
20555+
return MakeLdg(CGF, E);
20556+
20557+
if (IntrinsicID == Intrinsic::nvvm_ldu_global_f)
20558+
return MakeLdu(IntrinsicID, CGF, E);
2054220559

2054320560
SmallVector<Value *, 16> Args;
2054420561
auto *F = CGF.CGM.getIntrinsic(IntrinsicID);
@@ -20675,16 +20692,15 @@ Value *CodeGenFunction::EmitNVPTXBuiltinExpr(unsigned BuiltinID,
2067520692
case NVPTX::BI__nvvm_ldg_ul2:
2067620693
case NVPTX::BI__nvvm_ldg_ull:
2067720694
case NVPTX::BI__nvvm_ldg_ull2:
20678-
// PTX Interoperability section 2.2: "For a vector with an even number of
20679-
// elements, its alignment is set to number of elements times the alignment
20680-
// of its member: n*alignof(t)."
20681-
return MakeLdgLdu(Intrinsic::nvvm_ldg_global_i, *this, E);
2068220695
case NVPTX::BI__nvvm_ldg_f:
2068320696
case NVPTX::BI__nvvm_ldg_f2:
2068420697
case NVPTX::BI__nvvm_ldg_f4:
2068520698
case NVPTX::BI__nvvm_ldg_d:
2068620699
case NVPTX::BI__nvvm_ldg_d2:
20687-
return MakeLdgLdu(Intrinsic::nvvm_ldg_global_f, *this, E);
20700+
// PTX Interoperability section 2.2: "For a vector with an even number of
20701+
// elements, its alignment is set to number of elements times the alignment
20702+
// of its member: n*alignof(t)."
20703+
return MakeLdg(*this, E);
2068820704

2068920705
case NVPTX::BI__nvvm_ldu_c:
2069020706
case NVPTX::BI__nvvm_ldu_sc:
@@ -20715,13 +20731,13 @@ Value *CodeGenFunction::EmitNVPTXBuiltinExpr(unsigned BuiltinID,
2071520731
case NVPTX::BI__nvvm_ldu_ul2:
2071620732
case NVPTX::BI__nvvm_ldu_ull:
2071720733
case NVPTX::BI__nvvm_ldu_ull2:
20718-
return MakeLdgLdu(Intrinsic::nvvm_ldu_global_i, *this, E);
20734+
return MakeLdu(Intrinsic::nvvm_ldu_global_i, *this, E);
2071920735
case NVPTX::BI__nvvm_ldu_f:
2072020736
case NVPTX::BI__nvvm_ldu_f2:
2072120737
case NVPTX::BI__nvvm_ldu_f4:
2072220738
case NVPTX::BI__nvvm_ldu_d:
2072320739
case NVPTX::BI__nvvm_ldu_d2:
20724-
return MakeLdgLdu(Intrinsic::nvvm_ldu_global_f, *this, E);
20740+
return MakeLdu(Intrinsic::nvvm_ldu_global_f, *this, E);
2072520741

2072620742
case NVPTX::BI__nvvm_atom_cta_add_gen_i:
2072720743
case NVPTX::BI__nvvm_atom_cta_add_gen_l:
@@ -21195,14 +21211,11 @@ Value *CodeGenFunction::EmitNVPTXBuiltinExpr(unsigned BuiltinID,
2119521211
return MakeHalfType(Intrinsic::nvvm_fmin_xorsign_abs_f16x2, BuiltinID, E,
2119621212
*this);
2119721213
case NVPTX::BI__nvvm_ldg_h:
21198-
return MakeHalfType(Intrinsic::nvvm_ldg_global_f, BuiltinID, E, *this);
2119921214
case NVPTX::BI__nvvm_ldg_h2:
21200-
return MakeHalfType(Intrinsic::nvvm_ldg_global_f, BuiltinID, E, *this);
21215+
return MakeHalfType(Intrinsic::not_intrinsic, BuiltinID, E, *this);
2120121216
case NVPTX::BI__nvvm_ldu_h:
21217+
case NVPTX::BI__nvvm_ldu_h2:
2120221218
return MakeHalfType(Intrinsic::nvvm_ldu_global_f, BuiltinID, E, *this);
21203-
case NVPTX::BI__nvvm_ldu_h2: {
21204-
return MakeHalfType(Intrinsic::nvvm_ldu_global_f, BuiltinID, E, *this);
21205-
}
2120621219
case NVPTX::BI__nvvm_cp_async_ca_shared_global_4:
2120721220
return MakeCpAsync(Intrinsic::nvvm_cp_async_ca_shared_global_4,
2120821221
Intrinsic::nvvm_cp_async_ca_shared_global_4_s, *this, E,

clang/test/CodeGen/builtins-nvptx-native-half-type-native.c

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -52,8 +52,8 @@ typedef __fp16 __fp16v2 __attribute__((ext_vector_type(2)));
5252
// CHECK: call <2 x half> @llvm.nvvm.fmax.ftz.xorsign.abs.f16x2(<2 x half> {{.*}}, <2 x half> {{.*}})
5353
// CHECK: call <2 x half> @llvm.nvvm.fmax.nan.xorsign.abs.f16x2(<2 x half> {{.*}}, <2 x half> {{.*}})
5454
// CHECK: call <2 x half> @llvm.nvvm.fmax.ftz.nan.xorsign.abs.f16x2(<2 x half> {{.*}}, <2 x half> {{.*}})
55-
// CHECK: call half @llvm.nvvm.ldg.global.f.f16.p0(ptr {{.*}}, i32 2)
56-
// CHECK: call <2 x half> @llvm.nvvm.ldg.global.f.v2f16.p0(ptr {{.*}}, i32 4)
55+
// CHECK: load half, ptr addrspace(1) {{.*}}, align 2, !invariant.load
56+
// CHECK: load <2 x half>, ptr addrspace(1) {{.*}}, align 4, !invariant.load
5757
// CHECK: call half @llvm.nvvm.ldu.global.f.f16.p0(ptr {{.*}}, i32 2)
5858
// CHECK: call <2 x half> @llvm.nvvm.ldu.global.f.v2f16.p0(ptr {{.*}}, i32 4)
5959
__device__ void nvvm_native_half_types(void *a, void*b, void*c, __fp16* out) {

clang/test/CodeGen/builtins-nvptx-native-half-type.c

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -177,9 +177,9 @@ typedef __fp16 __fp16v2 __attribute__((ext_vector_type(2)));
177177

178178
// CHECK-LABEL: nvvm_ldg_native_half_types
179179
__device__ void nvvm_ldg_native_half_types(const void *p) {
180-
// CHECK: call half @llvm.nvvm.ldg.global.f.f16.p0
180+
// CHECK: load half, ptr addrspace(1) {{.*}}, align 2, !invariant.load
181181
__nvvm_ldg_h((const __fp16 *)p);
182-
// CHECK: call <2 x half> @llvm.nvvm.ldg.global.f.v2f16.p0
182+
// CHECK: load <2 x half>, ptr addrspace(1) {{.*}}, align 4, !invariant.load
183183
__nvvm_ldg_h2((const __fp16v2 *)p);
184184
}
185185

clang/test/CodeGen/builtins-nvptx.c

Lines changed: 36 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -598,33 +598,33 @@ __device__ void nvvm_atom(float *fp, float f, double *dfp, double df,
598598

599599
// CHECK-LABEL: nvvm_ldg
600600
__device__ void nvvm_ldg(const void *p) {
601-
// CHECK: call i8 @llvm.nvvm.ldg.global.i.i8.p0(ptr {{%[0-9]+}}, i32 1)
602-
// CHECK: call i8 @llvm.nvvm.ldg.global.i.i8.p0(ptr {{%[0-9]+}}, i32 1)
603-
// CHECK: call i8 @llvm.nvvm.ldg.global.i.i8.p0(ptr {{%[0-9]+}}, i32 1)
601+
// CHECK: load i8, ptr addrspace(1) {{%[0-9]+}}, align 1, !invariant.load
602+
// CHECK: load i8, ptr addrspace(1) {{%[0-9]+}}, align 1, !invariant.load
603+
// CHECK: load i8, ptr addrspace(1) {{%[0-9]+}}, align 1, !invariant.load
604604
__nvvm_ldg_c((const char *)p);
605605
__nvvm_ldg_uc((const unsigned char *)p);
606606
__nvvm_ldg_sc((const signed char *)p);
607607

608-
// CHECK: call i16 @llvm.nvvm.ldg.global.i.i16.p0(ptr {{%[0-9]+}}, i32 2)
609-
// CHECK: call i16 @llvm.nvvm.ldg.global.i.i16.p0(ptr {{%[0-9]+}}, i32 2)
608+
// CHECK: load i16, ptr addrspace(1) {{%[0-9]+}}, align 2, !invariant.load
609+
// CHECK: load i16, ptr addrspace(1) {{%[0-9]+}}, align 2, !invariant.load
610610
__nvvm_ldg_s((const short *)p);
611611
__nvvm_ldg_us((const unsigned short *)p);
612612

613-
// CHECK: call i32 @llvm.nvvm.ldg.global.i.i32.p0(ptr {{%[0-9]+}}, i32 4)
614-
// CHECK: call i32 @llvm.nvvm.ldg.global.i.i32.p0(ptr {{%[0-9]+}}, i32 4)
613+
// CHECK: load i32, ptr addrspace(1) {{%[0-9]+}}, align 4, !invariant.load
614+
// CHECK: load i32, ptr addrspace(1) {{%[0-9]+}}, align 4, !invariant.load
615615
__nvvm_ldg_i((const int *)p);
616616
__nvvm_ldg_ui((const unsigned int *)p);
617617

618-
// LP32: call i32 @llvm.nvvm.ldg.global.i.i32.p0(ptr {{%[0-9]+}}, i32 4)
619-
// LP32: call i32 @llvm.nvvm.ldg.global.i.i32.p0(ptr {{%[0-9]+}}, i32 4)
620-
// LP64: call i64 @llvm.nvvm.ldg.global.i.i64.p0(ptr {{%[0-9]+}}, i32 8)
621-
// LP64: call i64 @llvm.nvvm.ldg.global.i.i64.p0(ptr {{%[0-9]+}}, i32 8)
618+
// LP32: load i32, ptr addrspace(1) {{%[0-9]+}}, align 4, !invariant.load
619+
// LP32: load i32, ptr addrspace(1) {{%[0-9]+}}, align 4, !invariant.load
620+
// LP64: load i64, ptr addrspace(1) {{%[0-9]+}}, align 8, !invariant.load
621+
// LP64: load i64, ptr addrspace(1) {{%[0-9]+}}, align 8, !invariant.load
622622
__nvvm_ldg_l((const long *)p);
623623
__nvvm_ldg_ul((const unsigned long *)p);
624624

625-
// CHECK: call float @llvm.nvvm.ldg.global.f.f32.p0(ptr {{%[0-9]+}}, i32 4)
625+
// CHECK: load float, ptr addrspace(1) {{%[0-9]+}}, align 4, !invariant.load
626626
__nvvm_ldg_f((const float *)p);
627-
// CHECK: call double @llvm.nvvm.ldg.global.f.f64.p0(ptr {{%[0-9]+}}, i32 8)
627+
// CHECK: load double, ptr addrspace(1) {{%[0-9]+}}, align 8, !invariant.load
628628
__nvvm_ldg_d((const double *)p);
629629

630630
// In practice, the pointers we pass to __ldg will be aligned as appropriate
@@ -636,79 +636,79 @@ __device__ void nvvm_ldg(const void *p) {
636636
// elements, its alignment is set to number of elements times the alignment of
637637
// its member: n*alignof(t)."
638638

639-
// CHECK: call <2 x i8> @llvm.nvvm.ldg.global.i.v2i8.p0(ptr {{%[0-9]+}}, i32 2)
640-
// CHECK: call <2 x i8> @llvm.nvvm.ldg.global.i.v2i8.p0(ptr {{%[0-9]+}}, i32 2)
641-
// CHECK: call <2 x i8> @llvm.nvvm.ldg.global.i.v2i8.p0(ptr {{%[0-9]+}}, i32 2)
639+
// CHECK: load <2 x i8>, ptr addrspace(1) {{%[0-9]+}}, align 2, !invariant.load
640+
// CHECK: load <2 x i8>, ptr addrspace(1) {{%[0-9]+}}, align 2, !invariant.load
641+
// CHECK: load <2 x i8>, ptr addrspace(1) {{%[0-9]+}}, align 2, !invariant.load
642642
typedef char char2 __attribute__((ext_vector_type(2)));
643643
typedef unsigned char uchar2 __attribute__((ext_vector_type(2)));
644644
typedef signed char schar2 __attribute__((ext_vector_type(2)));
645645
__nvvm_ldg_c2((const char2 *)p);
646646
__nvvm_ldg_uc2((const uchar2 *)p);
647647
__nvvm_ldg_sc2((const schar2 *)p);
648648

649-
// CHECK: call <4 x i8> @llvm.nvvm.ldg.global.i.v4i8.p0(ptr {{%[0-9]+}}, i32 4)
650-
// CHECK: call <4 x i8> @llvm.nvvm.ldg.global.i.v4i8.p0(ptr {{%[0-9]+}}, i32 4)
651-
// CHECK: call <4 x i8> @llvm.nvvm.ldg.global.i.v4i8.p0(ptr {{%[0-9]+}}, i32 4)
649+
// CHECK: load <4 x i8>, ptr addrspace(1) {{%[0-9]+}}, align 4, !invariant.load
650+
// CHECK: load <4 x i8>, ptr addrspace(1) {{%[0-9]+}}, align 4, !invariant.load
651+
// CHECK: load <4 x i8>, ptr addrspace(1) {{%[0-9]+}}, align 4, !invariant.load
652652
typedef char char4 __attribute__((ext_vector_type(4)));
653653
typedef unsigned char uchar4 __attribute__((ext_vector_type(4)));
654654
typedef signed char schar4 __attribute__((ext_vector_type(4)));
655655
__nvvm_ldg_c4((const char4 *)p);
656656
__nvvm_ldg_uc4((const uchar4 *)p);
657657
__nvvm_ldg_sc4((const schar4 *)p);
658658

659-
// CHECK: call <2 x i16> @llvm.nvvm.ldg.global.i.v2i16.p0(ptr {{%[0-9]+}}, i32 4)
660-
// CHECK: call <2 x i16> @llvm.nvvm.ldg.global.i.v2i16.p0(ptr {{%[0-9]+}}, i32 4)
659+
// CHECK: load <2 x i16>, ptr addrspace(1) {{%[0-9]+}}, align 4, !invariant.load
660+
// CHECK: load <2 x i16>, ptr addrspace(1) {{%[0-9]+}}, align 4, !invariant.load
661661
typedef short short2 __attribute__((ext_vector_type(2)));
662662
typedef unsigned short ushort2 __attribute__((ext_vector_type(2)));
663663
__nvvm_ldg_s2((const short2 *)p);
664664
__nvvm_ldg_us2((const ushort2 *)p);
665665

666-
// CHECK: call <4 x i16> @llvm.nvvm.ldg.global.i.v4i16.p0(ptr {{%[0-9]+}}, i32 8)
667-
// CHECK: call <4 x i16> @llvm.nvvm.ldg.global.i.v4i16.p0(ptr {{%[0-9]+}}, i32 8)
666+
// CHECK: load <4 x i16>, ptr addrspace(1) {{%[0-9]+}}, align 8, !invariant.load
667+
// CHECK: load <4 x i16>, ptr addrspace(1) {{%[0-9]+}}, align 8, !invariant.load
668668
typedef short short4 __attribute__((ext_vector_type(4)));
669669
typedef unsigned short ushort4 __attribute__((ext_vector_type(4)));
670670
__nvvm_ldg_s4((const short4 *)p);
671671
__nvvm_ldg_us4((const ushort4 *)p);
672672

673-
// CHECK: call <2 x i32> @llvm.nvvm.ldg.global.i.v2i32.p0(ptr {{%[0-9]+}}, i32 8)
674-
// CHECK: call <2 x i32> @llvm.nvvm.ldg.global.i.v2i32.p0(ptr {{%[0-9]+}}, i32 8)
673+
// CHECK: load <2 x i32>, ptr addrspace(1) {{%[0-9]+}}, align 8, !invariant.load
674+
// CHECK: load <2 x i32>, ptr addrspace(1) {{%[0-9]+}}, align 8, !invariant.load
675675
typedef int int2 __attribute__((ext_vector_type(2)));
676676
typedef unsigned int uint2 __attribute__((ext_vector_type(2)));
677677
__nvvm_ldg_i2((const int2 *)p);
678678
__nvvm_ldg_ui2((const uint2 *)p);
679679

680-
// CHECK: call <4 x i32> @llvm.nvvm.ldg.global.i.v4i32.p0(ptr {{%[0-9]+}}, i32 16)
681-
// CHECK: call <4 x i32> @llvm.nvvm.ldg.global.i.v4i32.p0(ptr {{%[0-9]+}}, i32 16)
680+
// CHECK: load <4 x i32>, ptr addrspace(1) {{%[0-9]+}}, align 16, !invariant.load
681+
// CHECK: load <4 x i32>, ptr addrspace(1) {{%[0-9]+}}, align 16, !invariant.load
682682
typedef int int4 __attribute__((ext_vector_type(4)));
683683
typedef unsigned int uint4 __attribute__((ext_vector_type(4)));
684684
__nvvm_ldg_i4((const int4 *)p);
685685
__nvvm_ldg_ui4((const uint4 *)p);
686686

687-
// LP32: call <2 x i32> @llvm.nvvm.ldg.global.i.v2i32.p0(ptr {{%[0-9]+}}, i32 8)
688-
// LP32: call <2 x i32> @llvm.nvvm.ldg.global.i.v2i32.p0(ptr {{%[0-9]+}}, i32 8)
689-
// LP64: call <2 x i64> @llvm.nvvm.ldg.global.i.v2i64.p0(ptr {{%[0-9]+}}, i32 16)
690-
// LP64: call <2 x i64> @llvm.nvvm.ldg.global.i.v2i64.p0(ptr {{%[0-9]+}}, i32 16)
687+
// LP32: load <2 x i32>, ptr addrspace(1) {{%[0-9]+}}, align 8, !invariant.load
688+
// LP32: load <2 x i32>, ptr addrspace(1) {{%[0-9]+}}, align 8, !invariant.load
689+
// LP64: load <2 x i64>, ptr addrspace(1) {{%[0-9]+}}, align 16, !invariant.load
690+
// LP64: load <2 x i64>, ptr addrspace(1) {{%[0-9]+}}, align 16, !invariant.load
691691
typedef long long2 __attribute__((ext_vector_type(2)));
692692
typedef unsigned long ulong2 __attribute__((ext_vector_type(2)));
693693
__nvvm_ldg_l2((const long2 *)p);
694694
__nvvm_ldg_ul2((const ulong2 *)p);
695695

696-
// CHECK: call <2 x i64> @llvm.nvvm.ldg.global.i.v2i64.p0(ptr {{%[0-9]+}}, i32 16)
697-
// CHECK: call <2 x i64> @llvm.nvvm.ldg.global.i.v2i64.p0(ptr {{%[0-9]+}}, i32 16)
696+
// CHECK: load <2 x i64>, ptr addrspace(1) {{%[0-9]+}}, align 16, !invariant.load
697+
// CHECK: load <2 x i64>, ptr addrspace(1) {{%[0-9]+}}, align 16, !invariant.load
698698
typedef long long longlong2 __attribute__((ext_vector_type(2)));
699699
typedef unsigned long long ulonglong2 __attribute__((ext_vector_type(2)));
700700
__nvvm_ldg_ll2((const longlong2 *)p);
701701
__nvvm_ldg_ull2((const ulonglong2 *)p);
702702

703-
// CHECK: call <2 x float> @llvm.nvvm.ldg.global.f.v2f32.p0(ptr {{%[0-9]+}}, i32 8)
703+
// CHECK: load <2 x float>, ptr addrspace(1) {{%[0-9]+}}, align 8, !invariant.load
704704
typedef float float2 __attribute__((ext_vector_type(2)));
705705
__nvvm_ldg_f2((const float2 *)p);
706706

707-
// CHECK: call <4 x float> @llvm.nvvm.ldg.global.f.v4f32.p0(ptr {{%[0-9]+}}, i32 16)
707+
// CHECK: load <4 x float>, ptr addrspace(1) {{%[0-9]+}}, align 16, !invariant.load
708708
typedef float float4 __attribute__((ext_vector_type(4)));
709709
__nvvm_ldg_f4((const float4 *)p);
710710

711-
// CHECK: call <2 x double> @llvm.nvvm.ldg.global.f.v2f64.p0(ptr {{%[0-9]+}}, i32 16)
711+
// CHECK: load <2 x double>, ptr addrspace(1) {{%[0-9]+}}, align 16, !invariant.load
712712
typedef double double2 __attribute__((ext_vector_type(2)));
713713
__nvvm_ldg_d2((const double2 *)p);
714714
}

llvm/docs/ReleaseNotes.md

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -88,7 +88,14 @@ Changes to the LLVM IR
8888
* `llvm.nvvm.ptr.shared.to.gen`
8989
* `llvm.nvvm.ptr.constant.to.gen`
9090
* `llvm.nvvm.ptr.local.to.gen`
91-
91+
92+
* Remove the following intrinsics which can be relaced with a load from
93+
addrspace(1) with an !invariant.load metadata
94+
95+
* `llvm.nvvm.ldg.global.i`
96+
* `llvm.nvvm.ldg.global.f`
97+
* `llvm.nvvm.ldg.global.p`
98+
9299
* Operand bundle values can now be metadata strings.
93100

94101
Changes to LLVM infrastructure

llvm/include/llvm/IR/IntrinsicsNVVM.td

Lines changed: 3 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,9 @@
4242
// * llvm.nvvm.ptr.shared.to.gen --> ibid.
4343
// * llvm.nvvm.ptr.constant.to.gen --> ibid.
4444
// * llvm.nvvm.ptr.local.to.gen --> ibid.
45+
// * llvm.nvvm.ldg.global.i --> load addrspace(1) !load.invariant
46+
// * llvm.nvvm.ldg.global.f --> ibid.
47+
// * llvm.nvvm.ldg.global.p --> ibid.
4548

4649
def llvm_global_ptr_ty : LLVMQualPointerType<1>; // (global)ptr
4750
def llvm_shared_ptr_ty : LLVMQualPointerType<3>; // (shared)ptr
@@ -1605,21 +1608,6 @@ def int_nvvm_ldu_global_p : Intrinsic<[llvm_anyptr_ty],
16051608
[IntrReadMem, IntrArgMemOnly, IntrNoCallback, IntrWillReturn, NoCapture<ArgIndex<0>>],
16061609
"llvm.nvvm.ldu.global.p">;
16071610

1608-
// Generated within nvvm. Use for ldg on sm_35 or later. Second arg is the
1609-
// pointer's alignment.
1610-
def int_nvvm_ldg_global_i : Intrinsic<[llvm_anyint_ty],
1611-
[llvm_anyptr_ty, llvm_i32_ty],
1612-
[IntrReadMem, IntrArgMemOnly, IntrNoCallback, IntrWillReturn, NoCapture<ArgIndex<0>>],
1613-
"llvm.nvvm.ldg.global.i">;
1614-
def int_nvvm_ldg_global_f : Intrinsic<[llvm_anyfloat_ty],
1615-
[llvm_anyptr_ty, llvm_i32_ty],
1616-
[IntrReadMem, IntrArgMemOnly, IntrNoCallback, IntrWillReturn, NoCapture<ArgIndex<0>>],
1617-
"llvm.nvvm.ldg.global.f">;
1618-
def int_nvvm_ldg_global_p : Intrinsic<[llvm_anyptr_ty],
1619-
[llvm_anyptr_ty, llvm_i32_ty],
1620-
[IntrReadMem, IntrArgMemOnly, IntrNoCallback, IntrWillReturn, NoCapture<ArgIndex<0>>],
1621-
"llvm.nvvm.ldg.global.p">;
1622-
16231611
// Used in nvvm internally to help address space opt and ptx code generation
16241612
// This is for params that are passed to kernel functions by pointer by-val.
16251613
def int_nvvm_ptr_gen_to_param: Intrinsic<[llvm_anyptr_ty],

llvm/lib/IR/AutoUpgrade.cpp

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@
3737
#include "llvm/IR/MDBuilder.h"
3838
#include "llvm/IR/Metadata.h"
3939
#include "llvm/IR/Module.h"
40+
#include "llvm/IR/Value.h"
4041
#include "llvm/IR/Verifier.h"
4142
#include "llvm/Support/AMDGPUAddrSpace.h"
4243
#include "llvm/Support/CommandLine.h"
@@ -1301,6 +1302,10 @@ static bool upgradeIntrinsicFunction1(Function *F, Function *&NewFn,
13011302
(Name.consume_front("local") || Name.consume_front("shared") ||
13021303
Name.consume_front("global") || Name.consume_front("constant")) &&
13031304
Name.starts_with(".to.gen");
1305+
else if (Name.consume_front("ldg.global."))
1306+
// nvvm.ldg.global.{i,p,f}
1307+
Expand = (Name.starts_with("i.") || Name.starts_with("f.") ||
1308+
Name.starts_with("p."));
13041309
else
13051310
Expand = false;
13061311

@@ -2363,6 +2368,15 @@ static Value *upgradeNVVMIntrinsicCall(StringRef Name, CallBase *CI,
23632368
Name.consume_front("constant")) &&
23642369
Name.starts_with(".to.gen"))) {
23652370
Rep = Builder.CreateAddrSpaceCast(CI->getArgOperand(0), CI->getType());
2371+
} else if (Name.consume_front("ldg.global")) {
2372+
Value *Ptr = CI->getArgOperand(0);
2373+
Align PtrAlign = cast<ConstantInt>(CI->getArgOperand(1))->getAlignValue();
2374+
// Use addrspace(1) for NVPTX ADDRESS_SPACE_GLOBAL
2375+
Value *ASC = Builder.CreateAddrSpaceCast(Ptr, Builder.getPtrTy(1));
2376+
Instruction *LD = Builder.CreateAlignedLoad(CI->getType(), ASC, PtrAlign);
2377+
MDNode *MD = MDNode::get(Builder.getContext(), {});
2378+
LD->setMetadata(LLVMContext::MD_invariant_load, MD);
2379+
return LD;
23662380
} else {
23672381
Intrinsic::ID IID = shouldUpgradeNVPTXBF16Intrinsic(Name);
23682382
if (IID != Intrinsic::not_intrinsic &&

0 commit comments

Comments
 (0)