Skip to content

Commit 065760a

Browse files
committed
[AMDGPU] Use a target feature to enable __builtin_amdgcn_global_load_lds on gfx9/10 (llvm#133055)
This patch introduces the `vmem-to-lds-load-insts` target feature, which can be used to enable builtins `__builtin_amdgcn_global_load_lds` and `__builtin_amdgcn_raw_ptr_buffer_load_lds` on platforms which have this feature. This feature is only available on gfx9/10. A limitation of using a common target feature for both builtins is that we could have made `__builtin_amdgcn_raw_ptr_buffer_load_lds` available on gfx6,7,8.
1 parent a84c256 commit 065760a

File tree

14 files changed

+97
-20
lines changed

14 files changed

+97
-20
lines changed

clang/include/clang/Basic/BuiltinsAMDGPU.def

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -254,7 +254,7 @@ TARGET_BUILTIN(__builtin_amdgcn_flat_atomic_fadd_v2bf16, "V2sV2s*0V2s", "t", "at
254254
TARGET_BUILTIN(__builtin_amdgcn_global_atomic_fadd_v2bf16, "V2sV2s*1V2s", "t", "atomic-global-pk-add-bf16-inst")
255255
TARGET_BUILTIN(__builtin_amdgcn_ds_atomic_fadd_v2bf16, "V2sV2s*3V2s", "t", "atomic-ds-pk-add-16-insts")
256256
TARGET_BUILTIN(__builtin_amdgcn_ds_atomic_fadd_v2f16, "V2hV2h*3V2h", "t", "atomic-ds-pk-add-16-insts")
257-
TARGET_BUILTIN(__builtin_amdgcn_global_load_lds, "vv*1v*3IUiIiIUi", "t", "gfx940-insts")
257+
TARGET_BUILTIN(__builtin_amdgcn_global_load_lds, "vv*1v*3IUiIiIUi", "t", "vmem-to-lds-load-insts")
258258

259259
//===----------------------------------------------------------------------===//
260260
// Deep learning builtins.

clang/lib/Basic/Targets/AMDGPU.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -252,7 +252,7 @@ AMDGPUTargetInfo::AMDGPUTargetInfo(const llvm::Triple &Triple,
252252

253253
MaxAtomicPromoteWidth = MaxAtomicInlineWidth = 64;
254254
CUMode = !(GPUFeatures & llvm::AMDGPU::FEATURE_WGP);
255-
for (auto F : {"image-insts", "gws"})
255+
for (auto F : {"image-insts", "gws", "vmem-to-lds-load-insts"})
256256
ReadOnlyFeatures.insert(F);
257257
HalfArgsAndReturns = true;
258258
}

clang/test/CodeGen/link-builtin-bitcode.c

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,6 @@ int bar() { return no_attr() + attr_in_target() + attr_not_in_target() + attr_in
4444
// CHECK-SAME: () #[[ATTR_INCOMPATIBLE:[0-9]+]] {
4545

4646
// CHECK: attributes #[[ATTR_BAR]] = { {{.*}} "target-cpu"="gfx90a" "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-fadd-rtn-insts,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+mai-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64" }
47-
// CHECK: attributes #[[ATTR_COMPATIBLE]] = { {{.*}} "target-cpu"="gfx90a" "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-fadd-rtn-insts,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gws,+image-insts,+mai-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64" }
48-
// CHECK: attributes #[[ATTR_EXTEND]] = { {{.*}} "target-cpu"="gfx90a" "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-fadd-rtn-insts,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+extended-image-insts,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gws,+image-insts,+mai-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64" }
49-
// CHECK: attributes #[[ATTR_INCOMPATIBLE]] = { {{.*}} "target-cpu"="gfx90a" "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-fadd-rtn-insts,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+gfx8-insts,+gfx90a-insts,+gws,+image-insts,+mai-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64,-gfx9-insts" }
47+
// CHECK: attributes #[[ATTR_COMPATIBLE]] = { {{.*}} "target-cpu"="gfx90a" "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-fadd-rtn-insts,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gws,+image-insts,+mai-insts,+s-memrealtime,+s-memtime-inst,+vmem-to-lds-load-insts,+wavefrontsize64" }
48+
// CHECK: attributes #[[ATTR_EXTEND]] = { {{.*}} "target-cpu"="gfx90a" "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-fadd-rtn-insts,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+extended-image-insts,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gws,+image-insts,+mai-insts,+s-memrealtime,+s-memtime-inst,+vmem-to-lds-load-insts,+wavefrontsize64" }
49+
// CHECK: attributes #[[ATTR_INCOMPATIBLE]] = { {{.*}} "target-cpu"="gfx90a" "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-fadd-rtn-insts,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+gfx8-insts,+gfx90a-insts,+gws,+image-insts,+mai-insts,+s-memrealtime,+s-memtime-inst,+vmem-to-lds-load-insts,+wavefrontsize64,-gfx9-insts" }

clang/test/CodeGenCXX/dynamic-cast-address-space.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -112,9 +112,9 @@ const B& f(A *a) {
112112
// CHECK: attributes #[[ATTR3]] = { nounwind }
113113
// CHECK: attributes #[[ATTR4]] = { noreturn }
114114
//.
115-
// WITH-NONZERO-DEFAULT-AS: attributes #[[ATTR0]] = { mustprogress noinline optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot11-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+fp8-conversion-insts,+fp8-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+gws,+image-insts,+mai-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize32,+wavefrontsize64" }
115+
// WITH-NONZERO-DEFAULT-AS: attributes #[[ATTR0]] = { mustprogress noinline optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot11-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+fp8-conversion-insts,+fp8-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+gws,+image-insts,+mai-insts,+s-memrealtime,+s-memtime-inst,+vmem-to-lds-load-insts,+wavefrontsize32,+wavefrontsize64" }
116116
// WITH-NONZERO-DEFAULT-AS: attributes #[[ATTR1:[0-9]+]] = { nounwind willreturn memory(read) }
117-
// WITH-NONZERO-DEFAULT-AS: attributes #[[ATTR2:[0-9]+]] = { "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot11-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+fp8-conversion-insts,+fp8-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+gws,+image-insts,+mai-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize32,+wavefrontsize64" }
117+
// WITH-NONZERO-DEFAULT-AS: attributes #[[ATTR2:[0-9]+]] = { "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot11-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+fp8-conversion-insts,+fp8-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+gws,+image-insts,+mai-insts,+s-memrealtime,+s-memtime-inst,+vmem-to-lds-load-insts,+wavefrontsize32,+wavefrontsize64" }
118118
// WITH-NONZERO-DEFAULT-AS: attributes #[[ATTR3]] = { nounwind }
119119
// WITH-NONZERO-DEFAULT-AS: attributes #[[ATTR4]] = { noreturn }
120120
//.

clang/test/CodeGenOpenCL/builtins-amdgcn-gfx940.cl renamed to clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-lds.cl

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
11
// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
2-
// RUN: %clang_cc1 -cl-std=CL2.0 -O0 -triple amdgcn-unknown-unknown -target-cpu gfx940 -emit-llvm -o - %s | FileCheck %s
2+
// RUN: %clang_cc1 -cl-std=CL2.0 -O0 -triple amdgcn-unknown-unknown -target-cpu gfx900 -emit-llvm -o - %s | FileCheck %s
3+
// RUN: %clang_cc1 -cl-std=CL2.0 -O0 -triple amdgcn-unknown-unknown -target-cpu gfx942 -emit-llvm -o - %s | FileCheck %s
4+
// RUN: %clang_cc1 -cl-std=CL2.0 -O0 -triple amdgcn-unknown-unknown -target-cpu gfx1010 -emit-llvm -o - %s | FileCheck %s
35
// REQUIRES: amdgpu-registered-target
46

57
typedef unsigned int u32;

clang/test/SemaOpenCL/builtins-amdgcn-gfx940-err.cl renamed to clang/test/SemaOpenCL/builtins-amdgcn-global-load-lds-err.cl

Lines changed: 9 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,6 @@
1-
// RUN: %clang_cc1 -cl-std=CL2.0 -O0 -triple amdgcn-unknown-unknown -target-cpu gfx940 -S -verify=gfx940,expected -o - %s
1+
// RUN: %clang_cc1 -cl-std=CL2.0 -O0 -triple amdgcn-unknown-unknown -target-cpu gfx900 -S -verify=gfx,expected -o - %s
2+
// RUN: %clang_cc1 -cl-std=CL2.0 -O0 -triple amdgcn-unknown-unknown -target-cpu gfx942 -S -verify=gfx,expected -o - %s
3+
// RUN: %clang_cc1 -cl-std=CL2.0 -O0 -triple amdgcn-unknown-unknown -target-cpu gfx1010 -S -verify=gfx,expected -o - %s
24
// RUN: %clang_cc1 -cl-std=CL2.0 -O0 -triple amdgcn-unknown-unknown -target-cpu gfx950 -S -verify=gfx950,expected -o - %s
35
// REQUIRES: amdgpu-registered-target
46

@@ -8,12 +10,12 @@ void test_global_load_lds_unsupported_size(global u32* src, local u32 *dst, u32
810
__builtin_amdgcn_global_load_lds(src, dst, size, /*offset=*/0, /*aux=*/0); // expected-error{{argument to '__builtin_amdgcn_global_load_lds' must be a constant integer}}
911
__builtin_amdgcn_global_load_lds(src, dst, /*size=*/4, offset, /*aux=*/0); // expected-error{{argument to '__builtin_amdgcn_global_load_lds' must be a constant integer}}
1012
__builtin_amdgcn_global_load_lds(src, dst, /*size=*/4, /*offset=*/0, aux); // expected-error{{argument to '__builtin_amdgcn_global_load_lds' must be a constant integer}}
11-
__builtin_amdgcn_global_load_lds(src, dst, /*size=*/5, /*offset=*/0, /*aux=*/0); // expected-error{{invalid size value}} gfx940-note {{size must be 1, 2, or 4}} gfx950-note {{size must be 1, 2, 4, 12 or 16}}
12-
__builtin_amdgcn_global_load_lds(src, dst, /*size=*/0, /*offset=*/0, /*aux=*/0); // expected-error{{invalid size value}} gfx940-note {{size must be 1, 2, or 4}} gfx950-note {{size must be 1, 2, 4, 12 or 16}}
13-
__builtin_amdgcn_global_load_lds(src, dst, /*size=*/3, /*offset=*/0, /*aux=*/0); // expected-error{{invalid size value}} gfx940-note {{size must be 1, 2, or 4}} gfx950-note {{size must be 1, 2, 4, 12 or 16}}
14-
__builtin_amdgcn_global_load_lds(src, dst, /*size=*/12, /*offset=*/0, /*aux=*/0); // gfx940-error{{invalid size value}} gfx940-note {{size must be 1, 2, or 4}}
15-
__builtin_amdgcn_global_load_lds(src, dst, /*size=*/16, /*offset=*/0, /*aux=*/0); // gfx940-error{{invalid size value}} gfx940-note {{size must be 1, 2, or 4}}
16-
__builtin_amdgcn_global_load_lds(src, dst, /*size=*/-1, /*offset=*/0, /*aux=*/0); // expected-error{{invalid size value}} gfx940-note {{size must be 1, 2, or 4}} gfx950-note {{size must be 1, 2, 4, 12 or 16}}
13+
__builtin_amdgcn_global_load_lds(src, dst, /*size=*/5, /*offset=*/0, /*aux=*/0); // expected-error{{invalid size value}} gfx-note {{size must be 1, 2, or 4}} gfx950-note {{size must be 1, 2, 4, 12 or 16}}
14+
__builtin_amdgcn_global_load_lds(src, dst, /*size=*/0, /*offset=*/0, /*aux=*/0); // expected-error{{invalid size value}} gfx-note {{size must be 1, 2, or 4}} gfx950-note {{size must be 1, 2, 4, 12 or 16}}
15+
__builtin_amdgcn_global_load_lds(src, dst, /*size=*/3, /*offset=*/0, /*aux=*/0); // expected-error{{invalid size value}} gfx-note {{size must be 1, 2, or 4}} gfx950-note {{size must be 1, 2, 4, 12 or 16}}
16+
__builtin_amdgcn_global_load_lds(src, dst, /*size=*/12, /*offset=*/0, /*aux=*/0); // gfx-error{{invalid size value}} gfx-note {{size must be 1, 2, or 4}}
17+
__builtin_amdgcn_global_load_lds(src, dst, /*size=*/16, /*offset=*/0, /*aux=*/0); // gfx-error{{invalid size value}} gfx-note {{size must be 1, 2, or 4}}
18+
__builtin_amdgcn_global_load_lds(src, dst, /*size=*/-1, /*offset=*/0, /*aux=*/0); // expected-error{{invalid size value}} gfx-note {{size must be 1, 2, or 4}} gfx950-note {{size must be 1, 2, 4, 12 or 16}}
1719
}
1820

1921
__attribute__((target("gfx950-insts")))

flang/test/Lower/OpenMP/target_cpu_features.f90

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212
!AMDGCN-SAME: "+dl-insts", "+dot1-insts", "+dot10-insts", "+dot2-insts", "+dot3-insts",
1313
!AMDGCN-SAME: "+dot4-insts", "+dot5-insts", "+dot6-insts", "+dot7-insts", "+dpp",
1414
!AMDGCN-SAME: "+gfx8-insts", "+gfx9-insts", "+gws", "+image-insts", "+mai-insts",
15-
!AMDGCN-SAME: "+s-memrealtime", "+s-memtime-inst", "+wavefrontsize64"]>
15+
!AMDGCN-SAME: "+s-memrealtime", "+s-memtime-inst", "+vmem-to-lds-load-insts", "+wavefrontsize64"]>
1616

1717
!NVPTX: module attributes {
1818
!NVPTX-SAME: fir.target_cpu = "sm_80"

llvm/lib/Target/AMDGPU/AMDGPU.td

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1251,6 +1251,12 @@ def FeatureXF32Insts : SubtargetFeature<"xf32-insts",
12511251
"v_mfma_f32_16x16x8_xf32 and v_mfma_f32_32x32x4_xf32"
12521252
>;
12531253

1254+
def FeatureMemToLDSLoad : SubtargetFeature<"vmem-to-lds-load-insts",
1255+
"HasVMemToLDSLoad",
1256+
"true",
1257+
"The platform has memory to lds instructions (global_load w/lds bit set, buffer_load w/lds bit set or global_load_lds. This does not include scratch_load_lds."
1258+
>;
1259+
12541260
// Dummy feature used to disable assembler instructions.
12551261
def FeatureDisable : SubtargetFeature<"",
12561262
"FeatureDisable","true",
@@ -1319,7 +1325,7 @@ def FeatureGFX9 : GCNSubtargetFeatureGeneration<"GFX9",
13191325
FeatureA16, FeatureSMemTimeInst, FeatureFastDenormalF32, FeatureSupportsXNACK,
13201326
FeatureUnalignedBufferAccess, FeatureUnalignedScratchAccess,
13211327
FeatureUnalignedDSAccess, FeatureNegativeScratchOffsetBug, FeatureGWS,
1322-
FeatureDefaultComponentZero
1328+
FeatureDefaultComponentZero, FeatureMemToLDSLoad
13231329
]
13241330
>;
13251331

@@ -1342,7 +1348,8 @@ def FeatureGFX10 : GCNSubtargetFeatureGeneration<"GFX10",
13421348
FeatureUnalignedDSAccess, FeatureImageInsts, FeatureGDS, FeatureGWS,
13431349
FeatureDefaultComponentZero, FeatureMaxHardClauseLength63,
13441350
FeatureAtomicFMinFMaxF32GlobalInsts, FeatureAtomicFMinFMaxF64GlobalInsts,
1345-
FeatureAtomicFMinFMaxF32FlatInsts, FeatureAtomicFMinFMaxF64FlatInsts
1351+
FeatureAtomicFMinFMaxF32FlatInsts, FeatureAtomicFMinFMaxF64FlatInsts,
1352+
FeatureMemToLDSLoad
13461353
]
13471354
>;
13481355

llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3234,7 +3234,8 @@ bool AMDGPUInstructionSelector::selectG_INSERT_VECTOR_ELT(
32343234
}
32353235

32363236
bool AMDGPUInstructionSelector::selectBufferLoadLds(MachineInstr &MI) const {
3237-
assert(!AMDGPU::isGFX12Plus(STI));
3237+
if (!Subtarget->hasVMemToLDSLoad())
3238+
return false;
32383239
unsigned Opc;
32393240
unsigned Size = MI.getOperand(3).getImm();
32403241

@@ -3365,6 +3366,9 @@ static Register matchZeroExtendFromS32(MachineRegisterInfo &MRI, Register Reg) {
33653366
}
33663367

33673368
bool AMDGPUInstructionSelector::selectGlobalLoadLds(MachineInstr &MI) const{
3369+
if (!Subtarget->hasVMemToLDSLoad())
3370+
return false;
3371+
33683372
unsigned Opc;
33693373
unsigned Size = MI.getOperand(3).getImm();
33703374

llvm/lib/Target/AMDGPU/GCNSubtarget.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -188,6 +188,7 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
188188
/// indicates a lack of S_CLAUSE support.
189189
unsigned MaxHardClauseLength = 0;
190190
bool SupportsSRAMECC = false;
191+
bool HasVMemToLDSLoad = false;
191192

192193
// This should not be used directly. 'TargetID' tracks the dynamic settings
193194
// for SRAMECC.
@@ -1317,6 +1318,8 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
13171318
return hasGFX950Insts();
13181319
}
13191320

1321+
bool hasVMemToLDSLoad() const { return HasVMemToLDSLoad; }
1322+
13201323
bool hasSALUFloatInsts() const { return HasSALUFloatInsts; }
13211324

13221325
bool hasVGPRSingleUseHintInsts() const { return HasVGPRSingleUseHintInsts; }

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10001,7 +10001,8 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
1000110001
case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
1000210002
case Intrinsic::amdgcn_struct_buffer_load_lds:
1000310003
case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: {
10004-
assert(!AMDGPU::isGFX12Plus(*Subtarget));
10004+
if (!Subtarget->hasVMemToLDSLoad())
10005+
return SDValue();
1000510006
unsigned Opc;
1000610007
bool HasVIndex =
1000710008
IntrinsicID == Intrinsic::amdgcn_struct_buffer_load_lds ||
@@ -10103,6 +10104,9 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
1010310104
return SDValue(Load, 0);
1010410105
}
1010510106
case Intrinsic::amdgcn_global_load_lds: {
10107+
if (!Subtarget->hasVMemToLDSLoad())
10108+
return SDValue();
10109+
1010610110
unsigned Opc;
1010710111
unsigned Size = Op->getConstantOperandVal(4);
1010810112
switch (Size) {

llvm/lib/TargetParser/TargetParser.cpp

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -362,6 +362,7 @@ void AMDGPU::fillAMDGPUFeatureMap(StringRef GPU, const Triple &T,
362362
Features["mai-insts"] = true;
363363
Features["wavefrontsize32"] = true;
364364
Features["wavefrontsize64"] = true;
365+
Features["vmem-to-lds-load-insts"] = true;
365366
} else if (T.isAMDGCN()) {
366367
AMDGPU::GPUKind Kind = parseArchAMDGCN(GPU);
367368
switch (Kind) {
@@ -447,6 +448,7 @@ void AMDGPU::fillAMDGPUFeatureMap(StringRef GPU, const Triple &T,
447448
Features["s-memrealtime"] = true;
448449
Features["s-memtime-inst"] = true;
449450
Features["gws"] = true;
451+
Features["vmem-to-lds-load-insts"] = true;
450452
break;
451453
case GK_GFX1012:
452454
case GK_GFX1011:
@@ -471,6 +473,7 @@ void AMDGPU::fillAMDGPUFeatureMap(StringRef GPU, const Triple &T,
471473
Features["s-memrealtime"] = true;
472474
Features["s-memtime-inst"] = true;
473475
Features["gws"] = true;
476+
Features["vmem-to-lds-load-insts"] = true;
474477
break;
475478
case GK_GFX950:
476479
Features["bitop3-insts"] = true;
@@ -523,6 +526,7 @@ void AMDGPU::fillAMDGPUFeatureMap(StringRef GPU, const Triple &T,
523526
Features["ci-insts"] = true;
524527
Features["s-memtime-inst"] = true;
525528
Features["gws"] = true;
529+
Features["vmem-to-lds-load-insts"] = true;
526530
break;
527531
case GK_GFX90A:
528532
Features["gfx90a-insts"] = true;
@@ -575,6 +579,7 @@ void AMDGPU::fillAMDGPUFeatureMap(StringRef GPU, const Triple &T,
575579
Features["image-insts"] = true;
576580
Features["s-memtime-inst"] = true;
577581
Features["gws"] = true;
582+
Features["vmem-to-lds-load-insts"] = true;
578583
break;
579584
case GK_NONE:
580585
break;
Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
; RUN: split-file %s %t
2+
;
3+
; RUN: not --crash llc -mtriple=amdgcn -mcpu=gfx1100 %t/struct.ll 2>&1 | FileCheck --ignore-case %s
4+
; RUN: not --crash llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 %t/struct.ll 2>&1 | FileCheck --ignore-case %s
5+
; RUN: not --crash llc -mtriple=amdgcn -mcpu=gfx1100 %t/struct.ptr.ll 2>&1 | FileCheck --ignore-case --check-prefix=LEGALIZER-FAIL %s
6+
; RUN: not --crash llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 %t/struct.ptr.ll 2>&1 | FileCheck --ignore-case %s
7+
; RUN: not --crash llc -mtriple=amdgcn -mcpu=gfx1100 %t/raw.ll 2>&1 | FileCheck --ignore-case %s
8+
; RUN: not --crash llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 %t/raw.ll 2>&1 | FileCheck --ignore-case %s
9+
; RUN: not --crash llc -mtriple=amdgcn -mcpu=gfx1100 %t/raw.ptr.ll 2>&1 | FileCheck --ignore-case --check-prefix=LEGALIZER-FAIL %s
10+
; RUN: not --crash llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 %t/raw.ptr.ll 2>&1 | FileCheck --ignore-case %s
11+
;
12+
; CHECK: LLVM ERROR: Cannot select
13+
; LEGALIZER-FAIL: Do not know how to expand this operator's operand!
14+
15+
;--- struct.ll
16+
define amdgpu_ps void @buffer_load_lds(<4 x i32> inreg %rsrc, ptr addrspace(3) inreg %lds) {
17+
call void @llvm.amdgcn.struct.buffer.load.lds(<4 x i32> %rsrc, ptr addrspace(3) %lds, i32 4, i32 0, i32 0, i32 0, i32 0, i32 0)
18+
ret void
19+
}
20+
21+
;--- struct.ptr.ll
22+
define amdgpu_ps void @buffer_load_lds(ptr addrspace(8) inreg %rsrc, ptr addrspace(3) inreg %lds) {
23+
call void @llvm.amdgcn.struct.ptr.buffer.load.lds(ptr addrspace(8) %rsrc, ptr addrspace(3) %lds, i32 4, i32 0, i32 0, i32 0, i32 0, i32 0)
24+
ret void
25+
}
26+
27+
;--- raw.ll
28+
define amdgpu_ps void @buffer_load_lds(<4 x i32> inreg %rsrc, ptr addrspace(3) inreg %lds) {
29+
call void @llvm.amdgcn.raw.buffer.load.lds(<4 x i32> %rsrc, ptr addrspace(3) %lds, i32 4, i32 0, i32 0, i32 0, i32 0)
30+
ret void
31+
}
32+
33+
;--- raw.ptr.ll
34+
define amdgpu_ps void @buffer_load_lds(ptr addrspace(8) inreg %rsrc, ptr addrspace(3) inreg %lds) {
35+
call void @llvm.amdgcn.raw.ptr.buffer.load.lds(ptr addrspace(8) %rsrc, ptr addrspace(3) %lds, i32 4, i32 0, i32 0, i32 0, i32 0)
36+
ret void
37+
}
Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
; RUN: not --crash llc -mtriple=amdgcn -mcpu=gfx810 %s 2>&1 | FileCheck --ignore-case %s
2+
; RUN: not --crash llc -global-isel -mtriple=amdgcn -mcpu=gfx810 %s 2>&1 | FileCheck --ignore-case %s
3+
; RUN: not --crash llc -mtriple=amdgcn -mcpu=gfx1100 %s 2>&1 | FileCheck --ignore-case %s
4+
; RUN: not --crash llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 %s 2>&1 | FileCheck --ignore-case %s
5+
;
6+
; CHECK: LLVM ERROR: Cannot select
7+
8+
declare void @llvm.amdgcn.global.load.lds(ptr addrspace(1) nocapture %gptr, ptr addrspace(3) nocapture %lptr, i32 %size, i32 %offset, i32 %aux)
9+
10+
define amdgpu_ps void @global_load_lds_dword(ptr addrspace(1) nocapture %gptr, ptr addrspace(3) nocapture %lptr) {
11+
call void @llvm.amdgcn.global.load.lds(ptr addrspace(1) %gptr, ptr addrspace(3) %lptr, i32 4, i32 0, i32 0)
12+
ret void
13+
}

0 commit comments

Comments
 (0)