Skip to content

Commit 932f628

Browse files
committed
[AMDGPU] new gfx940 fp atomics
Differential Revision: https://reviews.llvm.org/D121028
1 parent 89d5c31 commit 932f628

File tree

16 files changed

+790
-4
lines changed

16 files changed

+790
-4
lines changed

clang/include/clang/Basic/BuiltinsAMDGPU.def

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -209,6 +209,12 @@ TARGET_BUILTIN(__builtin_amdgcn_flat_atomic_fmax_f64, "dd*0d", "t", "gfx90a-inst
209209
TARGET_BUILTIN(__builtin_amdgcn_ds_atomic_fadd_f64, "dd*3d", "t", "gfx90a-insts")
210210
TARGET_BUILTIN(__builtin_amdgcn_ds_atomic_fadd_f32, "ff*3f", "t", "gfx8-insts")
211211

212+
TARGET_BUILTIN(__builtin_amdgcn_flat_atomic_fadd_f32, "ff*0f", "t", "gfx940-insts")
213+
TARGET_BUILTIN(__builtin_amdgcn_flat_atomic_fadd_v2f16, "V2hV2h*0V2h", "t", "gfx940-insts")
214+
TARGET_BUILTIN(__builtin_amdgcn_flat_atomic_fadd_v2bf16, "V2sV2s*0V2s", "t", "gfx940-insts")
215+
TARGET_BUILTIN(__builtin_amdgcn_global_atomic_fadd_v2bf16, "V2sV2s*1V2s", "t", "gfx940-insts")
216+
TARGET_BUILTIN(__builtin_amdgcn_ds_atomic_fadd_v2bf16, "V2sV2s*3V2s", "t", "gfx940-insts")
217+
212218
//===----------------------------------------------------------------------===//
213219
// Deep learning builtins.
214220
//===----------------------------------------------------------------------===//

clang/lib/CodeGen/CGBuiltin.cpp

Lines changed: 28 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16513,7 +16513,9 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
1651316513
case AMDGPU::BI__builtin_amdgcn_global_atomic_fmax_f64:
1651416514
case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_f64:
1651516515
case AMDGPU::BI__builtin_amdgcn_flat_atomic_fmin_f64:
16516-
case AMDGPU::BI__builtin_amdgcn_flat_atomic_fmax_f64: {
16516+
case AMDGPU::BI__builtin_amdgcn_flat_atomic_fmax_f64:
16517+
case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_f32:
16518+
case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_v2f16: {
1651716519
Intrinsic::ID IID;
1651816520
llvm::Type *ArgTy = llvm::Type::getDoubleTy(getLLVMContext());
1651916521
switch (BuiltinID) {
@@ -16544,13 +16546,38 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
1654416546
case AMDGPU::BI__builtin_amdgcn_flat_atomic_fmax_f64:
1654516547
IID = Intrinsic::amdgcn_flat_atomic_fmax;
1654616548
break;
16549+
case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_f32:
16550+
ArgTy = llvm::Type::getFloatTy(getLLVMContext());
16551+
IID = Intrinsic::amdgcn_flat_atomic_fadd;
16552+
break;
16553+
case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_v2f16:
16554+
ArgTy = llvm::FixedVectorType::get(
16555+
llvm::Type::getHalfTy(getLLVMContext()), 2);
16556+
IID = Intrinsic::amdgcn_flat_atomic_fadd;
16557+
break;
1654716558
}
1654816559
llvm::Value *Addr = EmitScalarExpr(E->getArg(0));
1654916560
llvm::Value *Val = EmitScalarExpr(E->getArg(1));
1655016561
llvm::Function *F =
1655116562
CGM.getIntrinsic(IID, {ArgTy, Addr->getType(), Val->getType()});
1655216563
return Builder.CreateCall(F, {Addr, Val});
1655316564
}
16565+
case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_v2bf16:
16566+
case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_v2bf16: {
16567+
Intrinsic::ID IID;
16568+
switch (BuiltinID) {
16569+
case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_v2bf16:
16570+
IID = Intrinsic::amdgcn_global_atomic_fadd_v2bf16;
16571+
break;
16572+
case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_v2bf16:
16573+
IID = Intrinsic::amdgcn_flat_atomic_fadd_v2bf16;
16574+
break;
16575+
}
16576+
llvm::Value *Addr = EmitScalarExpr(E->getArg(0));
16577+
llvm::Value *Val = EmitScalarExpr(E->getArg(1));
16578+
llvm::Function *F = CGM.getIntrinsic(IID, {Addr->getType()});
16579+
return Builder.CreateCall(F, {Addr, Val});
16580+
}
1655416581
case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_f64:
1655516582
case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_f32: {
1655616583
Intrinsic::ID IID;
Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
// RUN: %clang_cc1 -O0 -cl-std=CL2.0 -triple amdgcn-amd-amdhsa -target-cpu gfx90a \
2+
// RUN: -verify -S -o - %s
3+
4+
// REQUIRES: amdgpu-registered-target
5+
6+
typedef half __attribute__((ext_vector_type(2))) half2;
7+
typedef short __attribute__((ext_vector_type(2))) short2;
8+
9+
void test_atomic_fadd(__global half2 *addrh2, half2 xh2,
10+
__global short2 *addrs2, __local short2 *addrs2l, short2 xs2,
11+
__global float *addrf, float xf) {
12+
__builtin_amdgcn_flat_atomic_fadd_f32(addrf, xf); // expected-error{{'__builtin_amdgcn_flat_atomic_fadd_f32' needs target feature gfx940-insts}}
13+
__builtin_amdgcn_flat_atomic_fadd_v2f16(addrh2, xh2); // expected-error{{'__builtin_amdgcn_flat_atomic_fadd_v2f16' needs target feature gfx940-insts}}
14+
__builtin_amdgcn_flat_atomic_fadd_v2bf16(addrs2, xs2); // expected-error{{'__builtin_amdgcn_flat_atomic_fadd_v2bf16' needs target feature gfx940-insts}}
15+
__builtin_amdgcn_global_atomic_fadd_v2bf16(addrs2, xs2); // expected-error{{'__builtin_amdgcn_global_atomic_fadd_v2bf16' needs target feature gfx940-insts}}
16+
__builtin_amdgcn_ds_atomic_fadd_v2bf16(addrs2l, xs2); // expected-error{{'__builtin_amdgcn_ds_atomic_fadd_v2bf16' needs target feature gfx940-insts}}
17+
}
Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
1+
// RUN: %clang_cc1 -O0 -cl-std=CL2.0 -triple amdgcn-amd-amdhsa -target-cpu gfx940 \
2+
// RUN: %s -S -emit-llvm -o - | FileCheck %s
3+
4+
// RUN: %clang_cc1 -O0 -cl-std=CL2.0 -triple amdgcn-amd-amdhsa -target-cpu gfx940 \
5+
// RUN: -S -o - %s | FileCheck -check-prefix=GFX940 %s
6+
7+
// REQUIRES: amdgpu-registered-target
8+
9+
typedef half __attribute__((ext_vector_type(2))) half2;
10+
typedef short __attribute__((ext_vector_type(2))) short2;
11+
12+
// CHECK-LABEL: test_flat_add_f32
13+
// CHECK: call float @llvm.amdgcn.flat.atomic.fadd.f32.p0f32.f32(float* %{{.*}}, float %{{.*}})
14+
// GFX940-LABEL: test_flat_add_f32
15+
// GFX940: flat_atomic_add_f32
16+
half2 test_flat_add_f32(__generic float *addr, float x) {
17+
return __builtin_amdgcn_flat_atomic_fadd_f32(addr, x);
18+
}
19+
20+
// CHECK-LABEL: test_flat_add_2f16
21+
// CHECK: call <2 x half> @llvm.amdgcn.flat.atomic.fadd.v2f16.p0v2f16.v2f16(<2 x half>* %{{.*}}, <2 x half> %{{.*}})
22+
// GFX940-LABEL: test_flat_add_2f16
23+
// GFX940: flat_atomic_pk_add_f16
24+
half2 test_flat_add_2f16(__generic half2 *addr, half2 x) {
25+
return __builtin_amdgcn_flat_atomic_fadd_v2f16(addr, x);
26+
}
27+
28+
// CHECK-LABEL: test_flat_add_2bf16
29+
// CHECK: call <2 x i16> @llvm.amdgcn.flat.atomic.fadd.v2bf16.p0v2i16(<2 x i16>* %{{.*}}, <2 x i16> %{{.*}})
30+
// GFX940-LABEL: test_flat_add_2bf16
31+
// GFX940: flat_atomic_pk_add_bf16
32+
short2 test_flat_add_2bf16(__generic short2 *addr, short2 x) {
33+
return __builtin_amdgcn_flat_atomic_fadd_v2bf16(addr, x);
34+
}
35+
36+
// CHECK-LABEL: test_global_add_2bf16
37+
// CHECK: call <2 x i16> @llvm.amdgcn.global.atomic.fadd.v2bf16.p1v2i16(<2 x i16> addrspace(1)* %{{.*}}, <2 x i16> %{{.*}})
38+
// GFX940-LABEL: test_global_add_2bf16
39+
// GFX940: global_atomic_pk_add_bf16
40+
short2 test_global_add_2bf16(__global short2 *addr, short2 x) {
41+
return __builtin_amdgcn_global_atomic_fadd_v2bf16(addr, x);
42+
}
43+
44+
// CHECK-LABEL: test_local_add_2bf16
45+
// CHECK: call <2 x i16> @llvm.amdgcn.ds.fadd.v2bf16(<2 x i16> addrspace(3)* %{{.*}}, <2 x i16> %
46+
// GFX940-LABEL: test_local_add_2bf16
47+
// GFX940: ds_pk_add_rtn_bf16
48+
short2 test_local_add_2bf16(__local short2 *addr, short2 x) {
49+
return __builtin_amdgcn_ds_atomic_fadd_v2bf16(addr, x);
50+
}

llvm/include/llvm/IR/IntrinsicsAMDGPU.td

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1978,6 +1978,19 @@ def int_amdgcn_mfma_f32_16x16x16bf16_1k : AMDGPUMfmaIntrinsic<llvm_v4f32_ty, ll
19781978
def int_amdgcn_mfma_f64_16x16x4f64 : AMDGPUMfmaIntrinsic<llvm_v4f64_ty, llvm_double_ty>;
19791979
def int_amdgcn_mfma_f64_4x4x4f64 : AMDGPUMfmaIntrinsic<llvm_double_ty, llvm_double_ty>;
19801980

1981+
//===----------------------------------------------------------------------===//
1982+
// gfx940 intrinsics
1983+
//===----------------------------------------------------------------------===//
1984+
1985+
// bf16 atomics use a v2i16 argument since there is no bf16 data type in LLVM.
1986+
def int_amdgcn_global_atomic_fadd_v2bf16 : AMDGPUGlobalAtomicRtn<llvm_v2i16_ty>;
1987+
def int_amdgcn_flat_atomic_fadd_v2bf16 : AMDGPUGlobalAtomicRtn<llvm_v2i16_ty>;
1988+
def int_amdgcn_ds_fadd_v2bf16 : Intrinsic<
1989+
[llvm_v2i16_ty],
1990+
[LLVMQualPointerType<llvm_v2i16_ty, 3>, llvm_v2i16_ty],
1991+
[IntrArgMemOnly, IntrWillReturn, NoCapture<ArgIndex<0>>]>,
1992+
GCCBuiltin<"__builtin_amdgcn_ds_atomic_fadd_v2bf16">;
1993+
19811994
//===----------------------------------------------------------------------===//
19821995
// Special Intrinsics for backend internal use only. No frontend
19831996
// should emit calls to these.

llvm/lib/Target/AMDGPU/AMDGPU.td

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1558,6 +1558,13 @@ def HasFmaLegacy32 : Predicate<"Subtarget->hasGFX10_3Insts()">,
15581558
def HasAtomicFaddInsts : Predicate<"Subtarget->hasAtomicFaddInsts()">,
15591559
AssemblerPredicate<(all_of FeatureAtomicFaddInsts)>;
15601560

1561+
// Differentiate between the FP atomics of gfx90* and gfx940, which are
1562+
// functionally equivalent but have incompatible encodings.
1563+
def HasAtomicFaddInstsGFX90X : Predicate<"Subtarget->hasAtomicFaddInsts()">,
1564+
AssemblerPredicate<(all_of FeatureAtomicFaddInsts, (not FeatureGFX940Insts))>;
1565+
def HasAtomicFaddInstsGFX940 : Predicate<"Subtarget->hasAtomicFaddInsts()">,
1566+
AssemblerPredicate<(all_of FeatureAtomicFaddInsts, FeatureGFX940Insts)>;
1567+
15611568
def HasDsSrc2Insts : Predicate<"!Subtarget->hasDsSrc2Insts()">,
15621569
AssemblerPredicate<(all_of FeatureDsSrc2Insts)>;
15631570

llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1297,6 +1297,8 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
12971297
Atomic.legalFor({{S32, LocalPtr}, {S32, RegionPtr}});
12981298
if (ST.hasGFX90AInsts())
12991299
Atomic.legalFor({{S64, LocalPtr}});
1300+
if (ST.hasGFX940Insts())
1301+
Atomic.legalFor({{V2S16, LocalPtr}});
13001302
}
13011303
if (ST.hasAtomicFaddInsts())
13021304
Atomic.legalFor({{S32, GlobalPtr}});

llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4339,6 +4339,8 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
43394339
case Intrinsic::amdgcn_flat_atomic_fadd:
43404340
case Intrinsic::amdgcn_flat_atomic_fmin:
43414341
case Intrinsic::amdgcn_flat_atomic_fmax:
4342+
case Intrinsic::amdgcn_global_atomic_fadd_v2bf16:
4343+
case Intrinsic::amdgcn_flat_atomic_fadd_v2bf16:
43424344
return getDefaultMappingAllVGPR(MI);
43434345
case Intrinsic::amdgcn_ds_ordered_add:
43444346
case Intrinsic::amdgcn_ds_ordered_swap: {

llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -205,9 +205,12 @@ def : SourceOfDivergence<int_amdgcn_global_atomic_fmax>;
205205
def : SourceOfDivergence<int_amdgcn_flat_atomic_fadd>;
206206
def : SourceOfDivergence<int_amdgcn_flat_atomic_fmin>;
207207
def : SourceOfDivergence<int_amdgcn_flat_atomic_fmax>;
208+
def : SourceOfDivergence<int_amdgcn_global_atomic_fadd_v2bf16>;
209+
def : SourceOfDivergence<int_amdgcn_flat_atomic_fadd_v2bf16>;
208210
def : SourceOfDivergence<int_amdgcn_ds_fadd>;
209211
def : SourceOfDivergence<int_amdgcn_ds_fmin>;
210212
def : SourceOfDivergence<int_amdgcn_ds_fmax>;
213+
def : SourceOfDivergence<int_amdgcn_ds_fadd_v2bf16>;
211214
def : SourceOfDivergence<int_amdgcn_buffer_atomic_swap>;
212215
def : SourceOfDivergence<int_amdgcn_buffer_atomic_add>;
213216
def : SourceOfDivergence<int_amdgcn_buffer_atomic_sub>;

llvm/lib/Target/AMDGPU/DSInstructions.td

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -462,6 +462,13 @@ let SubtargetPredicate = isGFX90APlus in {
462462
defm DS_ADD_RTN_F64 : DS_1A1D_RET_mc_gfx9<"ds_add_rtn_f64", VReg_64, "ds_add_f64">;
463463
} // End SubtargetPredicate = isGFX90APlus
464464

465+
let SubtargetPredicate = isGFX940Plus in {
466+
defm DS_PK_ADD_F16 : DS_1A1D_NORET_mc_gfx9<"ds_pk_add_f16">;
467+
defm DS_PK_ADD_RTN_F16 : DS_1A1D_RET_mc_gfx9<"ds_pk_add_rtn_f16", VGPR_32, "ds_pk_add_f16">;
468+
defm DS_PK_ADD_BF16 : DS_1A1D_NORET_mc_gfx9<"ds_pk_add_bf16">;
469+
defm DS_PK_ADD_RTN_BF16 : DS_1A1D_RET_mc_gfx9<"ds_pk_add_rtn_bf16", VGPR_32, "ds_pk_add_bf16">;
470+
} // End SubtargetPredicate = isGFX940Plus
471+
465472
defm DS_MSKOR_B32 : DS_1A2D_NORET_mc<"ds_mskor_b32">;
466473
defm DS_CMPST_B32 : DS_1A2D_NORET_mc<"ds_cmpst_b32">;
467474
defm DS_CMPST_F32 : DS_1A2D_NORET_mc<"ds_cmpst_f32">;
@@ -998,6 +1005,14 @@ def : DSAtomicRetPat<DS_ADD_RTN_F64, f64, atomic_load_fadd_local_ret_64>;
9981005
def : DSAtomicRetPat<DS_ADD_F64, f64, atomic_load_fadd_local_noret_64>;
9991006
}
10001007

1008+
let SubtargetPredicate = isGFX940Plus in {
1009+
def : DSAtomicRetPat<DS_PK_ADD_RTN_F16, v2f16, atomic_load_fadd_v2f16_local_32>;
1010+
def : GCNPat <
1011+
(v2i16 (int_amdgcn_ds_fadd_v2bf16 i32:$ptr, v2i16:$src)),
1012+
(DS_PK_ADD_RTN_BF16 VGPR_32:$ptr, VGPR_32:$src, 0, 0)
1013+
>;
1014+
}
1015+
10011016
def : Pat <
10021017
(SIds_ordered_count i32:$value, i16:$offset),
10031018
(DS_ORDERED_COUNT $value, (as_i16imm $offset))
@@ -1410,3 +1425,10 @@ let SubtargetPredicate = isGFX90APlus in {
14101425
def DS_ADD_F64_vi : DS_Real_vi<0x5c, DS_ADD_F64>;
14111426
def DS_ADD_RTN_F64_vi : DS_Real_vi<0x7c, DS_ADD_RTN_F64>;
14121427
} // End SubtargetPredicate = isGFX90APlus
1428+
1429+
let SubtargetPredicate = isGFX940Plus in {
1430+
def DS_PK_ADD_F16_vi : DS_Real_vi<0x17, DS_PK_ADD_F16>;
1431+
def DS_PK_ADD_RTN_F16_vi : DS_Real_vi<0xb7, DS_PK_ADD_RTN_F16>;
1432+
def DS_PK_ADD_BF16_vi : DS_Real_vi<0x18, DS_PK_ADD_BF16>;
1433+
def DS_PK_ADD_RTN_BF16_vi : DS_Real_vi<0xb8, DS_PK_ADD_RTN_BF16>;
1434+
} // End SubtargetPredicate = isGFX940Plus

llvm/lib/Target/AMDGPU/FLATInstructions.td

Lines changed: 57 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -630,6 +630,13 @@ let SubtargetPredicate = isGFX90APlus in {
630630
defm GLOBAL_ATOMIC_MAX_F64 : FLAT_Global_Atomic_Pseudo<"global_atomic_max_f64", VReg_64, f64, int_amdgcn_global_atomic_fmax>;
631631
} // End SubtargetPredicate = isGFX90APlus
632632

633+
let SubtargetPredicate = isGFX940Plus in {
634+
defm FLAT_ATOMIC_ADD_F32 : FLAT_Atomic_Pseudo<"flat_atomic_add_f32", VGPR_32, f32, int_amdgcn_flat_atomic_fadd>;
635+
defm FLAT_ATOMIC_PK_ADD_F16 : FLAT_Atomic_Pseudo<"flat_atomic_pk_add_f16", VGPR_32, v2f16, int_amdgcn_flat_atomic_fadd>;
636+
defm FLAT_ATOMIC_PK_ADD_BF16 : FLAT_Atomic_Pseudo<"flat_atomic_pk_add_bf16", VGPR_32, v2f16>;
637+
defm GLOBAL_ATOMIC_PK_ADD_BF16 : FLAT_Global_Atomic_Pseudo<"global_atomic_pk_add_bf16", VGPR_32, v2f16>;
638+
} // End SubtargetPredicate = isGFX940Plus
639+
633640
defm GLOBAL_LOAD_UBYTE : FLAT_Global_Load_Pseudo <"global_load_ubyte", VGPR_32>;
634641
defm GLOBAL_LOAD_SBYTE : FLAT_Global_Load_Pseudo <"global_load_sbyte", VGPR_32>;
635642
defm GLOBAL_LOAD_USHORT : FLAT_Global_Load_Pseudo <"global_load_ushort", VGPR_32>;
@@ -1280,6 +1287,13 @@ def : FlatSignedAtomicPat <FLAT_ATOMIC_MAX_F64_RTN, atomic_load_fmax_flat_ret_64
12801287
def : FlatSignedAtomicPat <FLAT_ATOMIC_MAX_F64, atomic_load_fmax_flat_noret_64, f64>;
12811288
}
12821289

1290+
let OtherPredicates = [isGFX940Plus] in {
1291+
def : FlatSignedAtomicPat <FLAT_ATOMIC_ADD_F32_RTN, atomic_load_fadd_flat_32, f32>;
1292+
def : FlatSignedAtomicPat <FLAT_ATOMIC_PK_ADD_F16_RTN, atomic_load_fadd_v2f16_flat_32, v2f16>;
1293+
def : FlatSignedAtomicPat <FLAT_ATOMIC_PK_ADD_BF16_RTN, int_amdgcn_flat_atomic_fadd_v2bf16, v2i16>;
1294+
defm : GlobalFLATAtomicPatsRtn <"GLOBAL_ATOMIC_PK_ADD_BF16", int_amdgcn_global_atomic_fadd_v2bf16, v2i16>;
1295+
}
1296+
12831297
} // End OtherPredicates = [HasFlatGlobalInsts], AddedComplexity = 10
12841298

12851299
let OtherPredicates = [HasFlatScratchInsts, EnableFlatScratch] in {
@@ -1432,6 +1446,14 @@ multiclass FLAT_Real_AllAddr_vi<bits<7> op,
14321446
def _SADDR_vi : FLAT_Real_vi<op, !cast<FLAT_Pseudo>(NAME#"_SADDR"), has_sccb>;
14331447
}
14341448

1449+
class FLAT_Real_gfx940 <bits<7> op, FLAT_Pseudo ps> :
1450+
FLAT_Real <op, ps>,
1451+
SIMCInstr <ps.PseudoInstr, SIEncodingFamily.GFX940> {
1452+
let AssemblerPredicate = isGFX940Plus;
1453+
let DecoderNamespace = "GFX9";
1454+
let Inst{25} = !if(ps.has_sccb, cpol{CPolBit.SCC}, ps.sccbValue);
1455+
}
1456+
14351457
def FLAT_LOAD_UBYTE_vi : FLAT_Real_vi <0x10, FLAT_LOAD_UBYTE>;
14361458
def FLAT_LOAD_SBYTE_vi : FLAT_Real_vi <0x11, FLAT_LOAD_SBYTE>;
14371459
def FLAT_LOAD_USHORT_vi : FLAT_Real_vi <0x12, FLAT_LOAD_USHORT>;
@@ -1574,7 +1596,7 @@ defm SCRATCH_STORE_DWORDX2 : FLAT_Real_AllAddr_vi <0x1d>;
15741596
defm SCRATCH_STORE_DWORDX3 : FLAT_Real_AllAddr_vi <0x1e>;
15751597
defm SCRATCH_STORE_DWORDX4 : FLAT_Real_AllAddr_vi <0x1f>;
15761598

1577-
let SubtargetPredicate = HasAtomicFaddInsts in {
1599+
let SubtargetPredicate = HasAtomicFaddInstsGFX90X in {
15781600
defm GLOBAL_ATOMIC_ADD_F32 : FLAT_Global_Real_Atomics_vi <0x04d, 0>;
15791601
defm GLOBAL_ATOMIC_PK_ADD_F16 : FLAT_Global_Real_Atomics_vi <0x04e, 0>;
15801602
}
@@ -1588,6 +1610,40 @@ let SubtargetPredicate = isGFX90AOnly in {
15881610
defm GLOBAL_ATOMIC_MAX_F64 : FLAT_Global_Real_Atomics_vi<0x51, 0>;
15891611
} // End SubtargetPredicate = isGFX90AOnly
15901612

1613+
multiclass FLAT_Real_AllAddr_gfx940<bits<7> op> {
1614+
def _gfx940 : FLAT_Real_gfx940<op, !cast<FLAT_Pseudo>(NAME)>;
1615+
def _SADDR_gfx940 : FLAT_Real_gfx940<op, !cast<FLAT_Pseudo>(NAME#"_SADDR")>;
1616+
}
1617+
1618+
multiclass FLAT_Real_Atomics_gfx940 <bits<7> op, FLAT_Pseudo ps> {
1619+
def _gfx940 : FLAT_Real_gfx940<op, !cast<FLAT_Pseudo>(ps.PseudoInstr)>;
1620+
def _RTN_gfx940 : FLAT_Real_gfx940<op, !cast<FLAT_Pseudo>(ps.PseudoInstr # "_RTN")>;
1621+
}
1622+
1623+
multiclass FLAT_Global_Real_Atomics_gfx940<bits<7> op> :
1624+
FLAT_Real_AllAddr_gfx940<op> {
1625+
def _RTN_gfx940 : FLAT_Real_gfx940 <op, !cast<FLAT_Pseudo>(NAME#"_RTN")>;
1626+
def _SADDR_RTN_gfx940 : FLAT_Real_gfx940 <op, !cast<FLAT_Pseudo>(NAME#"_SADDR_RTN")>;
1627+
}
1628+
1629+
let SubtargetPredicate = HasAtomicFaddInstsGFX940 in {
1630+
defm GLOBAL_ATOMIC_ADD_F32 : FLAT_Global_Real_Atomics_gfx940 <0x04d>;
1631+
defm GLOBAL_ATOMIC_PK_ADD_F16 : FLAT_Global_Real_Atomics_gfx940 <0x04e>;
1632+
}
1633+
1634+
let SubtargetPredicate = isGFX940Plus in {
1635+
defm FLAT_ATOMIC_ADD_F64 : FLAT_Real_Atomics_gfx940<0x4f, FLAT_ATOMIC_ADD_F64>;
1636+
defm FLAT_ATOMIC_MIN_F64 : FLAT_Real_Atomics_gfx940<0x50, FLAT_ATOMIC_MIN_F64>;
1637+
defm FLAT_ATOMIC_MAX_F64 : FLAT_Real_Atomics_gfx940<0x51, FLAT_ATOMIC_MAX_F64>;
1638+
defm GLOBAL_ATOMIC_ADD_F64 : FLAT_Global_Real_Atomics_gfx940<0x4f>;
1639+
defm GLOBAL_ATOMIC_MIN_F64 : FLAT_Global_Real_Atomics_gfx940<0x50>;
1640+
defm GLOBAL_ATOMIC_MAX_F64 : FLAT_Global_Real_Atomics_gfx940<0x51>;
1641+
defm FLAT_ATOMIC_ADD_F32 : FLAT_Real_Atomics_vi<0x4d, FLAT_ATOMIC_ADD_F32>;
1642+
defm FLAT_ATOMIC_PK_ADD_F16 : FLAT_Real_Atomics_vi<0x4e, FLAT_ATOMIC_PK_ADD_F16>;
1643+
defm FLAT_ATOMIC_PK_ADD_BF16 : FLAT_Real_Atomics_vi<0x52, FLAT_ATOMIC_PK_ADD_BF16>;
1644+
defm GLOBAL_ATOMIC_PK_ADD_BF16 : FLAT_Global_Real_Atomics_vi<0x52>;
1645+
} // End SubtargetPredicate = isGFX940Plus
1646+
15911647
//===----------------------------------------------------------------------===//
15921648
// GFX10.
15931649
//===----------------------------------------------------------------------===//

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1270,7 +1270,9 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
12701270
case Intrinsic::amdgcn_global_atomic_fmax:
12711271
case Intrinsic::amdgcn_flat_atomic_fadd:
12721272
case Intrinsic::amdgcn_flat_atomic_fmin:
1273-
case Intrinsic::amdgcn_flat_atomic_fmax: {
1273+
case Intrinsic::amdgcn_flat_atomic_fmax:
1274+
case Intrinsic::amdgcn_global_atomic_fadd_v2bf16:
1275+
case Intrinsic::amdgcn_flat_atomic_fadd_v2bf16: {
12741276
Info.opc = ISD::INTRINSIC_W_CHAIN;
12751277
Info.memVT = MVT::getVT(CI.getType());
12761278
Info.ptrVal = CI.getOperand(0);
@@ -1326,6 +1328,8 @@ bool SITargetLowering::getAddrModeArguments(IntrinsicInst *II,
13261328
case Intrinsic::amdgcn_flat_atomic_fadd:
13271329
case Intrinsic::amdgcn_flat_atomic_fmin:
13281330
case Intrinsic::amdgcn_flat_atomic_fmax:
1331+
case Intrinsic::amdgcn_global_atomic_fadd_v2bf16:
1332+
case Intrinsic::amdgcn_flat_atomic_fadd_v2bf16:
13291333
case Intrinsic::amdgcn_global_atomic_csub: {
13301334
Value *Ptr = II->getArgOperand(0);
13311335
AccessTy = II->getType();
@@ -12448,6 +12452,9 @@ SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
1244812452

1244912453
if ((AS == AMDGPUAS::GLOBAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS) &&
1245012454
Subtarget->hasAtomicFaddInsts()) {
12455+
if (Subtarget->hasGFX940Insts())
12456+
return AtomicExpansionKind::None;
12457+
1245112458
// The amdgpu-unsafe-fp-atomics attribute enables generation of unsafe
1245212459
// floating point atomic instructions. May generate more efficient code,
1245312460
// but may not respect rounding and denormal modes, and may give incorrect

0 commit comments

Comments
 (0)